Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-5.14-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
"A normal mix of improvements, core changes and features that users have
been missing or complaining about.

User visible changes:

- new sysfs exports:
- add sysfs knob to limit scrub IO bandwidth per device
- device stats are also available in
/sys/fs/btrfs/FSID/devinfo/DEVID/error_stats
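As a sketch of how these exports might be used (the FSID/DEVID values below are placeholders, and the `scrub_speed_max` knob name is an assumption about this cycle's naming; check `/sys/fs/btrfs/` on a kernel with these patches):

```shell
# Hypothetical FSID/DEVID for illustration; list /sys/fs/btrfs/ to find
# the real ones on your system.
FSID=ffffffff-1111-2222-3333-444444444444
DEVID=1
DEV="/sys/fs/btrfs/$FSID/devinfo/$DEVID"

# Per-device scrub bandwidth limit (assumed knob name; writing 0 removes it):
[ -w "$DEV/scrub_speed_max" ] && echo 100m > "$DEV/scrub_speed_max"

# Device error counters, now exported here in addition to the ioctl:
[ -r "$DEV/error_stats" ] && cat "$DEV/error_stats"

echo "paths checked under: $DEV"
```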

- support cancellable resize and device delete ioctls

- change how the empty value is interpreted when setting a property;
so far we have only 'btrfs.compression', and we need to distinguish
a reset to defaults from setting "do not compress". In general an
empty value will always mean 'reset to defaults' for any other
property; for compression it's either 'no' or 'none' to forbid
compression

Performance improvements:

- no need for full sync when truncation does not touch extents,
reported run time change is -12%

- avoid unnecessary logging of xattrs during fast fsyncs (+17%
throughput, -17% runtime on xattr stress workload)

Core:

- preemptive flushing improvements and fixes
- adjust clamping logic on multi-threaded workloads to avoid
flushing too soon
- take into account global block reserve, may help on almost full
filesystems
- continue flushing when there are enough pending delalloc and
ordered bytes

- simplify logic around conditional transaction commit, a workaround
used in the past for throttling that has been superseded by ticket
reservations, which manage throttling in a better way

- subpage blocksize preparation:
- submit read time repair only for each corrupted sector
- scrub repair now works with sectors and not pages
- free space cache (v1) works with sectors and not pages
- more fine grained bio tracking for extents
- subpage support in page callbacks, extent callbacks, end io
callbacks

- simplify transaction abort logic: always abort and don't check
various potentially unreliable stats tracked by the transaction

- exclusive operations can do more checks when started and allow,
e.g., cancellation of the same running operation

- ensure relocation never runs while we have send operations running,
e.g. when zoned background auto reclaim starts

Fixes:

- zoned: more sanity checks of write pointer

- improve error handling in delayed inodes

- send:
- fix invalid path for unlink operations after parent
orphanization
- fix crash when memory allocations trigger reclaim

- skip compression if we have only one page (can't make things
better)

- an empty property value now means 'reset to default'

Other:

- lots of cleanups, comment updates, yearly typo fixing

- disable build on platforms having page size 256K"

* tag 'for-5.14-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (101 commits)
btrfs: remove unused btrfs_fs_info::total_pinned
btrfs: rip out btrfs_space_info::total_bytes_pinned
btrfs: rip the first_ticket_bytes logic from fail_all_tickets
btrfs: remove FLUSH_DELAYED_REFS from data ENOSPC flushing
btrfs: rip out may_commit_transaction
btrfs: send: fix crash when memory allocations trigger reclaim
btrfs: ensure relocation never runs while we have send operations running
btrfs: shorten integrity checker extent data mount option
btrfs: switch mount option bits to enums and use wider type
btrfs: props: change how empty value is interpreted
btrfs: compression: don't try to compress if we don't have enough pages
btrfs: fix unbalanced unlock in qgroup_account_snapshot()
btrfs: sysfs: export dev stats in devinfo directory
btrfs: fix typos in comments
btrfs: remove a stale comment for btrfs_decompress_bio()
btrfs: send: use list_move_tail instead of list_del/list_add_tail
btrfs: disable build on platforms having page size 256K
btrfs: send: fix invalid path for unlink operations after parent orphanization
btrfs: inline wait_current_trans_commit_start in its caller
btrfs: sink wait_for_unblock parameter to async commit
...

+2047 -1437
+2
fs/btrfs/Kconfig
···
 	select RAID6_PQ
 	select XOR_BLOCKS
 	select SRCU
+	depends on !PPC_256K_PAGES	# powerpc
+	depends on !PAGE_SIZE_256KB	# hexagon
 
 	help
 	  Btrfs is a general purpose copy-on-write filesystem with extents,
+1 -1
fs/btrfs/backref.c
···
  *
  * @ref_key:	The same as @ref_key in handle_direct_tree_backref()
  * @tree_key:	The first key of this tree block.
- * @path:	A clean (released) path, to avoid allocating path everytime
+ * @path:	A clean (released) path, to avoid allocating path every time
  *		the function get called.
  */
 static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache,
+17 -14
fs/btrfs/block-group.c
···
 		btrfs_space_info_update_bytes_pinned(fs_info, space_info,
 						     -block_group->pinned);
 		space_info->bytes_readonly += block_group->pinned;
-		__btrfs_mod_total_bytes_pinned(space_info, -block_group->pinned);
 		block_group->pinned = 0;
 
 		spin_unlock(&block_group->lock);
···
 		container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
 	struct btrfs_block_group *bg;
 	struct btrfs_space_info *space_info;
-	int ret;
+	LIST_HEAD(again_list);
 
 	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
 		return;
···
 	mutex_lock(&fs_info->reclaim_bgs_lock);
 	spin_lock(&fs_info->unused_bgs_lock);
 	while (!list_empty(&fs_info->reclaim_bgs)) {
+		int ret = 0;
+
 		bg = list_first_entry(&fs_info->reclaim_bgs,
 				      struct btrfs_block_group,
 				      bg_list);
···
 				bg->start);
 
 next:
-		btrfs_put_block_group(bg);
 		spin_lock(&fs_info->unused_bgs_lock);
+		if (ret == -EAGAIN && list_empty(&bg->bg_list))
+			list_add_tail(&bg->bg_list, &again_list);
+		else
+			btrfs_put_block_group(bg);
 	}
+	list_splice_tail(&again_list, &fs_info->reclaim_bgs);
 	spin_unlock(&fs_info->unused_bgs_lock);
 	mutex_unlock(&fs_info->reclaim_bgs_lock);
 	btrfs_exclop_finish(fs_info);
···
 	struct extent_changeset *data_reserved = NULL;
 	u64 alloc_hint = 0;
 	int dcs = BTRFS_DC_ERROR;
-	u64 num_pages = 0;
+	u64 cache_size = 0;
 	int retries = 0;
 	int ret = 0;
···
 	 * taking up quite a bit since it's not folded into the other space
 	 * cache.
 	 */
-	num_pages = div_u64(block_group->length, SZ_256M);
-	if (!num_pages)
-		num_pages = 1;
+	cache_size = div_u64(block_group->length, SZ_256M);
+	if (!cache_size)
+		cache_size = 1;
 
-	num_pages *= 16;
-	num_pages *= PAGE_SIZE;
+	cache_size *= 16;
+	cache_size *= fs_info->sectorsize;
 
 	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
-					  num_pages);
+					  cache_size);
 	if (ret)
 		goto out_put;
 
-	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
-					      num_pages, num_pages,
+	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
+					      cache_size, cache_size,
 					      &alloc_hint);
 	/*
 	 * Our cache requires contiguous chunks so that we don't modify a bunch
···
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
 
-		__btrfs_mod_total_bytes_pinned(cache->space_info,
-					       num_bytes);
 		set_extent_dirty(&trans->transaction->pinned_extents,
 				 bytenr, bytenr + num_bytes - 1,
 				 GFP_NOFS | __GFP_NOFAIL);
+15 -42
fs/btrfs/compression.c
···
 	const u32 csum_size = fs_info->csum_size;
 	const u32 sectorsize = fs_info->sectorsize;
 	struct page *page;
-	unsigned long i;
+	unsigned int i;
 	char *kaddr;
 	u8 csum[BTRFS_CSUM_SIZE];
 	struct compressed_bio *cb = bio->bi_private;
···
 	struct compressed_bio *cb = bio->bi_private;
 	struct inode *inode;
 	struct page *page;
-	unsigned long index;
+	unsigned int index;
 	unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
 	int ret = 0;
···
 	struct compressed_bio *cb = bio->bi_private;
 	struct inode *inode;
 	struct page *page;
-	unsigned long index;
+	unsigned int index;
 
 	if (bio->bi_status)
 		cb->errors = 1;
···
 	 * call back into the FS and do all the end_io operations
 	 */
 	inode = cb->inode;
-	cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
 	btrfs_record_physical_zoned(inode, cb->start, bio);
-	btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
+	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
 			cb->start, cb->start + cb->len - 1,
 			bio->bi_status == BLK_STS_OK);
-	cb->compressed_pages[0]->mapping = NULL;
 
 	end_compressed_writeback(inode, cb);
 	/* note, our inode could be gone now */
···
  * the end io hooks.
  */
 blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
-				 unsigned long len, u64 disk_start,
-				 unsigned long compressed_len,
+				 unsigned int len, u64 disk_start,
+				 unsigned int compressed_len,
 				 struct page **compressed_pages,
-				 unsigned long nr_pages,
+				 unsigned int nr_pages,
 				 unsigned int write_flags,
 				 struct cgroup_subsys_state *blkcg_css)
 {
···
 	bio->bi_end_io = end_compressed_bio_write;
 
 	if (use_append) {
-		struct extent_map *em;
-		struct map_lookup *map;
-		struct block_device *bdev;
+		struct btrfs_device *device;
 
-		em = btrfs_get_chunk_map(fs_info, disk_start, PAGE_SIZE);
-		if (IS_ERR(em)) {
+		device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
+		if (IS_ERR(device)) {
 			kfree(cb);
 			bio_put(bio);
 			return BLK_STS_NOTSUPP;
 		}
 
-		map = em->map_lookup;
-		/* We only support single profile for now */
-		ASSERT(map->num_stripes == 1);
-		bdev = map->stripes[0].dev->bdev;
-
-		bio_set_dev(bio, bdev);
-		free_extent_map(em);
+		bio_set_dev(bio, device->bdev);
 	}
 
 	if (blkcg_css) {
···
 		}
 		if (bytes_left < PAGE_SIZE) {
 			btrfs_info(fs_info,
-					"bytes left %lu compress len %lu nr %lu",
+					"bytes left %lu compress len %u nr %u",
 			       bytes_left, cb->compressed_len, cb->nr_pages);
 		}
 		bytes_left -= PAGE_SIZE;
···
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct extent_map_tree *em_tree;
 	struct compressed_bio *cb;
-	unsigned long compressed_len;
-	unsigned long nr_pages;
-	unsigned long pg_index;
+	unsigned int compressed_len;
+	unsigned int nr_pages;
+	unsigned int pg_index;
 	struct page *page;
 	struct bio *comp_bio;
 	u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
···
 *
 * @total_out is an in/out parameter, must be set to the input length and will
 * be also used to return the total number of compressed bytes
- *
- * @max_out tells us the max number of bytes that we're allowed to
- * stuff into pages
 */
 int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
			 u64 start, struct page **pages,
···
 	return ret;
 }
 
-/*
- * pages_in is an array of pages with compressed data.
- *
- * disk_start is the starting logical offset of this array in the file
- *
- * orig_bio contains the pages from the file that we want to decompress into
- *
- * srclen is the number of bytes in pages_in
- *
- * The basic idea is that we have a bio that was created by readpages.
- * The pages in the bio are for the uncompressed data, and they may not
- * be contiguous.  They all correspond to the range of bytes covered by
- * the compressed extent.
- */
 static int btrfs_decompress_bio(struct compressed_bio *cb)
 {
 	struct list_head *workspace;
+13 -13
fs/btrfs/compression.h
···
 	/* number of bios pending for this compressed extent */
 	refcount_t pending_bios;
 
+	/* Number of compressed pages in the array */
+	unsigned int nr_pages;
+
 	/* the pages with the compressed data on them */
 	struct page **compressed_pages;
···
 	/* starting offset in the inode for our pages */
 	u64 start;
 
-	/* number of bytes in the inode we're working on */
-	unsigned long len;
+	/* Number of bytes in the inode we're working on */
+	unsigned int len;
 
-	/* number of bytes on disk */
-	unsigned long compressed_len;
+	/* Number of bytes on disk */
+	unsigned int compressed_len;
 
-	/* the compression algorithm for this bio */
-	int compress_type;
-
-	/* number of compressed pages in the array */
-	unsigned long nr_pages;
+	/* The compression algorithm for this bio */
+	u8 compress_type;
 
 	/* IO errors */
-	int errors;
+	u8 errors;
 	int mirror_num;
 
 	/* for reads, this is the bio we are copying the data into */
···
 			      struct bio *bio);
 
 blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
-				  unsigned long len, u64 disk_start,
-				  unsigned long compressed_len,
+				  unsigned int len, u64 disk_start,
+				  unsigned int compressed_len,
 				  struct page **compressed_pages,
-				  unsigned long nr_pages,
+				  unsigned int nr_pages,
 				  unsigned int write_flags,
 				  struct cgroup_subsys_state *blkcg_css);
 blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+1 -4
fs/btrfs/ctree.c
···
 		       trans->transid, fs_info->generation);
 
 	if (!should_cow_block(trans, root, buf)) {
-		trans->dirty = true;
 		*cow_ret = buf;
 		return 0;
 	}
···
 		 * then we don't want to set the path blocking,
 		 * so we test it here
 		 */
-		if (!should_cow_block(trans, root, b)) {
-			trans->dirty = true;
+		if (!should_cow_block(trans, root, b))
 			goto cow_done;
-		}
 
 		/*
 		 * must have write locks on this node and the
+71 -49
fs/btrfs/ctree.h
···
 	/*
 	 * Indicate that balance has been set up from the ioctl and is in the
 	 * main phase. The fs_info::balance_ctl is initialized.
-	 * Set and cleared while holding fs_info::balance_mutex.
 	 */
 	BTRFS_FS_BALANCE_RUNNING,
+
+	/*
+	 * Indicate that relocation of a chunk has started, it's set per chunk
+	 * and is toggled between chunks.
+	 * Set, tested and cleared while holding fs_info::send_reloc_lock.
+	 */
+	BTRFS_FS_RELOC_RUNNING,
 
 	/* Indicate that the cleaner thread is awake and doing something. */
 	BTRFS_FS_CLEANER_RUNNING,
···
 	struct kobject *space_info_kobj;
 	struct kobject *qgroups_kobj;
 
-	u64 total_pinned;
-
 	/* used to keep from writing metadata until there is a nice batch */
 	struct percpu_counter dirty_metadata_bytes;
 	struct percpu_counter delalloc_bytes;
···
 	atomic_t balance_cancel_req;
 	struct btrfs_balance_control *balance_ctl;
 	wait_queue_head_t balance_wait_q;
+
+	/* Cancellation requests for chunk relocation */
+	atomic_t reloc_cancel_req;
 
 	u32 data_chunk_allocations;
 	u32 metadata_ratio;
···
 
 	struct crypto_shash *csum_shash;
 
+	spinlock_t send_reloc_lock;
 	/*
 	 * Number of send operations in progress.
-	 * Updated while holding fs_info::balance_mutex.
+	 * Updated while holding fs_info::send_reloc_lock.
 	 */
 	int send_in_progress;
 
-	/* Type of exclusive operation running */
-	unsigned long exclusive_operation;
+	/* Type of exclusive operation running, protected by super_lock */
+	enum btrfs_exclusive_operation exclusive_operation;
 
 	/*
 	 * Zone size > 0 when in ZONED mode, otherwise it's used for a check
···
 *
 * Note: don't forget to add new options to btrfs_show_options()
 */
-#define BTRFS_MOUNT_NODATASUM		(1 << 0)
-#define BTRFS_MOUNT_NODATACOW		(1 << 1)
-#define BTRFS_MOUNT_NOBARRIER		(1 << 2)
-#define BTRFS_MOUNT_SSD			(1 << 3)
-#define BTRFS_MOUNT_DEGRADED		(1 << 4)
-#define BTRFS_MOUNT_COMPRESS		(1 << 5)
-#define BTRFS_MOUNT_NOTREELOG		(1 << 6)
-#define BTRFS_MOUNT_FLUSHONCOMMIT	(1 << 7)
-#define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
-#define BTRFS_MOUNT_NOSSD		(1 << 9)
-#define BTRFS_MOUNT_DISCARD_SYNC	(1 << 10)
-#define BTRFS_MOUNT_FORCE_COMPRESS	(1 << 11)
-#define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
-#define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
-#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
-#define BTRFS_MOUNT_ENOSPC_DEBUG	(1 << 15)
-#define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
-/* bit 17 is free */
-#define BTRFS_MOUNT_USEBACKUPROOT	(1 << 18)
-#define BTRFS_MOUNT_SKIP_BALANCE	(1 << 19)
-#define BTRFS_MOUNT_CHECK_INTEGRITY	(1 << 20)
-#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
-#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
-#define BTRFS_MOUNT_RESCAN_UUID_TREE	(1 << 23)
-#define BTRFS_MOUNT_FRAGMENT_DATA	(1 << 24)
-#define BTRFS_MOUNT_FRAGMENT_METADATA	(1 << 25)
-#define BTRFS_MOUNT_FREE_SPACE_TREE	(1 << 26)
-#define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
-#define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
-#define BTRFS_MOUNT_DISCARD_ASYNC	(1 << 29)
-#define BTRFS_MOUNT_IGNOREBADROOTS	(1 << 30)
-#define BTRFS_MOUNT_IGNOREDATACSUMS	(1 << 31)
+enum {
+	BTRFS_MOUNT_NODATASUM			= (1UL << 0),
+	BTRFS_MOUNT_NODATACOW			= (1UL << 1),
+	BTRFS_MOUNT_NOBARRIER			= (1UL << 2),
+	BTRFS_MOUNT_SSD				= (1UL << 3),
+	BTRFS_MOUNT_DEGRADED			= (1UL << 4),
+	BTRFS_MOUNT_COMPRESS			= (1UL << 5),
+	BTRFS_MOUNT_NOTREELOG			= (1UL << 6),
+	BTRFS_MOUNT_FLUSHONCOMMIT		= (1UL << 7),
+	BTRFS_MOUNT_SSD_SPREAD			= (1UL << 8),
+	BTRFS_MOUNT_NOSSD			= (1UL << 9),
+	BTRFS_MOUNT_DISCARD_SYNC		= (1UL << 10),
+	BTRFS_MOUNT_FORCE_COMPRESS		= (1UL << 11),
+	BTRFS_MOUNT_SPACE_CACHE			= (1UL << 12),
+	BTRFS_MOUNT_CLEAR_CACHE			= (1UL << 13),
+	BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED	= (1UL << 14),
+	BTRFS_MOUNT_ENOSPC_DEBUG		= (1UL << 15),
+	BTRFS_MOUNT_AUTO_DEFRAG			= (1UL << 16),
+	BTRFS_MOUNT_USEBACKUPROOT		= (1UL << 17),
+	BTRFS_MOUNT_SKIP_BALANCE		= (1UL << 18),
+	BTRFS_MOUNT_CHECK_INTEGRITY		= (1UL << 19),
+	BTRFS_MOUNT_CHECK_INTEGRITY_DATA	= (1UL << 20),
+	BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	= (1UL << 21),
+	BTRFS_MOUNT_RESCAN_UUID_TREE		= (1UL << 22),
+	BTRFS_MOUNT_FRAGMENT_DATA		= (1UL << 23),
+	BTRFS_MOUNT_FRAGMENT_METADATA		= (1UL << 24),
+	BTRFS_MOUNT_FREE_SPACE_TREE		= (1UL << 25),
+	BTRFS_MOUNT_NOLOGREPLAY			= (1UL << 26),
+	BTRFS_MOUNT_REF_VERIFY			= (1UL << 27),
+	BTRFS_MOUNT_DISCARD_ASYNC		= (1UL << 28),
+	BTRFS_MOUNT_IGNOREBADROOTS		= (1UL << 29),
+	BTRFS_MOUNT_IGNOREDATACSUMS		= (1UL << 30),
+};
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(2048)
···
 
 static inline bool btrfs_root_readonly(const struct btrfs_root *root)
 {
+	/* Byte-swap the constant at compile time, root_item::flags is LE */
 	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
 }
 
 static inline bool btrfs_root_dead(const struct btrfs_root *root)
 {
+	/* Byte-swap the constant at compile time, root_item::flags is LE */
 	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
 }
···
 	/*
 	 * Flush space by above mentioned methods and by:
 	 * - Running delayed iputs
-	 * - Commiting transaction
+	 * - Committing transaction
 	 *
-	 * Can be interruped by fatal signal.
+	 * Can be interrupted by a fatal signal.
 	 */
 	BTRFS_RESERVE_FLUSH_DATA,
 	BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
···
 	 * Pretty much the same as FLUSH_ALL, but can also steal space from
 	 * global rsv.
 	 *
-	 * Can be interruped by fatal signal.
+	 * Can be interrupted by a fatal signal.
 	 */
 	BTRFS_RESERVE_FLUSH_ALL_STEAL,
 };
···
 	ALLOC_CHUNK_FORCE	= 8,
 	RUN_DELAYED_IPUTS	= 9,
 	COMMIT_TRANS		= 10,
-	FORCE_COMMIT_TRANS	= 11,
 };
 
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
···
 /* inode.c */
 blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
 				   int mirror_num, unsigned long bio_flags);
-int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
-			   struct page *page, u64 start, u64 end);
+unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
+				    struct page *page, u64 start, u64 end);
 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
 					   u64 start, u64 len);
 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
···
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct btrfs_inode *inode, u64 new_size,
-			       u32 min_type);
+			       u32 min_type, u64 *extents_found);
 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
···
 			 struct extent_state *orig, u64 split);
 int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
 			     unsigned long bio_flags);
-bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio,
-				      unsigned int size);
-void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
···
 		u64 start, u64 end, int *page_started, unsigned long *nr_written,
 		struct writeback_control *wbc);
 int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
-void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
+void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
+					  struct page *page, u64 start,
 					  u64 end, int uptodate);
 extern const struct dentry_operations btrfs_dentry_operations;
 extern const struct iomap_ops btrfs_dio_iomap_ops;
···
 			       struct btrfs_ioctl_balance_args *bargs);
 bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
 			enum btrfs_exclusive_operation type);
+bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
+				 enum btrfs_exclusive_operation type);
+void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
 void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
 
 /* file.c */
···
 {
 	return fs_info->zoned != 0;
 }
+
+/*
+ * We use page status Private2 to indicate there is an ordered extent with
+ * unfinished IO.
+ *
+ * Rename the Private2 accessors to Ordered, to improve readability.
+ */
+#define PageOrdered(page)		PagePrivate2(page)
+#define SetPageOrdered(page)		SetPagePrivate2(page)
+#define ClearPageOrdered(page)		ClearPagePrivate2(page)
 
 #endif
+1 -1
fs/btrfs/delalloc-space.c
···
  *   ->outstanding_extents += 1 (current value is 1)
  *
  * -> set_delalloc
- *    ->outstanding_extents += 1 (currrent value is 2)
+ *    ->outstanding_extents += 1 (current value is 2)
  *
  * -> btrfs_delalloc_release_extents()
  *    ->outstanding_extents -= 1 (current value is 1)
+23 -18
fs/btrfs/delayed-inode.c
···
 {
 	struct btrfs_delayed_item *curr, *next;
 	int free_space;
-	int total_data_size = 0, total_size = 0;
+	int total_size = 0;
 	struct extent_buffer *leaf;
 	char *data_ptr;
 	struct btrfs_key *keys;
···
 	 */
 	while (total_size + next->data_len + sizeof(struct btrfs_item) <=
 	       free_space) {
-		total_data_size += next->data_len;
 		total_size += next->data_len + sizeof(struct btrfs_item);
 		list_add_tail(&next->tree_list, &head);
 		nitems++;
···
 
 static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
 {
-	struct btrfs_delayed_root *delayed_root;
 
-	ASSERT(delayed_node->root);
-	clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
-	delayed_node->count--;
+	if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) {
+		struct btrfs_delayed_root *delayed_root;
 
-	delayed_root = delayed_node->root->fs_info->delayed_root;
-	finish_one_item(delayed_root);
+		ASSERT(delayed_node->root);
+		delayed_node->count--;
+
+		delayed_root = delayed_node->root->fs_info->delayed_root;
+		finish_one_item(delayed_root);
+	}
 }
 
 static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
···
 	nofs_flag = memalloc_nofs_save();
 	ret = btrfs_lookup_inode(trans, root, path, &key, mod);
 	memalloc_nofs_restore(nofs_flag);
-	if (ret > 0) {
-		btrfs_release_path(path);
-		return -ENOENT;
-	} else if (ret < 0) {
-		return ret;
-	}
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret < 0)
+		goto out;
 
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
···
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
-		goto no_iref;
+		goto out;
 
 	path->slots[0]++;
 	if (path->slots[0] >= btrfs_header_nritems(leaf))
···
 	btrfs_del_item(trans, root, path);
 out:
 	btrfs_release_delayed_iref(node);
-no_iref:
 	btrfs_release_path(path);
 err_out:
 	btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
 	btrfs_release_delayed_inode(node);
+
+	/*
+	 * If we fail to update the delayed inode we need to abort the
+	 * transaction, because we could leave the inode with the improper
+	 * counts behind.
+	 */
+	if (ret && ret != -ENOENT)
+		btrfs_abort_transaction(trans, ret);
 
 	return ret;
···
 		btrfs_release_delayed_item(prev_item);
 	}
 
-	if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
-		btrfs_release_delayed_iref(delayed_node);
+	btrfs_release_delayed_iref(delayed_node);
 
 	if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
 		btrfs_delayed_inode_release_metadata(fs_info, delayed_node, false);
-26
fs/btrfs/delayed-ref.c
···
 	struct btrfs_delayed_ref_root *delayed_refs =
 		&trans->transaction->delayed_refs;
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	u64 flags = btrfs_ref_head_to_space_flags(existing);
 	int old_ref_mod;
 
 	BUG_ON(existing->is_data != update->is_data);
···
 			trans->delayed_ref_updates += csum_leaves;
 		}
 	}
-
-	/*
-	 * This handles the following conditions:
-	 *
-	 * 1. We had a ref mod of 0 or more and went negative, indicating that
-	 *    we may be freeing space, so add our space to the
-	 *    total_bytes_pinned counter.
-	 * 2. We were negative and went to 0 or positive, so no longer can say
-	 *    that the space would be pinned, decrement our counter from the
-	 *    total_bytes_pinned counter.
-	 * 3. We are now at 0 and have ->must_insert_reserved set, which means
-	 *    this was a new allocation and then we dropped it, and thus must
-	 *    add our space to the total_bytes_pinned counter.
-	 */
-	if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
-		btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
-	else if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
-		btrfs_mod_total_bytes_pinned(fs_info, flags, -existing->num_bytes);
-	else if (existing->total_ref_mod == 0 && existing->must_insert_reserved)
-		btrfs_mod_total_bytes_pinned(fs_info, flags, existing->num_bytes);
 
 	spin_unlock(&existing->lock);
 }
···
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		head_ref = existing;
 	} else {
-		u64 flags = btrfs_ref_head_to_space_flags(head_ref);
-
 		if (head_ref->is_data && head_ref->ref_mod < 0) {
 			delayed_refs->pending_csums += head_ref->num_bytes;
 			trans->delayed_ref_updates +=
 				btrfs_csum_bytes_to_leaves(trans->fs_info,
 							   head_ref->num_bytes);
 		}
-		if (head_ref->ref_mod < 0)
-			btrfs_mod_total_bytes_pinned(trans->fs_info, flags,
-						     head_ref->num_bytes);
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		atomic_inc(&delayed_refs->num_entries);
+1 -1
fs/btrfs/dev-replace.c
···
  * - Write duplication
  *
  *   All new writes will be written to both target and source devices, so even
- *   if replace gets canceled, sources device still contans up-to-date data.
+ *   if replace gets canceled, sources device still contains up-to-date data.
  *
  *   Location:		handle_ops_on_dev_replace() from __btrfs_map_block()
  * Start:		btrfs_dev_replace_start()
+1 -1
fs/btrfs/discard.c
···
  * @fs_info: fs_info of interest
  *
  * The unused_bgs list needs to be punted to the discard lists because the
- * order of operations is changed. In the normal sychronous discard path, the
+ * order of operations is changed. In the normal synchronous discard path, the
  * block groups are trimmed via a single large trim in transaction commit. This
  * is ultimately what we are trying to avoid with asynchronous discard. Thus,
  * it must be done before going down the unused_bgs path.
+18 -37
fs/btrfs/disk-io.c
··· 241 241 { 242 242 struct extent_state *cached_state = NULL; 243 243 int ret; 244 - bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB); 245 244 246 245 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 247 246 return 0; 248 247 249 248 if (atomic) 250 249 return -EAGAIN; 251 - 252 - if (need_lock) 253 - btrfs_tree_read_lock(eb); 254 250 255 251 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, 256 252 &cached_state); ··· 260 264 eb->start, 261 265 parent_transid, btrfs_header_generation(eb)); 262 266 ret = 1; 263 - 264 - /* 265 - * Things reading via commit roots that don't have normal protection, 266 - * like send, can have a really old block in cache that may point at a 267 - * block that has been freed and re-allocated. So don't clear uptodate 268 - * if we find an eb that is under IO (dirty/writeback) because we could 269 - * end up reading in the stale data and then writing it back out and 270 - * making everybody very sad. 271 - */ 272 - if (!extent_buffer_under_io(eb)) 273 - clear_extent_buffer_uptodate(eb); 267 + clear_extent_buffer_uptodate(eb); 274 268 out: 275 269 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1, 276 270 &cached_state); 277 - if (need_lock) 278 - btrfs_tree_read_unlock(eb); 279 271 return ret; 280 272 } 281 273 ··· 568 584 const u32 csum_size = fs_info->csum_size; 569 585 u8 found_level; 570 586 u8 result[BTRFS_CSUM_SIZE]; 587 + const u8 *header_csum; 571 588 int ret = 0; 572 589 573 590 found_start = btrfs_header_bytenr(eb); ··· 593 608 } 594 609 595 610 csum_tree_block(eb, result); 611 + header_csum = page_address(eb->pages[0]) + 612 + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum)); 596 613 597 - if (memcmp_extent_buffer(eb, result, 0, csum_size)) { 598 - u8 val[BTRFS_CSUM_SIZE] = { 0 }; 599 - 600 - read_extent_buffer(eb, &val, 0, csum_size); 614 + if (memcmp(result, header_csum, csum_size) != 0) { 601 615 btrfs_warn_rl(fs_info, 602 - "%s checksum verify 
failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", 603 - fs_info->sb->s_id, eb->start, 604 - CSUM_FMT_VALUE(csum_size, val), 616 + "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", 617 + eb->start, 618 + CSUM_FMT_VALUE(csum_size, header_csum), 605 619 CSUM_FMT_VALUE(csum_size, result), 606 620 btrfs_header_level(eb)); 607 621 ret = -EUCLEAN; ··· 901 917 return btree_csum_one_bio(bio); 902 918 } 903 919 904 - static int check_async_write(struct btrfs_fs_info *fs_info, 920 + static bool should_async_write(struct btrfs_fs_info *fs_info, 905 921 struct btrfs_inode *bi) 906 922 { 907 923 if (btrfs_is_zoned(fs_info)) 908 - return 0; 924 + return false; 909 925 if (atomic_read(&bi->sync_writers)) 910 - return 0; 926 + return false; 911 927 if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) 912 - return 0; 913 - return 1; 928 + return false; 929 + return true; 914 930 } 915 931 916 932 blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, 917 933 int mirror_num, unsigned long bio_flags) 918 934 { 919 935 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 920 - int async = check_async_write(fs_info, BTRFS_I(inode)); 921 936 blk_status_t ret; 922 937 923 938 if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ··· 929 946 if (ret) 930 947 goto out_w_error; 931 948 ret = btrfs_map_bio(fs_info, bio, mirror_num); 932 - } else if (!async) { 949 + } else if (!should_async_write(fs_info, BTRFS_I(inode))) { 933 950 ret = btree_csum_one_bio(bio); 934 951 if (ret) 935 952 goto out_w_error; ··· 2235 2252 atomic_set(&fs_info->balance_cancel_req, 0); 2236 2253 fs_info->balance_ctl = NULL; 2237 2254 init_waitqueue_head(&fs_info->balance_wait_q); 2255 + atomic_set(&fs_info->reloc_cancel_req, 0); 2238 2256 } 2239 2257 2240 2258 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) ··· 2983 2999 spin_lock_init(&fs_info->swapfile_pins_lock); 2984 3000 fs_info->swapfile_pins = RB_ROOT; 2985 3001 3002 + 
spin_lock_init(&fs_info->send_reloc_lock); 2986 3003 fs_info->send_in_progress = 0; 2987 3004 2988 3005 fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; ··· 3456 3471 * At this point we know all the devices that make this filesystem, 3457 3472 * including the seed devices but we don't know yet if the replace 3458 3473 * target is required. So free devices that are not part of this 3459 - * filesystem but skip the replace traget device which is checked 3474 + * filesystem but skip the replace target device which is checked 3460 3475 * below in btrfs_init_dev_replace(). 3461 3476 */ 3462 3477 btrfs_free_extra_devids(fs_devices); ··· 3583 3598 if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { 3584 3599 ret = btrfsic_mount(fs_info, fs_devices, 3585 3600 btrfs_test_opt(fs_info, 3586 - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 3587 - 1 : 0, 3601 + CHECK_INTEGRITY_DATA) ? 1 : 0, 3588 3602 fs_info->check_integrity_print_mask); 3589 3603 if (ret) 3590 3604 btrfs_warn(fs_info, ··· 4680 4696 cache->space_info->bytes_reserved -= head->num_bytes; 4681 4697 spin_unlock(&cache->lock); 4682 4698 spin_unlock(&cache->space_info->lock); 4683 - percpu_counter_add_batch( 4684 - &cache->space_info->total_bytes_pinned, 4685 - head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH); 4686 4699 4687 4700 btrfs_put_block_group(cache); 4688 4701
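The disk-io.c hunks above replace the copy-then-compare checksum check with a direct memcmp() against the csum bytes stored at the head of the block. A minimal standalone sketch of that pattern, using a toy checksum (all names here are illustrative; the kernel actually uses crc32c/xxhash/sha256/blake2b via csum_tree_block()):

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define CSUM_SIZE 4

/* Toy stand-in for csum_tree_block(): a simple FNV-1a style mix,
 * just enough to model the comparison, NOT the kernel's algorithm. */
static void toy_csum(const uint8_t *data, size_t len, uint8_t out[CSUM_SIZE])
{
	uint32_t acc = 0x811c9dc5u;

	while (len--)
		acc = (acc ^ *data++) * 0x01000193u;
	memcpy(out, &acc, CSUM_SIZE);
}

/*
 * Model of the new check: compute the checksum of the block payload
 * and memcmp() it against the csum bytes embedded in place at the
 * start of the block, instead of copying them out first.
 * Returns 0 on match, nonzero on mismatch (like memcmp).
 */
static int verify_block(const uint8_t *block, size_t len)
{
	uint8_t result[CSUM_SIZE];
	const uint8_t *header_csum = block;	/* csum lives at offset 0 */

	toy_csum(block + CSUM_SIZE, len - CSUM_SIZE, result);
	return memcmp(result, header_csum, CSUM_SIZE);
}
```

The in-place comparison avoids the temporary buffer and the extra read_extent_buffer() copy the old code paid on every metadata read.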
+1 -17
fs/btrfs/extent-tree.c
··· 1425 1425 * bytenr of the parent block. Since new extents are always 1426 1426 * created with indirect references, this will only be the case 1427 1427 * when relocating a shared extent. In that case, root_objectid 1428 - * will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must 1428 + * will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must 1429 1429 * be 0 1430 1430 * 1431 1431 * @root_objectid: The id of the root where this modification has originated, ··· 1802 1802 delayed_refs->pending_csums -= head->num_bytes; 1803 1803 spin_unlock(&delayed_refs->lock); 1804 1804 nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); 1805 - } 1806 - 1807 - /* 1808 - * We were dropping refs, or had a new ref and dropped it, and thus must 1809 - * adjust down our total_bytes_pinned, the space may or may not have 1810 - * been pinned and so is accounted for properly in the pinned space by 1811 - * now. 1812 - */ 1813 - if (head->total_ref_mod < 0 || 1814 - (head->total_ref_mod == 0 && head->must_insert_reserved)) { 1815 - u64 flags = btrfs_ref_head_to_space_flags(head); 1816 - 1817 - btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes); 1818 1805 } 1819 1806 1820 1807 btrfs_delayed_refs_rsv_release(fs_info, nr_items); ··· 2538 2551 spin_unlock(&cache->lock); 2539 2552 spin_unlock(&cache->space_info->lock); 2540 2553 2541 - __btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes); 2542 2554 set_extent_dirty(&trans->transaction->pinned_extents, bytenr, 2543 2555 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); 2544 2556 return 0; ··· 2748 2762 cache->pinned -= len; 2749 2763 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); 2750 2764 space_info->max_extent_size = 0; 2751 - __btrfs_mod_total_bytes_pinned(space_info, -len); 2752 2765 if (cache->ro) { 2753 2766 space_info->bytes_readonly += len; 2754 2767 readonly = true; ··· 4769 4784 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 4770 4785 buf->start + buf->len - 1, 
GFP_NOFS); 4771 4786 } 4772 - trans->dirty = true; 4773 4787 /* this returns a buffer locked for blocking */ 4774 4788 return buf; 4775 4789 }
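The extent-tree.c hunks drop the total_bytes_pinned bookkeeping, which was maintained through batched per-CPU counter updates (percpu_counter_add_batch() with BTRFS_TOTAL_BYTES_PINNED_BATCH). For readers unfamiliar with that pattern, here is a single-threaded model of threshold-batched counting; the struct and function names are invented for illustration, not the kernel's percpu_counter API:

```c
#include <assert.h>
#include <stdint.h>

/*
 * Deltas accumulate in a cheap local slot and fold into the shared
 * total only once they exceed the batch threshold, so the hot path
 * rarely touches the contended counter. In the kernel the local slot
 * is per-CPU; this sketch collapses it to one field.
 */
struct batched_counter {
	int64_t total;	/* shared, expensive to update */
	int64_t local;	/* per-CPU delta in the real implementation */
	int64_t batch;	/* fold threshold */
};

static void counter_add_batch(struct batched_counter *c, int64_t amount)
{
	c->local += amount;
	if (c->local >= c->batch || c->local <= -c->batch) {
		c->total += c->local;
		c->local = 0;
	}
}

/* An accurate read has to fold in the pending local delta. */
static int64_t counter_sum(const struct batched_counter *c)
{
	return c->total + c->local;
}
```

The trade-off is that `total` alone is only approximate between folds, which is why such counters are suited to heuristics rather than exact accounting.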
+568 -405
fs/btrfs/extent_io.c
··· 136 136 }; 137 137 138 138 struct extent_page_data { 139 - struct bio *bio; 139 + struct btrfs_bio_ctrl bio_ctrl; 140 140 /* tells writepage not to lock the state bits for this range 141 141 * it still does the unlocking 142 142 */ ··· 185 185 /* Cleanup unsubmitted bios */ 186 186 static void end_write_bio(struct extent_page_data *epd, int ret) 187 187 { 188 - if (epd->bio) { 189 - epd->bio->bi_status = errno_to_blk_status(ret); 190 - bio_endio(epd->bio); 191 - epd->bio = NULL; 188 + struct bio *bio = epd->bio_ctrl.bio; 189 + 190 + if (bio) { 191 + bio->bi_status = errno_to_blk_status(ret); 192 + bio_endio(bio); 193 + epd->bio_ctrl.bio = NULL; 192 194 } 193 195 } 194 196 ··· 203 201 static int __must_check flush_write_bio(struct extent_page_data *epd) 204 202 { 205 203 int ret = 0; 204 + struct bio *bio = epd->bio_ctrl.bio; 206 205 207 - if (epd->bio) { 208 - ret = submit_one_bio(epd->bio, 0, 0); 206 + if (bio) { 207 + ret = submit_one_bio(bio, 0, 0); 209 208 /* 210 209 * Clean up of epd->bio is handled by its endio function. 211 210 * And endio is either triggered by successful bio execution ··· 214 211 * So at this point, no matter what happened, we don't need 215 212 * to clean up epd->bio. 216 213 */ 217 - epd->bio = NULL; 214 + epd->bio_ctrl.bio = NULL; 218 215 } 219 216 return ret; 220 217 } ··· 1808 1805 return found; 1809 1806 } 1810 1807 1808 + /* 1809 + * Process one page for __process_pages_contig(). 1810 + * 1811 + * Return >0 if we hit @page == @locked_page. 1812 + * Return 0 if we updated the page status. 1813 + * Return -EAGAIN if we need to try again.
1814 + * (For PAGE_LOCK case but got dirty page or page not belong to mapping) 1815 + */ 1816 + static int process_one_page(struct btrfs_fs_info *fs_info, 1817 + struct address_space *mapping, 1818 + struct page *page, struct page *locked_page, 1819 + unsigned long page_ops, u64 start, u64 end) 1820 + { 1821 + u32 len; 1822 + 1823 + ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); 1824 + len = end + 1 - start; 1825 + 1826 + if (page_ops & PAGE_SET_ORDERED) 1827 + btrfs_page_clamp_set_ordered(fs_info, page, start, len); 1828 + if (page_ops & PAGE_SET_ERROR) 1829 + btrfs_page_clamp_set_error(fs_info, page, start, len); 1830 + if (page_ops & PAGE_START_WRITEBACK) { 1831 + btrfs_page_clamp_clear_dirty(fs_info, page, start, len); 1832 + btrfs_page_clamp_set_writeback(fs_info, page, start, len); 1833 + } 1834 + if (page_ops & PAGE_END_WRITEBACK) 1835 + btrfs_page_clamp_clear_writeback(fs_info, page, start, len); 1836 + 1837 + if (page == locked_page) 1838 + return 1; 1839 + 1840 + if (page_ops & PAGE_LOCK) { 1841 + int ret; 1842 + 1843 + ret = btrfs_page_start_writer_lock(fs_info, page, start, len); 1844 + if (ret) 1845 + return ret; 1846 + if (!PageDirty(page) || page->mapping != mapping) { 1847 + btrfs_page_end_writer_lock(fs_info, page, start, len); 1848 + return -EAGAIN; 1849 + } 1850 + } 1851 + if (page_ops & PAGE_UNLOCK) 1852 + btrfs_page_end_writer_lock(fs_info, page, start, len); 1853 + return 0; 1854 + } 1855 + 1811 1856 static int __process_pages_contig(struct address_space *mapping, 1812 1857 struct page *locked_page, 1813 - pgoff_t start_index, pgoff_t end_index, 1814 - unsigned long page_ops, pgoff_t *index_ret); 1858 + u64 start, u64 end, unsigned long page_ops, 1859 + u64 *processed_end) 1860 + { 1861 + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); 1862 + pgoff_t start_index = start >> PAGE_SHIFT; 1863 + pgoff_t end_index = end >> PAGE_SHIFT; 1864 + pgoff_t index = start_index; 1865 + unsigned long nr_pages = end_index - 
start_index + 1; 1866 + unsigned long pages_processed = 0; 1867 + struct page *pages[16]; 1868 + int err = 0; 1869 + int i; 1870 + 1871 + if (page_ops & PAGE_LOCK) { 1872 + ASSERT(page_ops == PAGE_LOCK); 1873 + ASSERT(processed_end && *processed_end == start); 1874 + } 1875 + 1876 + if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) 1877 + mapping_set_error(mapping, -EIO); 1878 + 1879 + while (nr_pages > 0) { 1880 + int found_pages; 1881 + 1882 + found_pages = find_get_pages_contig(mapping, index, 1883 + min_t(unsigned long, 1884 + nr_pages, ARRAY_SIZE(pages)), pages); 1885 + if (found_pages == 0) { 1886 + /* 1887 + * Only when we're going to lock these pages can we 1888 + * find nothing at @index. 1889 + */ 1890 + ASSERT(page_ops & PAGE_LOCK); 1891 + err = -EAGAIN; 1892 + goto out; 1893 + } 1894 + 1895 + for (i = 0; i < found_pages; i++) { 1896 + int process_ret; 1897 + 1898 + process_ret = process_one_page(fs_info, mapping, 1899 + pages[i], locked_page, page_ops, 1900 + start, end); 1901 + if (process_ret < 0) { 1902 + for (; i < found_pages; i++) 1903 + put_page(pages[i]); 1904 + err = -EAGAIN; 1905 + goto out; 1906 + } 1907 + put_page(pages[i]); 1908 + pages_processed++; 1909 + } 1910 + nr_pages -= found_pages; 1911 + index += found_pages; 1912 + cond_resched(); 1913 + } 1914 + out: 1915 + if (err && processed_end) { 1916 + /* 1917 + * Update @processed_end. I know this is awful since it has 1918 + * two different return value patterns (inclusive vs exclusive). 1919 + * 1920 + * But the exclusive pattern is necessary if @start is 0, otherwise 1921 + * we would underflow and the check against processed_end wouldn't 1922 + * work as expected.
1923 + */ 1924 + if (pages_processed) 1925 + *processed_end = min(end, 1926 + ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); 1927 + else 1928 + *processed_end = start; 1929 + } 1930 + return err; 1931 + } 1815 1932 1816 1933 static noinline void __unlock_for_delalloc(struct inode *inode, 1817 1934 struct page *locked_page, ··· 1944 1821 if (index == locked_page->index && end_index == index) 1945 1822 return; 1946 1823 1947 - __process_pages_contig(inode->i_mapping, locked_page, index, end_index, 1824 + __process_pages_contig(inode->i_mapping, locked_page, start, end, 1948 1825 PAGE_UNLOCK, NULL); 1949 1826 } 1950 1827 ··· 1954 1831 u64 delalloc_end) 1955 1832 { 1956 1833 unsigned long index = delalloc_start >> PAGE_SHIFT; 1957 - unsigned long index_ret = index; 1958 1834 unsigned long end_index = delalloc_end >> PAGE_SHIFT; 1835 + u64 processed_end = delalloc_start; 1959 1836 int ret; 1960 1837 1961 1838 ASSERT(locked_page); 1962 1839 if (index == locked_page->index && index == end_index) 1963 1840 return 0; 1964 1841 1965 - ret = __process_pages_contig(inode->i_mapping, locked_page, index, 1966 - end_index, PAGE_LOCK, &index_ret); 1967 - if (ret == -EAGAIN) 1842 + ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, 1843 + delalloc_end, PAGE_LOCK, &processed_end); 1844 + if (ret == -EAGAIN && processed_end > delalloc_start) 1968 1845 __unlock_for_delalloc(inode, locked_page, delalloc_start, 1969 - (u64)index_ret << PAGE_SHIFT); 1846 + processed_end); 1970 1847 return ret; 1971 1848 } 1972 1849 ··· 2059 1936 return found; 2060 1937 } 2061 1938 2062 - static int __process_pages_contig(struct address_space *mapping, 2063 - struct page *locked_page, 2064 - pgoff_t start_index, pgoff_t end_index, 2065 - unsigned long page_ops, pgoff_t *index_ret) 2066 - { 2067 - unsigned long nr_pages = end_index - start_index + 1; 2068 - unsigned long pages_processed = 0; 2069 - pgoff_t index = start_index; 2070 - struct page *pages[16]; 2071 - 
unsigned ret; 2072 - int err = 0; 2073 - int i; 2074 - 2075 - if (page_ops & PAGE_LOCK) { 2076 - ASSERT(page_ops == PAGE_LOCK); 2077 - ASSERT(index_ret && *index_ret == start_index); 2078 - } 2079 - 2080 - if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) 2081 - mapping_set_error(mapping, -EIO); 2082 - 2083 - while (nr_pages > 0) { 2084 - ret = find_get_pages_contig(mapping, index, 2085 - min_t(unsigned long, 2086 - nr_pages, ARRAY_SIZE(pages)), pages); 2087 - if (ret == 0) { 2088 - /* 2089 - * Only if we're going to lock these pages, 2090 - * can we find nothing at @index. 2091 - */ 2092 - ASSERT(page_ops & PAGE_LOCK); 2093 - err = -EAGAIN; 2094 - goto out; 2095 - } 2096 - 2097 - for (i = 0; i < ret; i++) { 2098 - if (page_ops & PAGE_SET_PRIVATE2) 2099 - SetPagePrivate2(pages[i]); 2100 - 2101 - if (locked_page && pages[i] == locked_page) { 2102 - put_page(pages[i]); 2103 - pages_processed++; 2104 - continue; 2105 - } 2106 - if (page_ops & PAGE_START_WRITEBACK) { 2107 - clear_page_dirty_for_io(pages[i]); 2108 - set_page_writeback(pages[i]); 2109 - } 2110 - if (page_ops & PAGE_SET_ERROR) 2111 - SetPageError(pages[i]); 2112 - if (page_ops & PAGE_END_WRITEBACK) 2113 - end_page_writeback(pages[i]); 2114 - if (page_ops & PAGE_UNLOCK) 2115 - unlock_page(pages[i]); 2116 - if (page_ops & PAGE_LOCK) { 2117 - lock_page(pages[i]); 2118 - if (!PageDirty(pages[i]) || 2119 - pages[i]->mapping != mapping) { 2120 - unlock_page(pages[i]); 2121 - for (; i < ret; i++) 2122 - put_page(pages[i]); 2123 - err = -EAGAIN; 2124 - goto out; 2125 - } 2126 - } 2127 - put_page(pages[i]); 2128 - pages_processed++; 2129 - } 2130 - nr_pages -= ret; 2131 - index += ret; 2132 - cond_resched(); 2133 - } 2134 - out: 2135 - if (err && index_ret) 2136 - *index_ret = start_index + pages_processed - 1; 2137 - return err; 2138 - } 2139 - 2140 1939 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, 2141 1940 struct page *locked_page, 2142 1941 u32 clear_bits, unsigned long 
page_ops) ··· 2066 2021 clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL); 2067 2022 2068 2023 __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, 2069 - start >> PAGE_SHIFT, end >> PAGE_SHIFT, 2070 - page_ops, NULL); 2024 + start, end, page_ops, NULL); 2071 2025 } 2072 2026 2073 2027 /* ··· 2425 2381 2426 2382 BUG_ON(!failrec->this_mirror); 2427 2383 2428 - if (failrec->in_validation) { 2429 - /* there was no real error, just free the record */ 2430 - btrfs_debug(fs_info, 2431 - "clean_io_failure: freeing dummy error at %llu", 2432 - failrec->start); 2433 - goto out; 2434 - } 2435 2384 if (sb_rdonly(fs_info->sb)) 2436 2385 goto out; 2437 2386 ··· 2486 2449 } 2487 2450 2488 2451 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, 2489 - u64 start, u64 end) 2452 + u64 start) 2490 2453 { 2491 2454 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2492 2455 struct io_failure_record *failrec; ··· 2494 2457 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2495 2458 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 2496 2459 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 2460 + const u32 sectorsize = fs_info->sectorsize; 2497 2461 int ret; 2498 2462 u64 logical; 2499 2463 2500 2464 failrec = get_state_failrec(failure_tree, start); 2501 2465 if (!IS_ERR(failrec)) { 2502 2466 btrfs_debug(fs_info, 2503 - "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d", 2504 - failrec->logical, failrec->start, failrec->len, 2505 - failrec->in_validation); 2467 + "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu", 2468 + failrec->logical, failrec->start, failrec->len); 2506 2469 /* 2507 2470 * when data can be on disk more than twice, add to failrec here 2508 2471 * (e.g. 
with a list for failed_mirror) to make ··· 2517 2480 return ERR_PTR(-ENOMEM); 2518 2481 2519 2482 failrec->start = start; 2520 - failrec->len = end - start + 1; 2483 + failrec->len = sectorsize; 2521 2484 failrec->this_mirror = 0; 2522 2485 failrec->bio_flags = 0; 2523 - failrec->in_validation = 0; 2524 2486 2525 2487 read_lock(&em_tree->lock); 2526 2488 em = lookup_extent_mapping(em_tree, start, failrec->len); ··· 2555 2519 free_extent_map(em); 2556 2520 2557 2521 /* Set the bits in the private failure tree */ 2558 - ret = set_extent_bits(failure_tree, start, end, 2522 + ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, 2559 2523 EXTENT_LOCKED | EXTENT_DIRTY); 2560 2524 if (ret >= 0) { 2561 2525 ret = set_state_failrec(failure_tree, start, failrec); 2562 2526 /* Set the bits in the inode's tree */ 2563 - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); 2527 + ret = set_extent_bits(tree, start, start + sectorsize - 1, 2528 + EXTENT_DAMAGED); 2564 2529 } else if (ret < 0) { 2565 2530 kfree(failrec); 2566 2531 return ERR_PTR(ret); ··· 2570 2533 return failrec; 2571 2534 } 2572 2535 2573 - static bool btrfs_check_repairable(struct inode *inode, bool needs_validation, 2536 + static bool btrfs_check_repairable(struct inode *inode, 2574 2537 struct io_failure_record *failrec, 2575 2538 int failed_mirror) 2576 2539 { ··· 2590 2553 return false; 2591 2554 } 2592 2555 2556 + /* The failure record should only contain one sector */ 2557 + ASSERT(failrec->len == fs_info->sectorsize); 2558 + 2593 2559 /* 2594 - * there are two premises: 2595 - * a) deliver good data to the caller 2596 - * b) correct the bad sectors on disk 2560 + * There are two premises: 2561 + * a) deliver good data to the caller 2562 + * b) correct the bad sectors on disk 2563 + * 2564 + * Since we're only doing repair for one sector, we only need to get 2565 + * a good copy of the failed sector and if we succeed, we have setup 2566 + * everything for repair_io_failure to do the rest 
for us. 2597 2567 */ 2598 - if (needs_validation) { 2599 - /* 2600 - * to fulfill b), we need to know the exact failing sectors, as 2601 - * we don't want to rewrite any more than the failed ones. thus, 2602 - * we need separate read requests for the failed bio 2603 - * 2604 - * if the following BUG_ON triggers, our validation request got 2605 - * merged. we need separate requests for our algorithm to work. 2606 - */ 2607 - BUG_ON(failrec->in_validation); 2608 - failrec->in_validation = 1; 2609 - failrec->this_mirror = failed_mirror; 2610 - } else { 2611 - /* 2612 - * we're ready to fulfill a) and b) alongside. get a good copy 2613 - * of the failed sector and if we succeed, we have setup 2614 - * everything for repair_io_failure to do the rest for us. 2615 - */ 2616 - if (failrec->in_validation) { 2617 - BUG_ON(failrec->this_mirror != failed_mirror); 2618 - failrec->in_validation = 0; 2619 - failrec->this_mirror = 0; 2620 - } 2621 - failrec->failed_mirror = failed_mirror; 2568 + failrec->failed_mirror = failed_mirror; 2569 + failrec->this_mirror++; 2570 + if (failrec->this_mirror == failed_mirror) 2622 2571 failrec->this_mirror++; 2623 - if (failrec->this_mirror == failed_mirror) 2624 - failrec->this_mirror++; 2625 - } 2626 2572 2627 2573 if (failrec->this_mirror > num_copies) { 2628 2574 btrfs_debug(fs_info, ··· 2617 2597 return true; 2618 2598 } 2619 2599 2620 - static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio) 2621 - { 2622 - u64 len = 0; 2623 - const u32 blocksize = inode->i_sb->s_blocksize; 2624 - 2625 - /* 2626 - * If bi_status is BLK_STS_OK, then this was a checksum error, not an 2627 - * I/O error. In this case, we already know exactly which sector was 2628 - * bad, so we don't need to validate. 2629 - */ 2630 - if (bio->bi_status == BLK_STS_OK) 2631 - return false; 2632 - 2633 - /* 2634 - * We need to validate each sector individually if the failed I/O was 2635 - * for multiple sectors. 
2636 - * 2637 - * There are a few possible bios that can end up here: 2638 - * 1. A buffered read bio, which is not cloned. 2639 - * 2. A direct I/O read bio, which is cloned. 2640 - * 3. A (buffered or direct) repair bio, which is not cloned. 2641 - * 2642 - * For cloned bios (case 2), we can get the size from 2643 - * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get 2644 - * it from the bvecs. 2645 - */ 2646 - if (bio_flagged(bio, BIO_CLONED)) { 2647 - if (btrfs_io_bio(bio)->iter.bi_size > blocksize) 2648 - return true; 2649 - } else { 2650 - struct bio_vec *bvec; 2651 - int i; 2652 - 2653 - bio_for_each_bvec_all(bvec, bio, i) { 2654 - len += bvec->bv_len; 2655 - if (len > blocksize) 2656 - return true; 2657 - } 2658 - } 2659 - return false; 2660 - } 2661 - 2662 - blk_status_t btrfs_submit_read_repair(struct inode *inode, 2663 - struct bio *failed_bio, u32 bio_offset, 2664 - struct page *page, unsigned int pgoff, 2665 - u64 start, u64 end, int failed_mirror, 2666 - submit_bio_hook_t *submit_bio_hook) 2600 + int btrfs_repair_one_sector(struct inode *inode, 2601 + struct bio *failed_bio, u32 bio_offset, 2602 + struct page *page, unsigned int pgoff, 2603 + u64 start, int failed_mirror, 2604 + submit_bio_hook_t *submit_bio_hook) 2667 2605 { 2668 2606 struct io_failure_record *failrec; 2669 2607 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); ··· 2629 2651 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; 2630 2652 struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); 2631 2653 const int icsum = bio_offset >> fs_info->sectorsize_bits; 2632 - bool need_validation; 2633 2654 struct bio *repair_bio; 2634 2655 struct btrfs_io_bio *repair_io_bio; 2635 2656 blk_status_t status; ··· 2638 2661 2639 2662 BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2640 2663 2641 - failrec = btrfs_get_io_failure_record(inode, start, end); 2664 + failrec = btrfs_get_io_failure_record(inode, start); 2642 2665 if (IS_ERR(failrec)) 2643 - 
return errno_to_blk_status(PTR_ERR(failrec)); 2666 + return PTR_ERR(failrec); 2644 2667 2645 - need_validation = btrfs_io_needs_validation(inode, failed_bio); 2646 2668 2647 - if (!btrfs_check_repairable(inode, need_validation, failrec, 2648 - failed_mirror)) { 2669 + if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { 2649 2670 free_io_failure(failure_tree, tree, failrec); 2650 - return BLK_STS_IOERR; 2671 + return -EIO; 2651 2672 } 2652 2673 2653 2674 repair_bio = btrfs_io_bio_alloc(1); 2654 2675 repair_io_bio = btrfs_io_bio(repair_bio); 2655 2676 repair_bio->bi_opf = REQ_OP_READ; 2656 - if (need_validation) 2657 - repair_bio->bi_opf |= REQ_FAILFAST_DEV; 2658 2677 repair_bio->bi_end_io = failed_bio->bi_end_io; 2659 2678 repair_bio->bi_iter.bi_sector = failrec->logical >> 9; 2660 2679 repair_bio->bi_private = failed_bio->bi_private; ··· 2668 2695 repair_io_bio->iter = repair_bio->bi_iter; 2669 2696 2670 2697 btrfs_debug(btrfs_sb(inode->i_sb), 2671 - "repair read error: submitting new read to mirror %d, in_validation=%d", 2672 - failrec->this_mirror, failrec->in_validation); 2698 + "repair read error: submitting new read to mirror %d", 2699 + failrec->this_mirror); 2673 2700 2674 2701 status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, 2675 2702 failrec->bio_flags); ··· 2677 2704 free_io_failure(failure_tree, tree, failrec); 2678 2705 bio_put(repair_bio); 2679 2706 } 2680 - return status; 2707 + return blk_status_to_errno(status); 2708 + } 2709 + 2710 + static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) 2711 + { 2712 + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 2713 + 2714 + ASSERT(page_offset(page) <= start && 2715 + start + len <= page_offset(page) + PAGE_SIZE); 2716 + 2717 + if (uptodate) { 2718 + btrfs_page_set_uptodate(fs_info, page, start, len); 2719 + } else { 2720 + btrfs_page_clear_uptodate(fs_info, page, start, len); 2721 + btrfs_page_set_error(fs_info, page, start, len); 2722 
+ } 2723 + 2724 + if (fs_info->sectorsize == PAGE_SIZE) 2725 + unlock_page(page); 2726 + else 2727 + btrfs_subpage_end_reader(fs_info, page, start, len); 2728 + } 2729 + 2730 + static blk_status_t submit_read_repair(struct inode *inode, 2731 + struct bio *failed_bio, u32 bio_offset, 2732 + struct page *page, unsigned int pgoff, 2733 + u64 start, u64 end, int failed_mirror, 2734 + unsigned int error_bitmap, 2735 + submit_bio_hook_t *submit_bio_hook) 2736 + { 2737 + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2738 + const u32 sectorsize = fs_info->sectorsize; 2739 + const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; 2740 + int error = 0; 2741 + int i; 2742 + 2743 + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); 2744 + 2745 + /* We're here because we had some read errors or csum mismatch */ 2746 + ASSERT(error_bitmap); 2747 + 2748 + /* 2749 + * We only get called on buffered IO, thus page must be mapped and bio 2750 + * must not be cloned. 2751 + */ 2752 + ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); 2753 + 2754 + /* Iterate through all the sectors in the range */ 2755 + for (i = 0; i < nr_bits; i++) { 2756 + const unsigned int offset = i * sectorsize; 2757 + struct extent_state *cached = NULL; 2758 + bool uptodate = false; 2759 + int ret; 2760 + 2761 + if (!(error_bitmap & (1U << i))) { 2762 + /* 2763 + * This sector has no error, just end the page read 2764 + * and unlock the range. 2765 + */ 2766 + uptodate = true; 2767 + goto next; 2768 + } 2769 + 2770 + ret = btrfs_repair_one_sector(inode, failed_bio, 2771 + bio_offset + offset, 2772 + page, pgoff + offset, start + offset, 2773 + failed_mirror, submit_bio_hook); 2774 + if (!ret) { 2775 + /* 2776 + * We have submitted the read repair, the page release 2777 + * will be handled by the endio function of the 2778 + * submitted repair bio. 2779 + * Thus we don't need to do any thing here. 
2780 + */ 2781 + continue; 2782 + } 2783 + /* 2784 + * Repair failed, just record the error but still continue. 2785 + * Or the remaining sectors will not be properly unlocked. 2786 + */ 2787 + if (!error) 2788 + error = ret; 2789 + next: 2790 + end_page_read(page, uptodate, start + offset, sectorsize); 2791 + if (uptodate) 2792 + set_extent_uptodate(&BTRFS_I(inode)->io_tree, 2793 + start + offset, 2794 + start + offset + sectorsize - 1, 2795 + &cached, GFP_ATOMIC); 2796 + unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, 2797 + start + offset, 2798 + start + offset + sectorsize - 1, 2799 + &cached); 2800 + } 2801 + return errno_to_blk_status(error); 2681 2802 } 2682 2803 2683 2804 /* lots and lots of room for performance fixes in the end_bio funcs */ 2684 2805 2685 2806 void end_extent_writepage(struct page *page, int err, u64 start, u64 end) 2686 2807 { 2808 + struct btrfs_inode *inode; 2687 2809 int uptodate = (err == 0); 2688 2810 int ret = 0; 2689 2811 2690 - btrfs_writepage_endio_finish_ordered(page, start, end, uptodate); 2812 + ASSERT(page && page->mapping); 2813 + inode = BTRFS_I(page->mapping->host); 2814 + btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); 2691 2815 2692 2816 if (!uptodate) { 2693 2817 ClearPageUptodate(page); ··· 2817 2747 struct page *page = bvec->bv_page; 2818 2748 struct inode *inode = page->mapping->host; 2819 2749 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2750 + const u32 sectorsize = fs_info->sectorsize; 2820 2751 2821 - /* We always issue full-page reads, but if some block 2822 - * in a page fails to read, blk_update_request() will 2823 - * advance bv_offset and adjust bv_len to compensate. 2824 - * Print a warning for nonzero offsets, and an error 2825 - * if they don't add up to a full page. 
*/ 2826 - if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { 2827 - if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) 2828 - btrfs_err(fs_info, 2829 - "partial page write in btrfs with offset %u and length %u", 2830 - bvec->bv_offset, bvec->bv_len); 2831 - else 2832 - btrfs_info(fs_info, 2833 - "incomplete page write in btrfs with offset %u and length %u", 2834 - bvec->bv_offset, bvec->bv_len); 2835 - } 2752 + /* Our read/write should always be sector aligned. */ 2753 + if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) 2754 + btrfs_err(fs_info, 2755 + "partial page write in btrfs with offset %u and length %u", 2756 + bvec->bv_offset, bvec->bv_len); 2757 + else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) 2758 + btrfs_info(fs_info, 2759 + "incomplete page write with offset %u and length %u", 2760 + bvec->bv_offset, bvec->bv_len); 2836 2761 2837 - start = page_offset(page); 2838 - end = start + bvec->bv_offset + bvec->bv_len - 1; 2762 + start = page_offset(page) + bvec->bv_offset; 2763 + end = start + bvec->bv_len - 1; 2839 2764 2840 2765 if (first_bvec) { 2841 2766 btrfs_record_physical_zoned(inode, start, bio); ··· 2838 2773 } 2839 2774 2840 2775 end_extent_writepage(page, error, start, end); 2841 - end_page_writeback(page); 2776 + 2777 + btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len); 2842 2778 } 2843 2779 2844 2780 bio_put(bio); ··· 2928 2862 btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); 2929 2863 } 2930 2864 2931 - static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) 2932 - { 2933 - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); 2934 - 2935 - ASSERT(page_offset(page) <= start && 2936 - start + len <= page_offset(page) + PAGE_SIZE); 2937 - 2938 - if (uptodate) { 2939 - btrfs_page_set_uptodate(fs_info, page, start, len); 2940 - } else { 2941 - btrfs_page_clear_uptodate(fs_info, page, start, len); 2942 - btrfs_page_set_error(fs_info, page, start, len); 2943 - } 2944 - 2945 - if 
(fs_info->sectorsize == PAGE_SIZE) 2946 - unlock_page(page); 2947 - else if (is_data_inode(page->mapping->host)) 2948 - /* 2949 - * For subpage data, unlock the page if we're the last reader. 2950 - * For subpage metadata, page lock is not utilized for read. 2951 - */ 2952 - btrfs_subpage_end_reader(fs_info, page, start, len); 2953 - } 2954 - 2955 2865 /* 2956 2866 * Find extent buffer for a given bytenr. 2957 2867 * ··· 2971 2929 static void end_bio_extent_readpage(struct bio *bio) 2972 2930 { 2973 2931 struct bio_vec *bvec; 2974 - int uptodate = !bio->bi_status; 2975 2932 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); 2976 2933 struct extent_io_tree *tree, *failure_tree; 2977 2934 struct processed_extent processed = { 0 }; ··· 2985 2944 2986 2945 ASSERT(!bio_flagged(bio, BIO_CLONED)); 2987 2946 bio_for_each_segment_all(bvec, bio, iter_all) { 2947 + bool uptodate = !bio->bi_status; 2988 2948 struct page *page = bvec->bv_page; 2989 2949 struct inode *inode = page->mapping->host; 2990 2950 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2991 2951 const u32 sectorsize = fs_info->sectorsize; 2952 + unsigned int error_bitmap = (unsigned int)-1; 2992 2953 u64 start; 2993 2954 u64 end; 2994 2955 u32 len; ··· 3025 2982 3026 2983 mirror = io_bio->mirror_num; 3027 2984 if (likely(uptodate)) { 3028 - if (is_data_inode(inode)) 3029 - ret = btrfs_verify_data_csum(io_bio,
errors the 3052 - * following way: If possible, new read requests are 3053 - * created and submitted and will end up in 3054 - * end_bio_extent_readpage as well (if we're lucky, 3055 - * not in the !uptodate case). In that case it returns 3056 - * 0 and we just go on with the next page in our bio. 3057 - * If it can't handle the error it will return -EIO and 3058 - * we remain responsible for that page. 3005 + * btrfs_submit_read_repair() will handle all the good 3006 + * and bad sectors, we just continue to the next bvec. 3059 3007 */ 3060 - if (!btrfs_submit_read_repair(inode, bio, bio_offset, 3061 - page, 3062 - start - page_offset(page), 3063 - start, end, mirror, 3064 - btrfs_submit_data_bio)) { 3065 - uptodate = !bio->bi_status; 3066 - ASSERT(bio_offset + len > bio_offset); 3067 - bio_offset += len; 3068 - continue; 3069 - } 3008 + submit_read_repair(inode, bio, bio_offset, page, 3009 + start - page_offset(page), start, 3010 + end, mirror, error_bitmap, 3011 + btrfs_submit_data_bio); 3012 + 3013 + ASSERT(bio_offset + len > bio_offset); 3014 + bio_offset += len; 3015 + continue; 3070 3016 } else { 3071 3017 struct extent_buffer *eb; 3072 3018 ··· 3187 3151 * 3188 3152 * Return true if successfully page added. Otherwise, return false. 
3189 3153 */ 3190 - static bool btrfs_bio_add_page(struct bio *bio, struct page *page, 3154 + static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, 3155 + struct page *page, 3191 3156 u64 disk_bytenr, unsigned int size, 3192 3157 unsigned int pg_offset, 3193 - unsigned long prev_bio_flags, 3194 3158 unsigned long bio_flags) 3195 3159 { 3160 + struct bio *bio = bio_ctrl->bio; 3161 + u32 bio_size = bio->bi_iter.bi_size; 3196 3162 const sector_t sector = disk_bytenr >> SECTOR_SHIFT; 3197 3163 bool contig; 3198 3164 int ret; 3199 3165 3200 - if (prev_bio_flags != bio_flags) 3166 + ASSERT(bio); 3167 + /* The limit should be calculated when bio_ctrl->bio is allocated */ 3168 + ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); 3169 + if (bio_ctrl->bio_flags != bio_flags) 3201 3170 return false; 3202 3171 3203 - if (prev_bio_flags & EXTENT_BIO_COMPRESSED) 3172 + if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) 3204 3173 contig = bio->bi_iter.bi_sector == sector; 3205 3174 else 3206 3175 contig = bio_end_sector(bio) == sector; 3207 3176 if (!contig) 3208 3177 return false; 3209 3178 3210 - if (btrfs_bio_fits_in_stripe(page, size, bio, bio_flags)) 3179 + if (bio_size + size > bio_ctrl->len_to_oe_boundary || 3180 + bio_size + size > bio_ctrl->len_to_stripe_boundary) 3211 3181 return false; 3212 3182 3213 - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 3214 - struct page *first_page = bio_first_bvec_all(bio)->bv_page; 3215 - 3216 - if (!btrfs_bio_fits_in_ordered_extent(first_page, bio, size)) 3217 - return false; 3183 + if (bio_op(bio) == REQ_OP_ZONE_APPEND) 3218 3184 ret = bio_add_zone_append_page(bio, page, size, pg_offset); 3219 - } else { 3185 + else 3220 3186 ret = bio_add_page(bio, page, size, pg_offset); 3221 - } 3222 3187 3223 3188 return ret == size; 3189 + } 3190 + 3191 + static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, 3192 + struct btrfs_inode *inode) 3193 + { 3194 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 
3195 + struct btrfs_io_geometry geom; 3196 + struct btrfs_ordered_extent *ordered; 3197 + struct extent_map *em; 3198 + u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT); 3199 + int ret; 3200 + 3201 + /* 3202 + * Pages for compressed extent are never submitted to disk directly, 3203 + * thus it has no real boundary, just set them to U32_MAX. 3204 + * 3205 + * The split happens for real compressed bio, which happens in 3206 + * btrfs_submit_compressed_read/write(). 3207 + */ 3208 + if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { 3209 + bio_ctrl->len_to_oe_boundary = U32_MAX; 3210 + bio_ctrl->len_to_stripe_boundary = U32_MAX; 3211 + return 0; 3212 + } 3213 + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); 3214 + if (IS_ERR(em)) 3215 + return PTR_ERR(em); 3216 + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio), 3217 + logical, &geom); 3218 + free_extent_map(em); 3219 + if (ret < 0) { 3220 + return ret; 3221 + } 3222 + if (geom.len > U32_MAX) 3223 + bio_ctrl->len_to_stripe_boundary = U32_MAX; 3224 + else 3225 + bio_ctrl->len_to_stripe_boundary = (u32)geom.len; 3226 + 3227 + if (!btrfs_is_zoned(fs_info) || 3228 + bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) { 3229 + bio_ctrl->len_to_oe_boundary = U32_MAX; 3230 + return 0; 3231 + } 3232 + 3233 + ASSERT(fs_info->max_zone_append_size > 0); 3234 + /* Ordered extent not yet created, so we're good */ 3235 + ordered = btrfs_lookup_ordered_extent(inode, logical); 3236 + if (!ordered) { 3237 + bio_ctrl->len_to_oe_boundary = U32_MAX; 3238 + return 0; 3239 + } 3240 + 3241 + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, 3242 + ordered->disk_bytenr + ordered->disk_num_bytes - logical); 3243 + btrfs_put_ordered_extent(ordered); 3244 + return 0; 3224 3245 } 3225 3246 3226 3247 /* ··· 3296 3203 */ 3297 3204 static int submit_extent_page(unsigned int opf, 3298 3205 struct writeback_control *wbc, 3206 + struct btrfs_bio_ctrl *bio_ctrl, 3299 3207 struct page *page, u64 disk_bytenr, 
3300 3208 size_t size, unsigned long pg_offset, 3301 - struct bio **bio_ret, 3302 3209 bio_end_io_t end_io_func, 3303 3210 int mirror_num, 3304 - unsigned long prev_bio_flags, 3305 3211 unsigned long bio_flags, 3306 3212 bool force_bio_submit) 3307 3213 { ··· 3311 3219 struct extent_io_tree *tree = &inode->io_tree; 3312 3220 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3313 3221 3314 - ASSERT(bio_ret); 3222 + ASSERT(bio_ctrl); 3315 3223 3316 - if (*bio_ret) { 3317 - bio = *bio_ret; 3224 + ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && 3225 + pg_offset + size <= PAGE_SIZE); 3226 + if (bio_ctrl->bio) { 3227 + bio = bio_ctrl->bio; 3318 3228 if (force_bio_submit || 3319 - !btrfs_bio_add_page(bio, page, disk_bytenr, io_size, 3320 - pg_offset, prev_bio_flags, bio_flags)) { 3321 - ret = submit_one_bio(bio, mirror_num, prev_bio_flags); 3322 - if (ret < 0) { 3323 - *bio_ret = NULL; 3229 + !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size, 3230 + pg_offset, bio_flags)) { 3231 + ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags); 3232 + bio_ctrl->bio = NULL; 3233 + if (ret < 0) 3324 3234 return ret; 3325 - } 3326 - bio = NULL; 3327 3235 } else { 3328 3236 if (wbc) 3329 3237 wbc_account_cgroup_owner(wbc, page, io_size); ··· 3346 3254 wbc_account_cgroup_owner(wbc, page, io_size); 3347 3255 } 3348 3256 if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { 3349 - struct extent_map *em; 3350 - struct map_lookup *map; 3257 + struct btrfs_device *device; 3351 3258 3352 - em = btrfs_get_chunk_map(fs_info, disk_bytenr, io_size); 3353 - if (IS_ERR(em)) 3354 - return PTR_ERR(em); 3259 + device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size); 3260 + if (IS_ERR(device)) 3261 + return PTR_ERR(device); 3355 3262 3356 - map = em->map_lookup; 3357 - /* We only support single profile for now */ 3358 - ASSERT(map->num_stripes == 1); 3359 - btrfs_io_bio(bio)->device = map->stripes[0].dev; 3360 - 3361 - free_extent_map(em); 3263 + 
btrfs_io_bio(bio)->device = device; 3362 3264 } 3363 3265 3364 - *bio_ret = bio; 3266 + bio_ctrl->bio = bio; 3267 + bio_ctrl->bio_flags = bio_flags; 3268 + ret = calc_bio_boundaries(bio_ctrl, inode); 3365 3269 3366 3270 return ret; 3367 3271 } ··· 3470 3382 * return 0 on success, otherwise return error 3471 3383 */ 3472 3384 int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, 3473 - struct bio **bio, unsigned long *bio_flags, 3385 + struct btrfs_bio_ctrl *bio_ctrl, 3474 3386 unsigned int read_flags, u64 *prev_em_start) 3475 3387 { 3476 3388 struct inode *inode = page->mapping->host; ··· 3646 3558 } 3647 3559 3648 3560 ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, 3649 - page, disk_bytenr, iosize, 3650 - pg_offset, bio, 3561 + bio_ctrl, page, disk_bytenr, iosize, 3562 + pg_offset, 3651 3563 end_bio_extent_readpage, 0, 3652 - *bio_flags, 3653 3564 this_bio_flag, 3654 3565 force_bio_submit); 3655 3566 if (!ret) { 3656 3567 nr++; 3657 - *bio_flags = this_bio_flag; 3658 3568 } else { 3659 3569 unlock_extent(tree, cur, cur + iosize - 1); 3660 3570 end_page_read(page, false, cur, iosize); ··· 3666 3580 } 3667 3581 3668 3582 static inline void contiguous_readpages(struct page *pages[], int nr_pages, 3669 - u64 start, u64 end, 3670 - struct extent_map **em_cached, 3671 - struct bio **bio, 3672 - unsigned long *bio_flags, 3673 - u64 *prev_em_start) 3583 + u64 start, u64 end, 3584 + struct extent_map **em_cached, 3585 + struct btrfs_bio_ctrl *bio_ctrl, 3586 + u64 *prev_em_start) 3674 3587 { 3675 3588 struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); 3676 3589 int index; ··· 3677 3592 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 3678 3593 3679 3594 for (index = 0; index < nr_pages; index++) { 3680 - btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, 3595 + btrfs_do_readpage(pages[index], em_cached, bio_ctrl, 3681 3596 REQ_RAHEAD, prev_em_start); 3682 3597 put_page(pages[index]); 3683 3598 } ··· 3765 3680 } 3766 
3681 3767 3682 /* 3683 + * Find the first byte we need to write. 3684 + * 3685 + * For subpage, one page can contain several sectors, and 3686 + * __extent_writepage_io() will just grab all extent maps in the page 3687 + * range and try to submit all non-inline/non-compressed extents. 3688 + * 3689 + * This is a big problem for subpage, we shouldn't re-submit already written 3690 + * data at all. 3691 + * This function will lookup subpage dirty bit to find which range we really 3692 + * need to submit. 3693 + * 3694 + * Return the next dirty range in [@start, @end). 3695 + * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE. 3696 + */ 3697 + static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, 3698 + struct page *page, u64 *start, u64 *end) 3699 + { 3700 + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 3701 + u64 orig_start = *start; 3702 + /* Declare as unsigned long so we can use bitmap ops */ 3703 + unsigned long dirty_bitmap; 3704 + unsigned long flags; 3705 + int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits; 3706 + int range_start_bit = nbits; 3707 + int range_end_bit; 3708 + 3709 + /* 3710 + * For regular sector size == page size case, since one page only 3711 + * contains one sector, we return the page offset directly. 
3712 + */ 3713 + if (fs_info->sectorsize == PAGE_SIZE) { 3714 + *start = page_offset(page); 3715 + *end = page_offset(page) + PAGE_SIZE; 3716 + return; 3717 + } 3718 + 3719 + /* We should have the page locked, but just in case */ 3720 + spin_lock_irqsave(&subpage->lock, flags); 3721 + dirty_bitmap = subpage->dirty_bitmap; 3722 + spin_unlock_irqrestore(&subpage->lock, flags); 3723 + 3724 + bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit, 3725 + BTRFS_SUBPAGE_BITMAP_SIZE); 3726 + *start = page_offset(page) + range_start_bit * fs_info->sectorsize; 3727 + *end = page_offset(page) + range_end_bit * fs_info->sectorsize; 3728 + } 3729 + 3730 + /* 3768 3731 * helper for __extent_writepage. This calls the writepage start hooks, 3769 3732 * and does the loop to map the page into extents and bios. 3770 3733 * ··· 3829 3696 int *nr_ret) 3830 3697 { 3831 3698 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3832 - struct extent_io_tree *tree = &inode->io_tree; 3833 3699 u64 start = page_offset(page); 3834 3700 u64 end = start + PAGE_SIZE - 1; 3835 3701 u64 cur = start; ··· 3859 3727 while (cur <= end) { 3860 3728 u64 disk_bytenr; 3861 3729 u64 em_end; 3730 + u64 dirty_range_start = cur; 3731 + u64 dirty_range_end; 3862 3732 u32 iosize; 3863 3733 3864 3734 if (cur >= i_size) { 3865 - btrfs_writepage_endio_finish_ordered(page, cur, end, 1); 3735 + btrfs_writepage_endio_finish_ordered(inode, page, cur, 3736 + end, 1); 3866 3737 break; 3867 3738 } 3739 + 3740 + find_next_dirty_byte(fs_info, page, &dirty_range_start, 3741 + &dirty_range_end); 3742 + if (cur < dirty_range_start) { 3743 + cur = dirty_range_start; 3744 + continue; 3745 + } 3746 + 3868 3747 em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); 3869 3748 if (IS_ERR_OR_NULL(em)) { 3870 - SetPageError(page); 3749 + btrfs_page_set_error(fs_info, page, cur, end - cur + 1); 3871 3750 ret = PTR_ERR_OR_ZERO(em); 3872 3751 break; 3873 3752 } ··· 3893 3750 compressed = 
test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 3894 3751 disk_bytenr = em->block_start + extent_offset; 3895 3752 3896 - /* Note that em_end from extent_map_end() is exclusive */ 3897 - iosize = min(em_end, end + 1) - cur; 3753 + /* 3754 + * Note that em_end from extent_map_end() and dirty_range_end from 3755 + * find_next_dirty_byte() are all exclusive 3756 + */ 3757 + iosize = min(min(em_end, end + 1), dirty_range_end) - cur; 3898 3758 3899 3759 if (btrfs_use_zone_append(inode, em->block_start)) 3900 3760 opf = REQ_OP_ZONE_APPEND; ··· 3914 3768 if (compressed) 3915 3769 nr++; 3916 3770 else 3917 - btrfs_writepage_endio_finish_ordered(page, cur, 3918 - cur + iosize - 1, 1); 3771 + btrfs_writepage_endio_finish_ordered(inode, 3772 + page, cur, cur + iosize - 1, 1); 3919 3773 cur += iosize; 3920 3774 continue; 3921 3775 } 3922 3776 3923 - btrfs_set_range_writeback(tree, cur, cur + iosize - 1); 3777 + btrfs_set_range_writeback(inode, cur, cur + iosize - 1); 3924 3778 if (!PageWriteback(page)) { 3925 3779 btrfs_err(inode->root->fs_info, 3926 3780 "page %lu not writeback, cur %llu end %llu", 3927 3781 page->index, cur, end); 3928 3782 } 3929 3783 3930 - ret = submit_extent_page(opf | write_flags, wbc, page, 3784 + /* 3785 + * Although the PageDirty bit is cleared before entering this 3786 + * function, subpage dirty bit is not cleared. 3787 + * So clear subpage dirty bit here so next time we won't submit 3788 + * page for range already written to disk. 
3789 + */ 3790 + btrfs_page_clear_dirty(fs_info, page, cur, iosize); 3791 + 3792 + ret = submit_extent_page(opf | write_flags, wbc, 3793 + &epd->bio_ctrl, page, 3931 3794 disk_bytenr, iosize, 3932 - cur - page_offset(page), &epd->bio, 3795 + cur - page_offset(page), 3933 3796 end_bio_extent_writepage, 3934 - 0, 0, 0, false); 3797 + 0, 0, false); 3935 3798 if (ret) { 3936 - SetPageError(page); 3799 + btrfs_page_set_error(fs_info, page, cur, iosize); 3937 3800 if (PageWriteback(page)) 3938 - end_page_writeback(page); 3801 + btrfs_page_clear_writeback(fs_info, page, cur, 3802 + iosize); 3939 3803 } 3940 3804 3941 3805 cur += iosize; ··· 4254 4098 * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() 4255 4099 * after all extent buffers in the page has finished their writeback. 4256 4100 */ 4257 - static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, 4258 - struct bio *bio) 4101 + static void end_bio_subpage_eb_writepage(struct bio *bio) 4259 4102 { 4103 + struct btrfs_fs_info *fs_info; 4260 4104 struct bio_vec *bvec; 4261 4105 struct bvec_iter_all iter_all; 4106 + 4107 + fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); 4108 + ASSERT(fs_info->sectorsize < PAGE_SIZE); 4262 4109 4263 4110 ASSERT(!bio_flagged(bio, BIO_CLONED)); 4264 4111 bio_for_each_segment_all(bvec, bio, iter_all) { ··· 4313 4154 4314 4155 static void end_bio_extent_buffer_writepage(struct bio *bio) 4315 4156 { 4316 - struct btrfs_fs_info *fs_info; 4317 4157 struct bio_vec *bvec; 4318 4158 struct extent_buffer *eb; 4319 4159 int done; 4320 4160 struct bvec_iter_all iter_all; 4321 - 4322 - fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); 4323 - if (fs_info->sectorsize < PAGE_SIZE) 4324 - return end_bio_subpage_eb_writepage(fs_info, bio); 4325 4161 4326 4162 ASSERT(!bio_flagged(bio, BIO_CLONED)); 4327 4163 bio_for_each_segment_all(bvec, bio, iter_all) { ··· 4343 4189 bio_put(bio); 4344 4190 } 4345 4191 4192 + static void 
prepare_eb_write(struct extent_buffer *eb) 4193 + { 4194 + u32 nritems; 4195 + unsigned long start; 4196 + unsigned long end; 4197 + 4198 + clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 4199 + atomic_set(&eb->io_pages, num_extent_pages(eb)); 4200 + 4201 + /* Set btree blocks beyond nritems with 0 to avoid stale content */ 4202 + nritems = btrfs_header_nritems(eb); 4203 + if (btrfs_header_level(eb) > 0) { 4204 + end = btrfs_node_key_ptr_offset(nritems); 4205 + memzero_extent_buffer(eb, end, eb->len - end); 4206 + } else { 4207 + /* 4208 + * Leaf: 4209 + * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 4210 + */ 4211 + start = btrfs_item_nr_offset(nritems); 4212 + end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); 4213 + memzero_extent_buffer(eb, start, end - start); 4214 + } 4215 + } 4216 + 4346 4217 /* 4347 4218 * Unlike the work in write_one_eb(), we rely completely on extent locking. 4348 4219 * Page locking is only utilized at minimum to keep the VMM code happy. 4349 - * 4350 - * Caller should still call write_one_eb() other than this function directly. 4351 - * As write_one_eb() has extra preparation before submitting the extent buffer. 
4352 4220 */ 4353 4221 static int write_one_subpage_eb(struct extent_buffer *eb, 4354 4222 struct writeback_control *wbc, ··· 4382 4206 bool no_dirty_ebs = false; 4383 4207 int ret; 4384 4208 4209 + prepare_eb_write(eb); 4210 + 4385 4211 /* clear_page_dirty_for_io() in subpage helper needs page locked */ 4386 4212 lock_page(page); 4387 4213 btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); ··· 4394 4216 if (no_dirty_ebs) 4395 4217 clear_page_dirty_for_io(page); 4396 4218 4397 - ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, 4398 - eb->start, eb->len, eb->start - page_offset(page), 4399 - &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, 4400 - false); 4219 + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, 4220 + &epd->bio_ctrl, page, eb->start, eb->len, 4221 + eb->start - page_offset(page), 4222 + end_bio_subpage_eb_writepage, 0, 0, false); 4401 4223 if (ret) { 4402 4224 btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); 4403 4225 set_btree_ioerr(page, eb); ··· 4422 4244 struct extent_page_data *epd) 4423 4245 { 4424 4246 u64 disk_bytenr = eb->start; 4425 - u32 nritems; 4426 4247 int i, num_pages; 4427 - unsigned long start, end; 4428 4248 unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; 4429 4249 int ret = 0; 4430 4250 4431 - clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); 4251 + prepare_eb_write(eb); 4252 + 4432 4253 num_pages = num_extent_pages(eb); 4433 - atomic_set(&eb->io_pages, num_pages); 4434 - 4435 - /* set btree blocks beyond nritems with 0 to avoid stale content. */ 4436 - nritems = btrfs_header_nritems(eb); 4437 - if (btrfs_header_level(eb) > 0) { 4438 - end = btrfs_node_key_ptr_offset(nritems); 4439 - 4440 - memzero_extent_buffer(eb, end, eb->len - end); 4441 - } else { 4442 - /* 4443 - * leaf: 4444 - * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 4445 - */ 4446 - start = btrfs_item_nr_offset(nritems); 4447 - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); 4448 - memzero_extent_buffer(eb, start, end - start); 4449 - } 4450 - 4451 - if (eb->fs_info->sectorsize < PAGE_SIZE) 4452 - return write_one_subpage_eb(eb, wbc, epd); 4453 - 4454 4254 for (i = 0; i < num_pages; i++) { 4455 4255 struct page *p = eb->pages[i]; 4456 4256 4457 4257 clear_page_dirty_for_io(p); 4458 4258 set_page_writeback(p); 4459 4259 ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, 4460 - p, disk_bytenr, PAGE_SIZE, 0, 4461 - &epd->bio, 4260 + &epd->bio_ctrl, p, disk_bytenr, 4261 + PAGE_SIZE, 0, 4462 4262 end_bio_extent_buffer_writepage, 4463 - 0, 0, 0, false); 4263 + 0, 0, false); 4464 4264 if (ret) { 4465 4265 set_btree_ioerr(p, eb); 4466 4266 if (PageWriteback(p)) ··· 4542 4386 free_extent_buffer(eb); 4543 4387 goto cleanup; 4544 4388 } 4545 - ret = write_one_eb(eb, wbc, epd); 4389 + ret = write_one_subpage_eb(eb, wbc, epd); 4546 4390 free_extent_buffer(eb); 4547 4391 if (ret < 0) 4548 4392 goto cleanup; ··· 4654 4498 { 4655 4499 struct extent_buffer *eb_context = NULL; 4656 4500 struct extent_page_data epd = { 4657 - .bio = NULL, 4501 + .bio_ctrl = { 0 }, 4658 4502 .extent_locked = 0, 4659 4503 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4660 4504 }; ··· 4936 4780 { 4937 4781 int ret; 4938 4782 struct extent_page_data epd = { 4939 - .bio = NULL, 4783 + .bio_ctrl = { 0 }, 4940 4784 .extent_locked = 0, 4941 4785 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 4942 4786 }; ··· 4963 4807 PAGE_SHIFT; 4964 4808 4965 4809 struct extent_page_data epd = { 4966 - .bio = NULL, 4810 + .bio_ctrl = { 0 }, 4967 4811 .extent_locked = 1, 4968 4812 .sync_io = mode == WB_SYNC_ALL, 4969 4813 }; ··· 4983 4827 if (clear_page_dirty_for_io(page)) 4984 4828 ret = __extent_writepage(page, &wbc_writepages, &epd); 4985 4829 else { 4986 - btrfs_writepage_endio_finish_ordered(page, start, 4987 - start + PAGE_SIZE - 1, 1); 4830 + 
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), 4831 + page, start, start + PAGE_SIZE - 1, 1); 4988 4832 unlock_page(page); 4989 4833 } 4990 4834 put_page(page); ··· 5006 4850 { 5007 4851 int ret = 0; 5008 4852 struct extent_page_data epd = { 5009 - .bio = NULL, 4853 + .bio_ctrl = { 0 }, 5010 4854 .extent_locked = 0, 5011 4855 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 5012 4856 }; ··· 5023 4867 5024 4868 void extent_readahead(struct readahead_control *rac) 5025 4869 { 5026 - struct bio *bio = NULL; 5027 - unsigned long bio_flags = 0; 4870 + struct btrfs_bio_ctrl bio_ctrl = { 0 }; 5028 4871 struct page *pagepool[16]; 5029 4872 struct extent_map *em_cached = NULL; 5030 4873 u64 prev_em_start = (u64)-1; ··· 5034 4879 u64 contig_end = contig_start + readahead_batch_length(rac) - 1; 5035 4880 5036 4881 contiguous_readpages(pagepool, nr, contig_start, contig_end, 5037 - &em_cached, &bio, &bio_flags, &prev_em_start); 4882 + &em_cached, &bio_ctrl, &prev_em_start); 5038 4883 } 5039 4884 5040 4885 if (em_cached) 5041 4886 free_extent_map(em_cached); 5042 4887 5043 - if (bio) { 5044 - if (submit_one_bio(bio, 0, bio_flags)) 4888 + if (bio_ctrl.bio) { 4889 + if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) 5045 4890 return; 5046 4891 } 5047 4892 } ··· 5584 5429 subpage = (struct btrfs_subpage *)page->private; 5585 5430 if (atomic_read(&subpage->eb_refs)) 5586 5431 return true; 5432 + /* 5433 + * Even there is no eb refs here, we may still have 5434 + * end_page_read() call relying on page::private. 5435 + */ 5436 + if (atomic_read(&subpage->readers)) 5437 + return true; 5587 5438 } 5588 5439 return false; 5589 5440 } ··· 5650 5489 5651 5490 /* 5652 5491 * We can only detach the page private if there are no other ebs in the 5653 - * page range. 5492 + * page range and no unfinished IO. 
5654 5493 */ 5655 5494 if (!page_range_has_eb(fs_info, page)) 5656 5495 btrfs_detach_subpage(fs_info, page); ··· 6337 6176 struct btrfs_fs_info *fs_info = eb->fs_info; 6338 6177 struct extent_io_tree *io_tree; 6339 6178 struct page *page = eb->pages[0]; 6340 - struct bio *bio = NULL; 6179 + struct btrfs_bio_ctrl bio_ctrl = { 0 }; 6341 6180 int ret = 0; 6342 6181 6343 6182 ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); ··· 6345 6184 io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; 6346 6185 6347 6186 if (wait == WAIT_NONE) { 6348 - ret = try_lock_extent(io_tree, eb->start, 6349 - eb->start + eb->len - 1); 6350 - if (ret <= 0) 6351 - return ret; 6187 + if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) 6188 + return -EAGAIN; 6352 6189 } else { 6353 6190 ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1); 6354 6191 if (ret < 0) ··· 6368 6209 check_buffer_tree_ref(eb); 6369 6210 btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); 6370 6211 6371 - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, page, eb->start, 6372 - eb->len, eb->start - page_offset(page), &bio, 6373 - end_bio_extent_readpage, mirror_num, 0, 0, 6212 + btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); 6213 + ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, 6214 + page, eb->start, eb->len, 6215 + eb->start - page_offset(page), 6216 + end_bio_extent_readpage, mirror_num, 0, 6374 6217 true); 6375 6218 if (ret) { 6376 6219 /* ··· 6382 6221 */ 6383 6222 atomic_dec(&eb->io_pages); 6384 6223 } 6385 - if (bio) { 6224 + if (bio_ctrl.bio) { 6386 6225 int tmp; 6387 6226 6388 - tmp = submit_one_bio(bio, mirror_num, 0); 6227 + tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); 6228 + bio_ctrl.bio = NULL; 6389 6229 if (tmp < 0) 6390 6230 return tmp; 6391 6231 } ··· 6409 6247 int all_uptodate = 1; 6410 6248 int num_pages; 6411 6249 unsigned long num_reads = 0; 6412 - struct bio *bio = NULL; 6413 - unsigned long bio_flags = 0; 6250 + 
struct btrfs_bio_ctrl bio_ctrl = { 0 }; 6414 6251 6415 6252 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 6416 6253 return 0; ··· 6473 6312 6474 6313 ClearPageError(page); 6475 6314 err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, 6476 - page, page_offset(page), PAGE_SIZE, 0, 6477 - &bio, end_bio_extent_readpage, 6478 - mirror_num, 0, 0, false); 6315 + &bio_ctrl, page, page_offset(page), 6316 + PAGE_SIZE, 0, end_bio_extent_readpage, 6317 + mirror_num, 0, false); 6479 6318 if (err) { 6480 6319 /* 6481 6320 * We failed to submit the bio so it's the ··· 6492 6331 } 6493 6332 } 6494 6333 6495 - if (bio) { 6496 - err = submit_one_bio(bio, mirror_num, bio_flags); 6334 + if (bio_ctrl.bio) { 6335 + err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); 6336 + bio_ctrl.bio = NULL; 6497 6337 if (err) 6498 6338 return err; 6499 6339 } ··· 6677 6515 char *kaddr; 6678 6516 6679 6517 assert_eb_page_uptodate(eb, eb->pages[0]); 6680 - kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); 6681 - memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, 6682 - BTRFS_FSID_SIZE); 6518 + kaddr = page_address(eb->pages[0]) + 6519 + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, 6520 + chunk_tree_uuid)); 6521 + memcpy(kaddr, srcv, BTRFS_FSID_SIZE); 6683 6522 } 6684 6523 6685 6524 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) ··· 6688 6525 char *kaddr; 6689 6526 6690 6527 assert_eb_page_uptodate(eb, eb->pages[0]); 6691 - kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); 6692 - memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, 6693 - BTRFS_FSID_SIZE); 6528 + kaddr = page_address(eb->pages[0]) + 6529 + get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid)); 6530 + memcpy(kaddr, srcv, BTRFS_FSID_SIZE); 6694 6531 } 6695 6532 6696 6533 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
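The `find_next_dirty_byte()` helper added in the extent_io.c hunks above consults the per-page subpage dirty bitmap (one bit per sector) so that `__extent_writepage_io()` never resubmits ranges already written to disk. As an editorial sketch of that bitmap walk, in standalone C with hypothetical names (this is not the kernel's `bitmap_next_set_region()` API, just the same idea):

```c
#include <stddef.h>
#include <stdint.h>

/*
 * Sketch of the subpage dirty-range lookup: given a dirty bitmap with
 * one bit per sector, find the next dirty [start, end) sector range at
 * or after `from`. When no dirty sector remains, both outputs equal
 * nsectors (the caller then advances past the page, as the hunk's
 * comment describes for *start).
 */
static void next_dirty_sector_range(unsigned long dirty_bitmap,
                                    unsigned int nsectors,
                                    unsigned int from,
                                    unsigned int *range_start,
                                    unsigned int *range_end)
{
    unsigned int i = from;

    /* Skip the leading clean sectors. */
    while (i < nsectors && !(dirty_bitmap & (1UL << i)))
        i++;
    *range_start = i;

    /* Extend over the contiguous dirty run. */
    while (i < nsectors && (dirty_bitmap & (1UL << i)))
        i++;
    *range_end = i;
}
```

For the regular sectorsize == PAGE_SIZE case the kernel short-circuits this entirely, since the page holds exactly one sector.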
+19 -10
fs/btrfs/extent_io.h
··· 39 39 /* Page starts writeback, clear dirty bit and set writeback bit */ 40 40 #define PAGE_START_WRITEBACK (1 << 1) 41 41 #define PAGE_END_WRITEBACK (1 << 2) 42 - #define PAGE_SET_PRIVATE2 (1 << 3) 42 + #define PAGE_SET_ORDERED (1 << 3) 43 43 #define PAGE_SET_ERROR (1 << 4) 44 44 #define PAGE_LOCK (1 << 5) 45 45 ··· 99 99 #ifdef CONFIG_BTRFS_DEBUG 100 100 struct list_head leak_list; 101 101 #endif 102 + }; 103 + 104 + /* 105 + * Structure to record info about the bio being assembled, and other info like 106 + * how many bytes are there before stripe/ordered extent boundary. 107 + */ 108 + struct btrfs_bio_ctrl { 109 + struct bio *bio; 110 + unsigned long bio_flags; 111 + u32 len_to_stripe_boundary; 112 + u32 len_to_oe_boundary; 102 113 }; 103 114 104 115 /* ··· 180 169 int __must_check submit_one_bio(struct bio *bio, int mirror_num, 181 170 unsigned long bio_flags); 182 171 int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, 183 - struct bio **bio, unsigned long *bio_flags, 172 + struct btrfs_bio_ctrl *bio_ctrl, 184 173 unsigned int read_flags, u64 *prev_em_start); 185 174 int extent_write_full_page(struct page *page, struct writeback_control *wbc); 186 175 int extent_write_locked_range(struct inode *inode, u64 start, u64 end, ··· 292 281 * When IO fails, either with EIO or csum verification fails, we 293 282 * try other mirrors that might have a good copy of the data. This 294 283 * io_failure_record is used to record state as we go through all the 295 - * mirrors. If another mirror has good data, the page is set up to date 284 + * mirrors. If another mirror has good data, the sector is set up to date 296 285 * and things continue. If a good mirror can't be found, the original 297 286 * bio end_io callback is called to indicate things have failed. 
298 287 */ ··· 304 293 unsigned long bio_flags; 305 294 int this_mirror; 306 295 int failed_mirror; 307 - int in_validation; 308 296 }; 309 297 310 - 311 - blk_status_t btrfs_submit_read_repair(struct inode *inode, 312 - struct bio *failed_bio, u32 bio_offset, 313 - struct page *page, unsigned int pgoff, 314 - u64 start, u64 end, int failed_mirror, 315 - submit_bio_hook_t *submit_bio_hook); 298 + int btrfs_repair_one_sector(struct inode *inode, 299 + struct bio *failed_bio, u32 bio_offset, 300 + struct page *page, unsigned int pgoff, 301 + u64 start, int failed_mirror, 302 + submit_bio_hook_t *submit_bio_hook); 316 303 317 304 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 318 305 bool find_lock_delalloc_range(struct inode *inode,
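The `struct btrfs_bio_ctrl` introduced in the header above carries two precomputed limits, `len_to_stripe_boundary` and `len_to_oe_boundary`, and `btrfs_bio_add_page()` refuses any page whose addition would push the bio past either one. A minimal editorial sketch of that check (hypothetical struct and function names, not the kernel types):

```c
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the boundary test in btrfs_bio_add_page(): a `size`-byte
 * page may join the bio only while the accumulated bio size stays
 * within both limits. calc_bio_boundaries() sets both limits to
 * U32_MAX for compressed bios (split happens later in the compressed
 * read/write path), which makes this check a no-op there.
 */
struct bio_ctrl_sketch {
    uint32_t bio_size;               /* current bio->bi_iter.bi_size */
    uint32_t len_to_stripe_boundary; /* bytes until the stripe edge */
    uint32_t len_to_oe_boundary;     /* bytes until the ordered extent edge */
};

static bool can_add_to_bio(const struct bio_ctrl_sketch *ctrl, uint32_t size)
{
    /* Widen to 64 bits so the additions cannot wrap. */
    if ((uint64_t)ctrl->bio_size + size > ctrl->len_to_oe_boundary)
        return false;
    if ((uint64_t)ctrl->bio_size + size > ctrl->len_to_stripe_boundary)
        return false;
    return true;
}
```

This is why `submit_extent_page()` can drop the old per-page `btrfs_bio_fits_in_stripe()` call: the limits are computed once when the bio is allocated and every later addition is a cheap comparison.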
+1 -1
fs/btrfs/file-item.c
··· 618 618 * @file_start: offset in file this bio begins to describe 619 619 * @contig: Boolean. If true/1 means all bio vecs in this bio are 620 620 * contiguous and they begin at @file_start in the file. False/0 621 - * means this bio can contains potentially discontigous bio vecs 621 + * means this bio can contain potentially discontiguous bio vecs 622 622 * so the logical offset of each should be calculated separately. 623 623 */ 624 624 blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
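The corrected comment above distinguishes contiguous bios, where every bio vec's file position follows directly from `@file_start`, from discontiguous ones whose logical offsets must be computed per vec. For the contiguous case the arithmetic is just an offset plus the bytes already described; a trivial sketch (hypothetical names, not a kernel helper):

```c
#include <stdint.h>

/*
 * For a contiguous bio, the file offset of the i-th block is
 * file_start plus the bytes that precede it in the bio.
 */
static uint64_t contig_block_file_offset(uint64_t file_start,
                                         uint32_t blocksize,
                                         uint32_t block_index)
{
    return file_start + (uint64_t)blocksize * block_index;
}
```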
+28 -16
fs/btrfs/file.c
··· 28 28 #include "compression.h" 29 29 #include "delalloc-space.h" 30 30 #include "reflink.h" 31 + #include "subpage.h" 31 32 32 33 static struct kmem_cache *btrfs_inode_defrag_cachep; 33 34 /* ··· 483 482 start_pos = round_down(pos, fs_info->sectorsize); 484 483 num_bytes = round_up(write_bytes + pos - start_pos, 485 484 fs_info->sectorsize); 485 + ASSERT(num_bytes <= U32_MAX); 486 486 487 487 end_of_last_block = start_pos + num_bytes - 1; 488 488 ··· 502 500 503 501 for (i = 0; i < num_pages; i++) { 504 502 struct page *p = pages[i]; 505 - SetPageUptodate(p); 503 + 504 + btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); 506 505 ClearPageChecked(p); 507 - set_page_dirty(p); 506 + btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); 508 507 } 509 508 510 509 /* ··· 2486 2483 const u64 lockend, 2487 2484 struct extent_state **cached_state) 2488 2485 { 2486 + /* 2487 + * For subpage case, if the range is not at page boundary, we could 2488 + * have pages at the leading/tailing part of the range. 2489 + * This could lead to dead loop since filemap_range_has_page() 2490 + * will always return true. 2491 + * So here we need to do extra page alignment for 2492 + * filemap_range_has_page(). 
2493 + */ 2494 + const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); 2495 + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; 2496 + 2489 2497 while (1) { 2490 2498 struct btrfs_ordered_extent *ordered; 2491 2499 int ret; ··· 2517 2503 (ordered->file_offset + ordered->num_bytes <= lockstart || 2518 2504 ordered->file_offset > lockend)) && 2519 2505 !filemap_range_has_page(inode->i_mapping, 2520 - lockstart, lockend)) { 2506 + page_lockstart, page_lockend)) { 2521 2507 if (ordered) 2522 2508 btrfs_put_ordered_extent(ordered); 2523 2509 break; ··· 3048 3034 */ 3049 3035 static int add_falloc_range(struct list_head *head, u64 start, u64 len) 3050 3036 { 3051 - struct falloc_range *prev = NULL; 3052 3037 struct falloc_range *range = NULL; 3053 3038 3054 - if (list_empty(head)) 3055 - goto insert; 3056 - 3057 - /* 3058 - * As fallocate iterate by bytenr order, we only need to check 3059 - * the last range. 3060 - */ 3061 - prev = list_entry(head->prev, struct falloc_range, list); 3062 - if (prev->start + prev->len == start) { 3063 - prev->len += len; 3064 - return 0; 3039 + if (!list_empty(head)) { 3040 + /* 3041 + * As fallocate iterates by bytenr order, we only need to check 3042 + * the last range. 3043 + */ 3044 + range = list_last_entry(head, struct falloc_range, list); 3045 + if (range->start + range->len == start) { 3046 + range->len += len; 3047 + return 0; 3048 + } 3065 3049 } 3066 - insert: 3050 + 3067 3051 range = kmalloc(sizeof(*range), GFP_KERNEL); 3068 3052 if (!range) 3069 3053 return -ENOMEM;
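The subpage fix in the file.c hunk above shrinks the range passed to `filemap_range_has_page()` to whole pages: a partial page at either end of the range would always be found, so the wait loop could never terminate. The `page_lockstart`/`page_lockend` rounding can be sketched as:

```c
#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096ULL /* assumption: 4K pages for illustration */

/*
 * Mirror of the alignment in the hunk above:
 *   page_lockstart = round_up(lockstart, PAGE_SIZE)
 *   page_lockend   = round_down(lockend + 1, PAGE_SIZE) - 1
 * If no full page lies inside [lockstart, lockend], the result is an
 * empty range (end < start) and filemap_range_has_page() has nothing
 * to report.
 */
static void clamp_to_page_boundaries(uint64_t lockstart, uint64_t lockend,
                                     uint64_t *page_lockstart,
                                     uint64_t *page_lockend)
{
    *page_lockstart = (lockstart + SKETCH_PAGE_SIZE - 1) &
                      ~(SKETCH_PAGE_SIZE - 1);
    *page_lockend = ((lockend + 1) & ~(SKETCH_PAGE_SIZE - 1)) - 1;
}
```

With sectorsize == PAGE_SIZE the locked range is already page aligned and the rounding changes nothing; only subpage filesystems see a smaller range.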
+1 -1
fs/btrfs/free-space-cache.c
··· 327 327 * need to check for -EAGAIN. 328 328 */ 329 329 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 330 - 0, BTRFS_EXTENT_DATA_KEY); 330 + 0, BTRFS_EXTENT_DATA_KEY, NULL); 331 331 if (ret) 332 332 goto fail; 333 333
+286 -235
fs/btrfs/inode.c
··· 51 51 #include "block-group.h" 52 52 #include "space-info.h" 53 53 #include "zoned.h" 54 + #include "subpage.h" 54 55 55 56 struct btrfs_iget_args { 56 57 u64 ino; ··· 167 166 struct page *page; 168 167 169 168 while (index <= end_index) { 169 + /* 170 + * For locked page, we will call end_extent_writepage() on it 171 + * in run_delalloc_range() for the error handling. That 172 + * end_extent_writepage() function will call 173 + * btrfs_mark_ordered_io_finished() to clear page Ordered and 174 + * run the ordered extent accounting. 175 + * 176 + * Here we can't just clear the Ordered bit, or 177 + * btrfs_mark_ordered_io_finished() would skip the accounting 178 + * for the page range, and the ordered extent will never finish. 179 + */ 180 + if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { 181 + index++; 182 + continue; 183 + } 170 184 page = find_get_page(inode->vfs_inode.i_mapping, index); 171 185 index++; 172 186 if (!page) 173 187 continue; 174 - ClearPagePrivate2(page); 188 + 189 + /* 190 + * Here we just clear all Ordered bits for every page in the 191 + * range, then __endio_write_update_ordered() will handle 192 + * the ordered extent accounting for the range. 
193 + */ 194 + btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, 195 + offset, bytes); 175 196 put_page(page); 176 197 } 177 198 199 + /* The locked page covers the full range, nothing needs to be done */ 200 + if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) 201 + return; 178 202 /* 179 203 * In case this page belongs to the delalloc range being instantiated 180 204 * then skip it, since the first page of a range is going to be 181 205 * properly cleaned up by the caller of run_delalloc_range 182 206 */ 183 207 if (page_start >= offset && page_end <= (offset + bytes - 1)) { 184 - offset += PAGE_SIZE; 185 - bytes -= PAGE_SIZE; 208 + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; 209 + offset = page_offset(locked_page) + PAGE_SIZE; 186 210 } 187 211 188 212 return __endio_write_update_ordered(inode, offset, bytes, false); ··· 629 603 * inode has not been flagged as nocompress. This flag can 630 604 * change at any time if we discover bad compression ratios. 631 605 */ 632 - if (inode_need_compress(BTRFS_I(inode), start, end)) { 606 + if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) { 633 607 WARN_ON(pages); 634 608 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); 635 609 if (!pages) { ··· 972 946 const u64 end = start + async_extent->ram_size - 1; 973 947 974 948 p->mapping = inode->vfs_inode.i_mapping; 975 - btrfs_writepage_endio_finish_ordered(p, start, end, 0); 949 + btrfs_writepage_endio_finish_ordered(inode, p, start, 950 + end, 0); 976 951 977 952 p->mapping = NULL; 978 953 extent_clear_unlock_delalloc(inode, start, end, NULL, 0, ··· 1091 1064 * our outstanding extent for clearing delalloc for this 1092 1065 * range. 
1093 1066 */ 1094 - extent_clear_unlock_delalloc(inode, start, end, NULL, 1067 + extent_clear_unlock_delalloc(inode, start, end, 1068 + locked_page, 1095 1069 EXTENT_LOCKED | EXTENT_DELALLOC | 1096 1070 EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | 1097 1071 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | ··· 1100 1072 *nr_written = *nr_written + 1101 1073 (end - start + PAGE_SIZE) / PAGE_SIZE; 1102 1074 *page_started = 1; 1075 + /* 1076 + * locked_page is locked by the caller of 1077 + * writepage_delalloc(), not locked by 1078 + * __process_pages_contig(). 1079 + * 1080 + * We can't let __process_pages_contig() to unlock it, 1081 + * as it doesn't have any subpage::writers recorded. 1082 + * 1083 + * Here we manually unlock the page, since the caller 1084 + * can't use page_started to determine if it's an 1085 + * inline extent or a compressed extent. 1086 + */ 1087 + unlock_page(locked_page); 1103 1088 goto out; 1104 1089 } else if (ret < 0) { 1105 1090 goto out_unlock; ··· 1191 1150 1192 1151 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1193 1152 1194 - /* we're not doing compressed IO, don't unlock the first 1195 - * page (which the caller expects to stay locked), don't 1196 - * clear any dirty bits and don't set any writeback bits 1153 + /* 1154 + * We're not doing compressed IO, don't unlock the first page 1155 + * (which the caller expects to stay locked), don't clear any 1156 + * dirty bits and don't set any writeback bits 1197 1157 * 1198 - * Do set the Private2 bit so we know this page was properly 1199 - * setup for writepage 1158 + * Do set the Ordered (Private2) bit so we know this page was 1159 + * properly setup for writepage. 1200 1160 */ 1201 1161 page_ops = unlock ? 
PAGE_UNLOCK : 0; 1202 - page_ops |= PAGE_SET_PRIVATE2; 1162 + page_ops |= PAGE_SET_ORDERED; 1203 1163 1204 1164 extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, 1205 1165 locked_page, ··· 1864 1822 locked_page, EXTENT_LOCKED | 1865 1823 EXTENT_DELALLOC | 1866 1824 EXTENT_CLEAR_DATA_RESV, 1867 - PAGE_UNLOCK | PAGE_SET_PRIVATE2); 1825 + PAGE_UNLOCK | PAGE_SET_ORDERED); 1868 1826 1869 1827 cur_offset = extent_end; 1870 1828 ··· 2235 2193 struct inode *inode = page->mapping->host; 2236 2194 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2237 2195 u64 logical = bio->bi_iter.bi_sector << 9; 2196 + u32 bio_len = bio->bi_iter.bi_size; 2238 2197 struct extent_map *em; 2239 - u64 length = 0; 2240 - u64 map_length; 2241 2198 int ret = 0; 2242 2199 struct btrfs_io_geometry geom; 2243 2200 2244 2201 if (bio_flags & EXTENT_BIO_COMPRESSED) 2245 2202 return 0; 2246 2203 2247 - length = bio->bi_iter.bi_size; 2248 - map_length = length; 2249 - em = btrfs_get_chunk_map(fs_info, logical, map_length); 2204 + em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); 2250 2205 if (IS_ERR(em)) 2251 2206 return PTR_ERR(em); 2252 - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, 2253 - map_length, &geom); 2207 + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); 2254 2208 if (ret < 0) 2255 2209 goto out; 2256 2210 2257 - if (geom.len < length + size) 2211 + if (geom.len < bio_len + size) 2258 2212 ret = 1; 2259 2213 out: 2260 2214 free_extent_map(em); ··· 2269 2231 u64 dio_file_offset) 2270 2232 { 2271 2233 return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0); 2272 - } 2273 - 2274 - bool btrfs_bio_fits_in_ordered_extent(struct page *page, struct bio *bio, 2275 - unsigned int size) 2276 - { 2277 - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 2278 - struct btrfs_fs_info *fs_info = inode->root->fs_info; 2279 - struct btrfs_ordered_extent *ordered; 2280 - u64 len = bio->bi_iter.bi_size + size; 2281 - bool ret = 
true; 2282 - 2283 - ASSERT(btrfs_is_zoned(fs_info)); 2284 - ASSERT(fs_info->max_zone_append_size > 0); 2285 - ASSERT(bio_op(bio) == REQ_OP_ZONE_APPEND); 2286 - 2287 - /* Ordered extent not yet created, so we're good */ 2288 - ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); 2289 - if (!ordered) 2290 - return ret; 2291 - 2292 - if ((bio->bi_iter.bi_sector << SECTOR_SHIFT) + len > 2293 - ordered->disk_bytenr + ordered->disk_num_bytes) 2294 - ret = false; 2295 - 2296 - btrfs_put_ordered_extent(ordered); 2297 - 2298 - return ret; 2299 2234 } 2300 2235 2301 2236 static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, ··· 2612 2601 lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); 2613 2602 2614 2603 /* already ordered? We're done */ 2615 - if (PagePrivate2(page)) 2604 + if (PageOrdered(page)) 2616 2605 goto out_reserved; 2617 2606 2618 2607 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); ··· 2687 2676 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 2688 2677 struct btrfs_writepage_fixup *fixup; 2689 2678 2690 - /* this page is properly in the ordered list */ 2691 - if (TestClearPagePrivate2(page)) 2679 + /* This page has ordered extent covering it already */ 2680 + if (PageOrdered(page)) 2692 2681 return 0; 2693 2682 2694 2683 /* ··· 2784 2773 /* 2785 2774 * If we dropped an inline extent here, we know the range where it is 2786 2775 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the 2787 - * number of bytes only for that range contaning the inline extent. 2776 + * number of bytes only for that range containing the inline extent. 2788 2777 * The remaining of the range will be processed when clearning the 2789 2778 * EXTENT_DELALLOC_BIT bit through the ordered extent completion. 
2790 2779 */ ··· 3080 3069 btrfs_finish_ordered_io(ordered_extent); 3081 3070 } 3082 3071 3083 - void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, 3072 + void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, 3073 + struct page *page, u64 start, 3084 3074 u64 end, int uptodate) 3085 3075 { 3086 - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 3087 - struct btrfs_fs_info *fs_info = inode->root->fs_info; 3088 - struct btrfs_ordered_extent *ordered_extent = NULL; 3089 - struct btrfs_workqueue *wq; 3076 + trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); 3090 3077 3091 - trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 3092 - 3093 - ClearPagePrivate2(page); 3094 - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 3095 - end - start + 1, uptodate)) 3096 - return; 3097 - 3098 - if (btrfs_is_free_space_inode(inode)) 3099 - wq = fs_info->endio_freespace_worker; 3100 - else 3101 - wq = fs_info->endio_write_workers; 3102 - 3103 - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); 3104 - btrfs_queue_work(wq, &ordered_extent->work); 3078 + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, 3079 + finish_ordered_fn, uptodate); 3105 3080 } 3106 3081 3107 3082 /* ··· 3149 3152 * @bio_offset: offset to the beginning of the bio (in bytes) 3150 3153 * @start: file offset of the range start 3151 3154 * @end: file offset of the range end (inclusive) 3155 + * 3156 + * Return a bitmap where bit set means a csum mismatch, and bit not set means 3157 + * csum match. 
3152 3158 */ 3153 - int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, 3154 - struct page *page, u64 start, u64 end) 3159 + unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, 3160 + struct page *page, u64 start, u64 end) 3155 3161 { 3156 3162 struct inode *inode = page->mapping->host; 3157 3163 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3158 3164 struct btrfs_root *root = BTRFS_I(inode)->root; 3159 3165 const u32 sectorsize = root->fs_info->sectorsize; 3160 3166 u32 pg_off; 3167 + unsigned int result = 0; 3161 3168 3162 3169 if (PageChecked(page)) { 3163 3170 ClearPageChecked(page); ··· 3189 3188 3190 3189 ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, 3191 3190 page_offset(page) + pg_off); 3192 - if (ret < 0) 3193 - return -EIO; 3191 + if (ret < 0) { 3192 + const int nr_bit = (pg_off - offset_in_page(start)) >> 3193 + root->fs_info->sectorsize_bits; 3194 + 3195 + result |= (1U << nr_bit); 3196 + } 3194 3197 } 3195 - return 0; 3198 + return result; 3196 3199 } 3197 3200 3198 3201 /* ··· 4114 4109 * This is a placeholder inode for a subvolume we didn't have a 4115 4110 * reference to at the time of the snapshot creation. In the meantime 4116 4111 * we could have renamed the real subvol link into our snapshot, so 4117 - * depending on btrfs_del_root_ref to return -ENOENT here is incorret. 4112 + * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. 4118 4113 * Instead simply lookup the dir_index_item for this entry so we can 4119 4114 * remove it. Otherwise we know we have a ref to the root and we can 4120 4115 * call btrfs_del_root_ref, and it _shouldn't_ fail. ··· 4469 4464 #define NEED_TRUNCATE_BLOCK 1 4470 4465 4471 4466 /* 4472 - * this can truncate away extent items, csum items and directory items. 4473 - * It starts at a high offset and removes keys until it can't find 4474 - * any higher than new_size 4467 + * Remove inode items from a given root. 
4475 4468 * 4476 - * csum items that cross the new i_size are truncated to the new size 4477 - * as well. 4469 + * @trans: A transaction handle. 4470 + * @root: The root from which to remove items. 4471 + * @inode: The inode whose items we want to remove. 4472 + * @new_size: The new i_size for the inode. This is only applicable when 4473 + * @min_type is BTRFS_EXTENT_DATA_KEY, must be 0 otherwise. 4474 + * @min_type: The minimum key type to remove. All keys with a type 4475 + * greater than this value are removed and all keys with 4476 + * this type are removed only if their offset is >= @new_size. 4477 + * @extents_found: Output parameter that will contain the number of file 4478 + * extent items that were removed or adjusted to the new 4479 + * inode i_size. The caller is responsible for initializing 4480 + * the counter. Also, it can be NULL if the caller does not 4481 + * need this counter. 4478 4482 * 4479 - * min_type is the minimum key type to truncate down to. If set to 0, this 4480 - * will kill all the items on this inode, including the INODE_ITEM_KEY. 4483 + * Remove all keys associated with the inode from the given root that have a key 4484 + * with a type greater than or equals to @min_type. When @min_type has a value of 4485 + * BTRFS_EXTENT_DATA_KEY, only remove file extent items that have an offset value 4486 + * greater than or equals to @new_size. If a file extent item that starts before 4487 + * @new_size and ends after it is found, its length is adjusted. 4488 + * 4489 + * Returns: 0 on success, < 0 on error and NEED_TRUNCATE_BLOCK when @min_type is 4490 + * BTRFS_EXTENT_DATA_KEY and the caller must truncate the last block. 
4481 4491 */ 4482 4492 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, 4483 4493 struct btrfs_root *root, 4484 4494 struct btrfs_inode *inode, 4485 - u64 new_size, u32 min_type) 4495 + u64 new_size, u32 min_type, 4496 + u64 *extents_found) 4486 4497 { 4487 4498 struct btrfs_fs_info *fs_info = root->fs_info; 4488 4499 struct btrfs_path *path; ··· 4643 4622 /* FIXME, shrink the extent if the ref count is only 1 */ 4644 4623 if (found_type != BTRFS_EXTENT_DATA_KEY) 4645 4624 goto delete; 4625 + 4626 + if (extents_found != NULL) 4627 + (*extents_found)++; 4646 4628 4647 4629 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 4648 4630 u64 num_dec; ··· 4965 4941 flush_dcache_page(page); 4966 4942 } 4967 4943 ClearPageChecked(page); 4968 - set_page_dirty(page); 4944 + btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); 4969 4945 unlock_extent_cached(io_tree, block_start, block_end, &cached_state); 4970 4946 4971 4947 if (only_release_metadata) ··· 5479 5455 trans->block_rsv = rsv; 5480 5456 5481 5457 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 5482 - 0, 0); 5458 + 0, 0, NULL); 5483 5459 trans->block_rsv = &fs_info->trans_block_rsv; 5484 5460 btrfs_end_transaction(trans); 5485 5461 btrfs_btree_balance_dirty(fs_info); ··· 7961 7937 btrfs_ino(BTRFS_I(inode)), 7962 7938 pgoff); 7963 7939 } else { 7964 - blk_status_t status; 7940 + int ret; 7965 7941 7966 7942 ASSERT((start - io_bio->logical) < UINT_MAX); 7967 - status = btrfs_submit_read_repair(inode, 7968 - &io_bio->bio, 7969 - start - io_bio->logical, 7970 - bvec.bv_page, pgoff, 7971 - start, 7972 - start + sectorsize - 1, 7973 - io_bio->mirror_num, 7974 - submit_dio_repair_bio); 7975 - if (status) 7976 - err = status; 7943 + ret = btrfs_repair_one_sector(inode, 7944 + &io_bio->bio, 7945 + start - io_bio->logical, 7946 + bvec.bv_page, pgoff, 7947 + start, io_bio->mirror_num, 7948 + submit_dio_repair_bio); 7949 + if (ret) 7950 + err = errno_to_blk_status(ret); 7977 
7951 } 7978 7952 start += sectorsize; 7979 7953 ASSERT(bio_offset + sectorsize > bio_offset); ··· 7986 7964 const u64 offset, const u64 bytes, 7987 7965 const bool uptodate) 7988 7966 { 7989 - struct btrfs_fs_info *fs_info = inode->root->fs_info; 7990 - struct btrfs_ordered_extent *ordered = NULL; 7991 - struct btrfs_workqueue *wq; 7992 - u64 ordered_offset = offset; 7993 - u64 ordered_bytes = bytes; 7994 - u64 last_offset; 7995 - 7996 - if (btrfs_is_free_space_inode(inode)) 7997 - wq = fs_info->endio_freespace_worker; 7998 - else 7999 - wq = fs_info->endio_write_workers; 8000 - 8001 - while (ordered_offset < offset + bytes) { 8002 - last_offset = ordered_offset; 8003 - if (btrfs_dec_test_first_ordered_pending(inode, &ordered, 8004 - &ordered_offset, 8005 - ordered_bytes, 8006 - uptodate)) { 8007 - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, 8008 - NULL); 8009 - btrfs_queue_work(wq, &ordered->work); 8010 - } 8011 - 8012 - /* No ordered extent found in the range, exit */ 8013 - if (ordered_offset == last_offset) 8014 - return; 8015 - /* 8016 - * Our bio might span multiple ordered extents. In this case 8017 - * we keep going until we have accounted the whole dio. 
8018 - */ 8019 - if (ordered_offset < offset + bytes) { 8020 - ordered_bytes = offset + bytes - ordered_offset; 8021 - ordered = NULL; 8022 - } 8023 - } 7967 + btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, 7968 + finish_ordered_fn, uptodate); 8024 7969 } 8025 7970 8026 7971 static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, ··· 8161 8172 goto out_err_em; 8162 8173 } 8163 8174 ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio), 8164 - logical, submit_len, &geom); 8175 + logical, &geom); 8165 8176 if (ret) { 8166 8177 status = errno_to_blk_status(ret); 8167 8178 goto out_err_em; ··· 8265 8276 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 8266 8277 u64 start = page_offset(page); 8267 8278 u64 end = start + PAGE_SIZE - 1; 8268 - unsigned long bio_flags = 0; 8269 - struct bio *bio = NULL; 8279 + struct btrfs_bio_ctrl bio_ctrl = { 0 }; 8270 8280 int ret; 8271 8281 8272 8282 btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); 8273 8283 8274 - ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); 8275 - if (bio) 8276 - ret = submit_one_bio(bio, 0, bio_flags); 8284 + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); 8285 + if (bio_ctrl.bio) 8286 + ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); 8277 8287 return ret; 8278 8288 } 8279 8289 ··· 8341 8353 if (page_has_private(page)) 8342 8354 attach_page_private(newpage, detach_page_private(page)); 8343 8355 8344 - if (PagePrivate2(page)) { 8345 - ClearPagePrivate2(page); 8346 - SetPagePrivate2(newpage); 8356 + if (PageOrdered(page)) { 8357 + ClearPageOrdered(page); 8358 + SetPageOrdered(newpage); 8347 8359 } 8348 8360 8349 8361 if (mode != MIGRATE_SYNC_NO_COPY) ··· 8358 8370 unsigned int length) 8359 8371 { 8360 8372 struct btrfs_inode *inode = BTRFS_I(page->mapping->host); 8373 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 8361 8374 struct extent_io_tree *tree = &inode->io_tree; 8362 - struct btrfs_ordered_extent *ordered; 
8363 8375 struct extent_state *cached_state = NULL; 8364 8376 u64 page_start = page_offset(page); 8365 8377 u64 page_end = page_start + PAGE_SIZE - 1; 8366 - u64 start; 8367 - u64 end; 8378 + u64 cur; 8368 8379 int inode_evicting = inode->vfs_inode.i_state & I_FREEING; 8369 - bool found_ordered = false; 8370 - bool completed_ordered = false; 8371 8380 8372 8381 /* 8373 - * we have the page locked, so new writeback can't start, 8374 - * and the dirty bit won't be cleared while we are here. 8382 + * We have page locked so no new ordered extent can be created on this 8383 + * page, nor bio can be submitted for this page. 8375 8384 * 8376 - * Wait for IO on this page so that we can safely clear 8377 - * the PagePrivate2 bit and do ordered accounting 8385 + * But already submitted bio can still be finished on this page. 8386 + * Furthermore, endio function won't skip page which has Ordered 8387 + * (Private2) already cleared, so it's possible for endio and 8388 + * invalidatepage to do the same ordered extent accounting twice 8389 + * on one page. 8390 + * 8391 + * So here we wait for any submitted bios to finish, so that we won't 8392 + * do double ordered extent accounting on the same page. 8378 8393 */ 8379 8394 wait_on_page_writeback(page); 8380 8395 8381 - if (offset) { 8396 + /* 8397 + * For subpage case, we have call sites like 8398 + * btrfs_punch_hole_lock_range() which passes range not aligned to 8399 + * sectorsize. 8400 + * If the range doesn't cover the full page, we don't need to and 8401 + * shouldn't clear page extent mapped, as page->private can still 8402 + * record subpage dirty bits for other part of the range. 8403 + * 8404 + * For cases that can invalidate the full even the range doesn't 8405 + * cover the full page, like invalidating the last page, we're 8406 + * still safe to wait for ordered extent to finish. 
8407 + */ 8408 + if (!(offset == 0 && length == PAGE_SIZE)) { 8382 8409 btrfs_releasepage(page, GFP_NOFS); 8383 8410 return; 8384 8411 } ··· 8401 8398 if (!inode_evicting) 8402 8399 lock_extent_bits(tree, page_start, page_end, &cached_state); 8403 8400 8404 - start = page_start; 8405 - again: 8406 - ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); 8407 - if (ordered) { 8408 - found_ordered = true; 8409 - end = min(page_end, 8410 - ordered->file_offset + ordered->num_bytes - 1); 8401 + cur = page_start; 8402 + while (cur < page_end) { 8403 + struct btrfs_ordered_extent *ordered; 8404 + bool delete_states; 8405 + u64 range_end; 8406 + u32 range_len; 8407 + 8408 + ordered = btrfs_lookup_first_ordered_range(inode, cur, 8409 + page_end + 1 - cur); 8410 + if (!ordered) { 8411 + range_end = page_end; 8412 + /* 8413 + * No ordered extent covering this range, we are safe 8414 + * to delete all extent states in the range. 8415 + */ 8416 + delete_states = true; 8417 + goto next; 8418 + } 8419 + if (ordered->file_offset > cur) { 8420 + /* 8421 + * There is a range between [cur, oe->file_offset) not 8422 + * covered by any ordered extent. 8423 + * We are safe to delete all extent states, and handle 8424 + * the ordered extent in the next iteration. 8425 + */ 8426 + range_end = ordered->file_offset - 1; 8427 + delete_states = true; 8428 + goto next; 8429 + } 8430 + 8431 + range_end = min(ordered->file_offset + ordered->num_bytes - 1, 8432 + page_end); 8433 + ASSERT(range_end + 1 - cur < U32_MAX); 8434 + range_len = range_end + 1 - cur; 8435 + if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) { 8436 + /* 8437 + * If Ordered (Private2) is cleared, it means endio has 8438 + * already been executed for the range. 8439 + * We can't delete the extent states as 8440 + * btrfs_finish_ordered_io() may still use some of them. 
8441 + */ 8442 + delete_states = false; 8443 + goto next; 8444 + } 8445 + btrfs_page_clear_ordered(fs_info, page, cur, range_len); 8446 + 8411 8447 /* 8412 8448 * IO on this page will never be started, so we need to account 8413 8449 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW 8414 8450 * here, must leave that up for the ordered extent completion. 8451 + * 8452 + * This will also unlock the range for incoming 8453 + * btrfs_finish_ordered_io(). 8415 8454 */ 8416 8455 if (!inode_evicting) 8417 - clear_extent_bit(tree, start, end, 8456 + clear_extent_bit(tree, cur, range_end, 8418 8457 EXTENT_DELALLOC | 8419 8458 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | 8420 8459 EXTENT_DEFRAG, 1, 0, &cached_state); 8421 - /* 8422 - * whoever cleared the private bit is responsible 8423 - * for the finish_ordered_io 8424 - */ 8425 - if (TestClearPagePrivate2(page)) { 8426 - spin_lock_irq(&inode->ordered_tree.lock); 8427 - set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8428 - ordered->truncated_len = min(ordered->truncated_len, 8429 - start - ordered->file_offset); 8430 - spin_unlock_irq(&inode->ordered_tree.lock); 8431 8460 8432 - if (btrfs_dec_test_ordered_pending(inode, &ordered, 8433 - start, 8434 - end - start + 1, 1)) { 8435 - btrfs_finish_ordered_io(ordered); 8436 - completed_ordered = true; 8437 - } 8461 + spin_lock_irq(&inode->ordered_tree.lock); 8462 + set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); 8463 + ordered->truncated_len = min(ordered->truncated_len, 8464 + cur - ordered->file_offset); 8465 + spin_unlock_irq(&inode->ordered_tree.lock); 8466 + 8467 + if (btrfs_dec_test_ordered_pending(inode, &ordered, 8468 + cur, range_end + 1 - cur, 1)) { 8469 + btrfs_finish_ordered_io(ordered); 8470 + /* 8471 + * The ordered extent has finished, now we're again 8472 + * safe to delete all extent states of the range. 
8473 + */ 8474 + delete_states = true; 8475 + } else { 8476 + /* 8477 + * btrfs_finish_ordered_io() will get executed by endio 8478 + * of other pages, thus we can't delete extent states 8479 + * anymore 8480 + */ 8481 + delete_states = false; 8438 8482 } 8439 - btrfs_put_ordered_extent(ordered); 8483 + next: 8484 + if (ordered) 8485 + btrfs_put_ordered_extent(ordered); 8486 + /* 8487 + * Qgroup reserved space handler 8488 + * Sector(s) here will be either: 8489 + * 8490 + * 1) Already written to disk or bio already finished 8491 + * Then its QGROUP_RESERVED bit in io_tree is already cleared. 8492 + * Qgroup will be handled by its qgroup_record then. 8493 + * btrfs_qgroup_free_data() call will do nothing here. 8494 + * 8495 + * 2) Not written to disk yet 8496 + * Then btrfs_qgroup_free_data() call will clear the 8497 + * QGROUP_RESERVED bit of its io_tree, and free the qgroup 8498 + * reserved data space. 8499 + * Since the IO will never happen for this page. 8500 + */ 8501 + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); 8440 8502 if (!inode_evicting) { 8441 - cached_state = NULL; 8442 - lock_extent_bits(tree, start, end, 8443 - &cached_state); 8444 - } 8445 - 8446 - start = end + 1; 8447 - if (start < page_end) 8448 - goto again; 8449 - } 8450 - 8451 - /* 8452 - * Qgroup reserved space handler 8453 - * Page here will be either 8454 - * 1) Already written to disk or ordered extent already submitted 8455 - * Then its QGROUP_RESERVED bit in io_tree is already cleaned. 8456 - * Qgroup will be handled by its qgroup_record then. 8457 - * btrfs_qgroup_free_data() call will do nothing here. 8458 - * 8459 - * 2) Not written to disk yet 8460 - * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED 8461 - * bit of its io_tree, and free the qgroup reserved data space. 8462 - * Since the IO will never happen for this page. 
8463 - */ 8464 - btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); 8465 - if (!inode_evicting) { 8466 - bool delete = true; 8467 - 8468 - /* 8469 - * If there's an ordered extent for this range and we have not 8470 - * finished it ourselves, we must leave EXTENT_DELALLOC_NEW set 8471 - * in the range for the ordered extent completion. We must also 8472 - * not delete the range, otherwise we would lose that bit (and 8473 - * any other bits set in the range). Make sure EXTENT_UPTODATE 8474 - * is cleared if we don't delete, otherwise it can lead to 8475 - * corruptions if the i_size is extented later. 8476 - */ 8477 - if (found_ordered && !completed_ordered) 8478 - delete = false; 8479 - clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | 8503 + clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | 8480 8504 EXTENT_DELALLOC | EXTENT_UPTODATE | 8481 8505 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 8482 - delete, &cached_state); 8483 - 8484 - __btrfs_releasepage(page, GFP_NOFS); 8506 + delete_states, &cached_state); 8507 + } 8508 + cur = range_end + 1; 8485 8509 } 8486 - 8510 + /* 8511 + * We have iterated through all ordered extents of the page, the page 8512 + * should not have Ordered (Private2) anymore, or the above iteration 8513 + * did something wrong. 
8514 + */ 8515 + ASSERT(!PageOrdered(page)); 8516 + if (!inode_evicting) 8517 + __btrfs_releasepage(page, GFP_NOFS); 8487 8518 ClearPageChecked(page); 8488 8519 clear_page_extent_mapped(page); 8489 8520 } ··· 8663 8626 flush_dcache_page(page); 8664 8627 } 8665 8628 ClearPageChecked(page); 8666 - set_page_dirty(page); 8667 - SetPageUptodate(page); 8629 + btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); 8630 + btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); 8668 8631 8669 8632 btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); 8670 8633 ··· 8698 8661 struct btrfs_trans_handle *trans; 8699 8662 u64 mask = fs_info->sectorsize - 1; 8700 8663 u64 min_size = btrfs_calc_metadata_size(fs_info, 1); 8664 + u64 extents_found = 0; 8701 8665 8702 8666 if (!skip_writeback) { 8703 8667 ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), ··· 8756 8718 min_size, false); 8757 8719 BUG_ON(ret); 8758 8720 8759 - /* 8760 - * So if we truncate and then write and fsync we normally would just 8761 - * write the extents that changed, which is a problem if we need to 8762 - * first truncate that entire inode. So set this flag so we write out 8763 - * all of the extents in the inode to the sync log so we're completely 8764 - * safe. 
8765 - */ 8766 - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 8767 8721 trans->block_rsv = rsv; 8768 8722 8769 8723 while (1) { 8770 8724 ret = btrfs_truncate_inode_items(trans, root, BTRFS_I(inode), 8771 8725 inode->i_size, 8772 - BTRFS_EXTENT_DATA_KEY); 8726 + BTRFS_EXTENT_DATA_KEY, 8727 + &extents_found); 8773 8728 trans->block_rsv = &fs_info->trans_block_rsv; 8774 8729 if (ret != -ENOSPC && ret != -EAGAIN) 8775 8730 break; ··· 8824 8793 } 8825 8794 out: 8826 8795 btrfs_free_block_rsv(fs_info, rsv); 8796 + /* 8797 + * So if we truncate and then write and fsync we normally would just 8798 + * write the extents that changed, which is a problem if we need to 8799 + * first truncate that entire inode. So set this flag so we write out 8800 + * all of the extents in the inode to the sync log so we're completely 8801 + * safe. 8802 + * 8803 + * If no extents were dropped or trimmed we don't need to force the next 8804 + * fsync to truncate all the inode's items from the log and re-log them 8805 + * all. This means the truncate operation did not change the file size, 8806 + * or changed it to a smaller size but there was only an implicit hole 8807 + * between the old i_size and the new i_size, and there were no prealloc 8808 + * extents beyond i_size to drop. 
8809 + */ 8810 + if (extents_found > 0) 8811 + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); 8827 8812 8828 8813 return ret; 8829 8814 } ··· 10246 10199 return ret; 10247 10200 } 10248 10201 10249 - void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 10202 + void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) 10250 10203 { 10251 - struct inode *inode = tree->private_data; 10204 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 10252 10205 unsigned long index = start >> PAGE_SHIFT; 10253 10206 unsigned long end_index = end >> PAGE_SHIFT; 10254 10207 struct page *page; 10208 + u32 len; 10255 10209 10210 + ASSERT(end + 1 - start <= U32_MAX); 10211 + len = end + 1 - start; 10256 10212 while (index <= end_index) { 10257 - page = find_get_page(inode->i_mapping, index); 10213 + page = find_get_page(inode->vfs_inode.i_mapping, index); 10258 10214 ASSERT(page); /* Pages should be in the extent_io_tree */ 10259 - set_page_writeback(page); 10215 + 10216 + btrfs_page_set_writeback(fs_info, page, start, len); 10260 10217 put_page(page); 10261 10218 index++; 10262 10219 }
+139 -47
fs/btrfs/ioctl.c
··· 353 353 return ret; 354 354 } 355 355 356 + /* 357 + * Start exclusive operation @type, return true on success 358 + */ 356 359 bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, 357 360 enum btrfs_exclusive_operation type) 358 361 { 359 - return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); 362 + bool ret = false; 363 + 364 + spin_lock(&fs_info->super_lock); 365 + if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) { 366 + fs_info->exclusive_operation = type; 367 + ret = true; 368 + } 369 + spin_unlock(&fs_info->super_lock); 370 + 371 + return ret; 372 + } 373 + 374 + /* 375 + * Conditionally allow to enter the exclusive operation in case it's compatible 376 + * with the running one. This must be paired with btrfs_exclop_start_unlock and 377 + * btrfs_exclop_finish. 378 + * 379 + * Compatibility: 380 + * - the same type is already running 381 + * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller 382 + * must check the condition first that would allow none -> @type 383 + */ 384 + bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, 385 + enum btrfs_exclusive_operation type) 386 + { 387 + spin_lock(&fs_info->super_lock); 388 + if (fs_info->exclusive_operation == type) 389 + return true; 390 + 391 + spin_unlock(&fs_info->super_lock); 392 + return false; 393 + } 394 + 395 + void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info) 396 + { 397 + spin_unlock(&fs_info->super_lock); 360 398 } 361 399 362 400 void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) 363 401 { 402 + spin_lock(&fs_info->super_lock); 364 403 WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); 404 + spin_unlock(&fs_info->super_lock); 365 405 sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); 366 406 } 367 407 ··· 1495 1455 if (btrfs_defrag_cancelled(fs_info)) { 1496 1456 btrfs_debug(fs_info, "defrag_file cancelled"); 1497 1457 ret = -EAGAIN; 1498 - break; 1458 + goto error; 1499 1459 } 1500 
1460 1501 1461 if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, ··· 1573 1533 } 1574 1534 } 1575 1535 1536 + ret = defrag_count; 1537 + error: 1576 1538 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { 1577 1539 filemap_flush(inode->i_mapping); 1578 1540 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, ··· 1588 1546 btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); 1589 1547 } 1590 1548 1591 - ret = defrag_count; 1592 - 1593 1549 out_ra: 1594 1550 if (do_compress) { 1595 1551 btrfs_inode_lock(inode, 0); ··· 1598 1558 kfree(ra); 1599 1559 kfree(pages); 1600 1560 return ret; 1561 + } 1562 + 1563 + /* 1564 + * Try to start exclusive operation @type or cancel it if it's running. 1565 + * 1566 + * Return: 1567 + * 0 - normal mode, newly claimed op started 1568 + * >0 - normal mode, something else is running, 1569 + * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space 1570 + * ECANCELED - cancel mode, successful cancel 1571 + * ENOTCONN - cancel mode, operation not running anymore 1572 + */ 1573 + static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, 1574 + enum btrfs_exclusive_operation type, bool cancel) 1575 + { 1576 + if (!cancel) { 1577 + /* Start normal op */ 1578 + if (!btrfs_exclop_start(fs_info, type)) 1579 + return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 1580 + /* Exclusive operation is now claimed */ 1581 + return 0; 1582 + } 1583 + 1584 + /* Cancel running op */ 1585 + if (btrfs_exclop_start_try_lock(fs_info, type)) { 1586 + /* 1587 + * This blocks any exclop finish from setting it to NONE, so we 1588 + * request cancellation. Either it runs and we will wait for it, 1589 + * or it has finished and no waiting will happen. 
1590 + */ 1591 + atomic_inc(&fs_info->reloc_cancel_req); 1592 + btrfs_exclop_start_unlock(fs_info); 1593 + 1594 + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) 1595 + wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING, 1596 + TASK_INTERRUPTIBLE); 1597 + 1598 + return -ECANCELED; 1599 + } 1600 + 1601 + /* Something else is running or none */ 1602 + return -ENOTCONN; 1601 1603 } 1602 1604 1603 1605 static noinline int btrfs_ioctl_resize(struct file *file, ··· 1659 1577 char *devstr = NULL; 1660 1578 int ret = 0; 1661 1579 int mod = 0; 1580 + bool cancel; 1662 1581 1663 1582 if (!capable(CAP_SYS_ADMIN)) 1664 1583 return -EPERM; ··· 1668 1585 if (ret) 1669 1586 return ret; 1670 1587 1671 - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { 1672 - mnt_drop_write_file(file); 1673 - return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 1674 - } 1675 - 1588 + /* 1589 + * Read the arguments before checking exclusivity to be able to 1590 + * distinguish regular resize and cancel 1591 + */ 1676 1592 vol_args = memdup_user(arg, sizeof(*vol_args)); 1677 1593 if (IS_ERR(vol_args)) { 1678 1594 ret = PTR_ERR(vol_args); 1679 - goto out; 1595 + goto out_drop; 1680 1596 } 1681 - 1682 1597 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1683 - 1684 1598 sizestr = vol_args->name; 1599 + cancel = (strcmp("cancel", sizestr) == 0); 1600 + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel); 1601 + if (ret) 1602 + goto out_free; 1603 + /* Exclusive operation is now claimed */ 1604 + 1685 1605 devstr = strchr(sizestr, ':'); 1686 1606 if (devstr) { 1687 1607 sizestr = devstr + 1; ··· 1692 1606 devstr = vol_args->name; 1693 1607 ret = kstrtoull(devstr, 10, &devid); 1694 1608 if (ret) 1695 - goto out_free; 1609 + goto out_finish; 1696 1610 if (!devid) { 1697 1611 ret = -EINVAL; 1698 - goto out_free; 1612 + goto out_finish; 1699 1613 } 1700 1614 btrfs_info(fs_info, "resizing devid %llu", devid); 1701 1615 } ··· 1705 1619 btrfs_info(fs_info, "resizer unable to find device 
%llu", 1706 1620 devid); 1707 1621 ret = -ENODEV; 1708 - goto out_free; 1622 + goto out_finish; 1709 1623 } 1710 1624 1711 1625 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { ··· 1713 1627 "resizer unable to apply on readonly device %llu", 1714 1628 devid); 1715 1629 ret = -EPERM; 1716 - goto out_free; 1630 + goto out_finish; 1717 1631 } 1718 1632 1719 1633 if (!strcmp(sizestr, "max")) ··· 1729 1643 new_size = memparse(sizestr, &retptr); 1730 1644 if (*retptr != '\0' || new_size == 0) { 1731 1645 ret = -EINVAL; 1732 - goto out_free; 1646 + goto out_finish; 1733 1647 } 1734 1648 } 1735 1649 1736 1650 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1737 1651 ret = -EPERM; 1738 - goto out_free; 1652 + goto out_finish; 1739 1653 } 1740 1654 1741 1655 old_size = btrfs_device_get_total_bytes(device); ··· 1743 1657 if (mod < 0) { 1744 1658 if (new_size > old_size) { 1745 1659 ret = -EINVAL; 1746 - goto out_free; 1660 + goto out_finish; 1747 1661 } 1748 1662 new_size = old_size - new_size; 1749 1663 } else if (mod > 0) { 1750 1664 if (new_size > ULLONG_MAX - old_size) { 1751 1665 ret = -ERANGE; 1752 - goto out_free; 1666 + goto out_finish; 1753 1667 } 1754 1668 new_size = old_size + new_size; 1755 1669 } 1756 1670 1757 1671 if (new_size < SZ_256M) { 1758 1672 ret = -EINVAL; 1759 - goto out_free; 1673 + goto out_finish; 1760 1674 } 1761 1675 if (new_size > device->bdev->bd_inode->i_size) { 1762 1676 ret = -EFBIG; 1763 - goto out_free; 1677 + goto out_finish; 1764 1678 } 1765 1679 1766 1680 new_size = round_down(new_size, fs_info->sectorsize); ··· 1769 1683 trans = btrfs_start_transaction(root, 0); 1770 1684 if (IS_ERR(trans)) { 1771 1685 ret = PTR_ERR(trans); 1772 - goto out_free; 1686 + goto out_finish; 1773 1687 } 1774 1688 ret = btrfs_grow_device(trans, device, new_size); 1775 1689 btrfs_commit_transaction(trans); ··· 1782 1696 "resize device %s (devid %llu) from %llu to %llu", 1783 1697 rcu_str_deref(device->name), device->devid, 1784 
1698 old_size, new_size); 1699 + out_finish: 1700 + btrfs_exclop_finish(fs_info); 1785 1701 out_free: 1786 1702 kfree(vol_args); 1787 - out: 1788 - btrfs_exclop_finish(fs_info); 1703 + out_drop: 1789 1704 mnt_drop_write_file(file); 1790 1705 return ret; 1791 1706 } ··· 2984 2897 err = PTR_ERR(subvol_name_ptr); 2985 2898 goto free_parent; 2986 2899 } 2987 - /* subvol_name_ptr is already NULL termined */ 2900 + /* subvol_name_ptr is already nul terminated */ 2988 2901 subvol_name = (char *)kbasename(subvol_name_ptr); 2989 2902 } 2990 2903 } else { ··· 3206 3119 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3207 3120 struct btrfs_ioctl_vol_args_v2 *vol_args; 3208 3121 int ret; 3122 + bool cancel = false; 3209 3123 3210 3124 if (!capable(CAP_SYS_ADMIN)) 3211 3125 return -EPERM; ··· 3225 3137 ret = -EOPNOTSUPP; 3226 3138 goto out; 3227 3139 } 3140 + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; 3141 + if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) && 3142 + strcmp("cancel", vol_args->name) == 0) 3143 + cancel = true; 3228 3144 3229 - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { 3230 - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 3145 + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, 3146 + cancel); 3147 + if (ret) 3231 3148 goto out; 3232 - } 3149 + /* Exclusive operation is now claimed */ 3233 3150 3234 - if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { 3151 + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) 3235 3152 ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); 3236 - } else { 3237 - vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; 3153 + else 3238 3154 ret = btrfs_rm_device(fs_info, vol_args->name, 0); 3239 - } 3155 + 3240 3156 btrfs_exclop_finish(fs_info); 3241 3157 3242 3158 if (!ret) { ··· 3264 3172 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 3265 3173 struct btrfs_ioctl_vol_args *vol_args; 3266 3174 int ret; 3175 + bool cancel; 3267 3176 3268 3177 if (!capable(CAP_SYS_ADMIN)) 3269 3178 return -EPERM; 
··· 3273 3180 if (ret) 3274 3181 return ret; 3275 3182 3276 - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { 3277 - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 3278 - goto out_drop_write; 3279 - } 3280 - 3281 3183 vol_args = memdup_user(arg, sizeof(*vol_args)); 3282 3184 if (IS_ERR(vol_args)) { 3283 3185 ret = PTR_ERR(vol_args); 3284 - goto out; 3186 + goto out_drop_write; 3187 + } 3188 + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 3189 + cancel = (strcmp("cancel", vol_args->name) == 0); 3190 + 3191 + ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, 3192 + cancel); 3193 + if (ret == 0) { 3194 + ret = btrfs_rm_device(fs_info, vol_args->name, 0); 3195 + if (!ret) 3196 + btrfs_info(fs_info, "disk deleted %s", vol_args->name); 3197 + btrfs_exclop_finish(fs_info); 3285 3198 } 3286 3199 3287 - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 3288 - ret = btrfs_rm_device(fs_info, vol_args->name, 0); 3289 - 3290 - if (!ret) 3291 - btrfs_info(fs_info, "disk deleted %s", vol_args->name); 3292 3200 kfree(vol_args); 3293 - out: 3294 - btrfs_exclop_finish(fs_info); 3295 3201 out_drop_write: 3296 3202 mnt_drop_write_file(file); 3297 3203 ··· 3643 3551 goto out; 3644 3552 } 3645 3553 transid = trans->transid; 3646 - ret = btrfs_commit_transaction_async(trans, 0); 3554 + ret = btrfs_commit_transaction_async(trans); 3647 3555 if (ret) { 3648 3556 btrfs_end_transaction(trans); 3649 3557 return ret;
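The ioctl changes above all lean on the new exclusive-operation helpers: `btrfs_exclop_start()` claims the slot only when it is free, while `btrfs_exclop_start_try_lock()` admits a caller only when the *same* operation type is already running, returning with the lock still held so the state cannot drop to NONE before the caller (e.g. the cancel path) acts. A minimal userspace sketch of that pattern, with a pthread mutex standing in for `fs_info->super_lock` (all names here are illustrative, not the kernel API):

```c
#include <pthread.h>
#include <stdbool.h>

enum excl_op { EXCL_NONE, EXCL_RESIZE, EXCL_DEV_REMOVE };

struct fs_state {
	pthread_mutex_t lock;
	enum excl_op exclusive_operation;
};

/* Claim the exclusive slot only if no operation is running */
bool exclop_start(struct fs_state *fs, enum excl_op type)
{
	bool ret = false;

	pthread_mutex_lock(&fs->lock);
	if (fs->exclusive_operation == EXCL_NONE) {
		fs->exclusive_operation = type;
		ret = true;
	}
	pthread_mutex_unlock(&fs->lock);
	return ret;
}

/*
 * Admit the caller only when the same type is already running.
 * On success the lock is still held; pair with exclop_start_unlock()
 * so nobody can move the state to NONE in between.
 */
bool exclop_start_try_lock(struct fs_state *fs, enum excl_op type)
{
	pthread_mutex_lock(&fs->lock);
	if (fs->exclusive_operation == type)
		return true;
	pthread_mutex_unlock(&fs->lock);
	return false;
}

void exclop_start_unlock(struct fs_state *fs)
{
	pthread_mutex_unlock(&fs->lock);
}

/* Release the slot so the next operation can start */
void exclop_finish(struct fs_state *fs)
{
	pthread_mutex_lock(&fs->lock);
	fs->exclusive_operation = EXCL_NONE;
	pthread_mutex_unlock(&fs->lock);
}
```

The cmpxchg the patch removes could claim the slot atomically but could not keep the state pinned while a second caller inspected it; holding the lock across `try_lock`/`unlock` is what makes the cancel request race-free.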
+2 -2
fs/btrfs/locking.c
··· 57 57 /* 58 58 * Try-lock for read. 59 59 * 60 - * Retrun 1 if the rwlock has been taken, 0 otherwise 60 + * Return 1 if the rwlock has been taken, 0 otherwise 61 61 */ 62 62 int btrfs_try_tree_read_lock(struct extent_buffer *eb) 63 63 { ··· 72 72 /* 73 73 * Try-lock for write. 74 74 * 75 - * Retrun 1 if the rwlock has been taken, 0 otherwise 75 + * Return 1 if the rwlock has been taken, 0 otherwise 76 76 */ 77 77 int btrfs_try_tree_write_lock(struct extent_buffer *eb) 78 78 {
+196 -59
fs/btrfs/ordered-data.c
··· 16 16 #include "compression.h" 17 17 #include "delalloc-space.h" 18 18 #include "qgroup.h" 19 + #include "subpage.h" 19 20 20 21 static struct kmem_cache *btrfs_ordered_extent_cache; 21 22 ··· 301 300 } 302 301 303 302 /* 304 - * Finish IO for one ordered extent across a given range. The range can 305 - * contain several ordered extents. 303 + * Mark all ordered extents io inside the specified range finished. 306 304 * 307 - * @found_ret: Return the finished ordered extent 308 - * @file_offset: File offset for the finished IO 309 - * Will also be updated to one byte past the range that is 310 - * recordered as finished. This allows caller to walk forward. 311 - * @io_size: Length of the finish IO range 312 - * @uptodate: If the IO finished without problem 305 + * @page: The involved page for the operation. 306 + * For uncompressed buffered IO, the page status also needs to be 307 + * updated to indicate whether the pending ordered io is finished. 308 + * Can be NULL for direct IO and compressed write. 309 + * For these cases, callers are ensured they won't execute the 310 + * endio function twice. 311 + * @finish_func: The function to be executed when all the IO of an ordered 312 + * extent are finished. 313 313 * 314 - * Return true if any ordered extent is finished in the range, and update 315 - * @found_ret and @file_offset. 316 - * Return false otherwise. 317 - * 318 - * NOTE: Although The range can cross multiple ordered extents, only one 319 - * ordered extent will be updated during one call. The caller is responsible to 320 - * iterate all ordered extents in the range. 314 + * This function is called for endio, thus the range must have ordered 315 + * extent(s) covering it.
321 316 */ 322 - bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, 323 - struct btrfs_ordered_extent **finished_ret, 324 - u64 *file_offset, u64 io_size, int uptodate) 317 + void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, 318 + struct page *page, u64 file_offset, 319 + u64 num_bytes, btrfs_func_t finish_func, 320 + bool uptodate) 325 321 { 326 - struct btrfs_fs_info *fs_info = inode->root->fs_info; 327 322 struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; 323 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 324 + struct btrfs_workqueue *wq; 328 325 struct rb_node *node; 329 326 struct btrfs_ordered_extent *entry = NULL; 330 - bool finished = false; 331 327 unsigned long flags; 332 - u64 dec_end; 333 - u64 dec_start; 334 - u64 to_dec; 328 + u64 cur = file_offset; 329 + 330 + if (btrfs_is_free_space_inode(inode)) 331 + wq = fs_info->endio_freespace_worker; 332 + else 333 + wq = fs_info->endio_write_workers; 334 + 335 + if (page) 336 + ASSERT(page->mapping && page_offset(page) <= file_offset && 337 + file_offset + num_bytes <= page_offset(page) + PAGE_SIZE); 335 338 336 339 spin_lock_irqsave(&tree->lock, flags); 337 - node = tree_search(tree, *file_offset); 338 - if (!node) 339 - goto out; 340 + while (cur < file_offset + num_bytes) { 341 + u64 entry_end; 342 + u64 end; 343 + u32 len; 340 344 341 - entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 342 - if (!in_range(*file_offset, entry->file_offset, entry->num_bytes)) 343 - goto out; 345 + node = tree_search(tree, cur); 346 + /* No ordered extents at all */ 347 + if (!node) 348 + break; 344 349 345 - dec_start = max(*file_offset, entry->file_offset); 346 - dec_end = min(*file_offset + io_size, 347 - entry->file_offset + entry->num_bytes); 348 - *file_offset = dec_end; 349 - if (dec_start > dec_end) { 350 - btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu", 351 - dec_start, dec_end); 352 - } 353 - to_dec = dec_end - dec_start; 354 - if (to_dec > 
entry->bytes_left) { 355 - btrfs_crit(fs_info, 356 - "bad ordered accounting left %llu size %llu", 357 - entry->bytes_left, to_dec); 358 - } 359 - entry->bytes_left -= to_dec; 360 - if (!uptodate) 361 - set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 362 - 363 - if (entry->bytes_left == 0) { 350 + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 351 + entry_end = entry->file_offset + entry->num_bytes; 364 352 /* 365 - * Ensure only one caller can set the flag and finished_ret 366 - * accordingly 353 + * |<-- OE --->| | 354 + * cur 355 + * Go to next OE. 367 356 */ 368 - finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 369 - /* test_and_set_bit implies a barrier */ 370 - cond_wake_up_nomb(&entry->wait); 371 - } 372 - out: 373 - if (finished && finished_ret && entry) { 374 - *finished_ret = entry; 375 - refcount_inc(&entry->refs); 357 + if (cur >= entry_end) { 358 + node = rb_next(node); 359 + /* No more ordered extents, exit */ 360 + if (!node) 361 + break; 362 + entry = rb_entry(node, struct btrfs_ordered_extent, 363 + rb_node); 364 + 365 + /* Go to next ordered extent and continue */ 366 + cur = entry->file_offset; 367 + continue; 368 + } 369 + /* 370 + * | |<--- OE --->| 371 + * cur 372 + * Go to the start of OE. 373 + */ 374 + if (cur < entry->file_offset) { 375 + cur = entry->file_offset; 376 + continue; 377 + } 378 + 379 + /* 380 + * Now we are definitely inside one ordered extent. 381 + * 382 + * |<--- OE --->| 383 + * | 384 + * cur 385 + */ 386 + end = min(entry->file_offset + entry->num_bytes, 387 + file_offset + num_bytes) - 1; 388 + ASSERT(end + 1 - cur < U32_MAX); 389 + len = end + 1 - cur; 390 + 391 + if (page) { 392 + /* 393 + * Ordered (Private2) bit indicates whether we still 394 + * have pending io unfinished for the ordered extent. 395 + * 396 + * If there's no such bit, we need to skip to next range. 
397 + */ 398 + if (!btrfs_page_test_ordered(fs_info, page, cur, len)) { 399 + cur += len; 400 + continue; 401 + } 402 + btrfs_page_clear_ordered(fs_info, page, cur, len); 403 + } 404 + 405 + /* Now we're fine to update the accounting */ 406 + if (unlikely(len > entry->bytes_left)) { 407 + WARN_ON(1); 408 + btrfs_crit(fs_info, 409 + "bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu", 410 + inode->root->root_key.objectid, 411 + btrfs_ino(inode), 412 + entry->file_offset, 413 + entry->num_bytes, 414 + len, entry->bytes_left); 415 + entry->bytes_left = 0; 416 + } else { 417 + entry->bytes_left -= len; 418 + } 419 + 420 + if (!uptodate) 421 + set_bit(BTRFS_ORDERED_IOERR, &entry->flags); 422 + 423 + /* 424 + * All the IO of the ordered extent is finished, so we need to 425 + * queue the finish_func to be executed. 426 + */ 427 + if (entry->bytes_left == 0) { 428 + set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 429 + cond_wake_up(&entry->wait); 430 + refcount_inc(&entry->refs); 431 + spin_unlock_irqrestore(&tree->lock, flags); 432 + btrfs_init_work(&entry->work, finish_func, NULL, NULL); 433 + btrfs_queue_work(wq, &entry->work); 434 + spin_lock_irqsave(&tree->lock, flags); 435 + } 436 + cur += len; 377 438 } 378 439 spin_unlock_irqrestore(&tree->lock, flags); 379 - return finished; 380 440 } 381 441 382 442 /* ··· 927 865 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 928 866 refcount_inc(&entry->refs); 929 867 out: 868 + spin_unlock_irq(&tree->lock); 869 + return entry; 870 + } 871 + 872 + /* 873 + * Lookup the first ordered extent that overlaps the range 874 + * [@file_offset, @file_offset + @len). 875 + * 876 + * The difference between this and btrfs_lookup_first_ordered_extent() is 877 + * that this one won't return any ordered extent that does not overlap the range. 878 + * And the difference against btrfs_lookup_ordered_extent() is that this function 879 + * ensures the first ordered extent gets returned.
880 + */ 881 + struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( 882 + struct btrfs_inode *inode, u64 file_offset, u64 len) 883 + { 884 + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; 885 + struct rb_node *node; 886 + struct rb_node *cur; 887 + struct rb_node *prev; 888 + struct rb_node *next; 889 + struct btrfs_ordered_extent *entry = NULL; 890 + 891 + spin_lock_irq(&tree->lock); 892 + node = tree->tree.rb_node; 893 + /* 894 + * Here we don't want to use tree_search() which will use tree->last 895 + * and screw up the search order. 896 + * And __tree_search() can't return the adjacent ordered extents 897 + * either, thus here we do our own search. 898 + */ 899 + while (node) { 900 + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 901 + 902 + if (file_offset < entry->file_offset) { 903 + node = node->rb_left; 904 + } else if (file_offset >= entry_end(entry)) { 905 + node = node->rb_right; 906 + } else { 907 + /* 908 + * Direct hit, got an ordered extent that starts at 909 + * @file_offset 910 + */ 911 + goto out; 912 + } 913 + } 914 + if (!entry) { 915 + /* Empty tree */ 916 + goto out; 917 + } 918 + 919 + cur = &entry->rb_node; 920 + /* We got an entry around @file_offset, check adjacent entries */ 921 + if (entry->file_offset < file_offset) { 922 + prev = cur; 923 + next = rb_next(cur); 924 + } else { 925 + prev = rb_prev(cur); 926 + next = cur; 927 + } 928 + if (prev) { 929 + entry = rb_entry(prev, struct btrfs_ordered_extent, rb_node); 930 + if (range_overlaps(entry, file_offset, len)) 931 + goto out; 932 + } 933 + if (next) { 934 + entry = rb_entry(next, struct btrfs_ordered_extent, rb_node); 935 + if (range_overlaps(entry, file_offset, len)) 936 + goto out; 937 + } 938 + /* No ordered extent in the range */ 939 + entry = NULL; 940 + out: 941 + if (entry) 942 + refcount_inc(&entry->refs); 930 943 spin_unlock_irq(&tree->lock); 931 944 return entry; 932 945 }
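The heart of the new `btrfs_mark_ordered_io_finished()` is the cursor walk: advance `cur` through `[file_offset, file_offset + num_bytes)`, skip past extents the cursor has already left behind, jump forward to the start of the next extent, and clamp each step to whichever ends first, the extent or the requested range. A self-contained sketch of that walk over a sorted array (instead of the kernel's rbtree; the names are invented for illustration):

```c
#include <stdint.h>
#include <stddef.h>

struct extent {
	uint64_t offset;	/* start of the extent */
	uint64_t len;		/* length in bytes */
};

/*
 * Visit the portion of each sorted, non-overlapping extent that
 * overlaps [file_offset, file_offset + num_bytes) and return the
 * total number of bytes visited.
 */
uint64_t walk_overlaps(const struct extent *ents, size_t n,
		       uint64_t file_offset, uint64_t num_bytes)
{
	uint64_t cur = file_offset;
	uint64_t total = 0;
	size_t i = 0;

	while (cur < file_offset + num_bytes && i < n) {
		uint64_t entry_end = ents[i].offset + ents[i].len;

		/* |<-- extent -->|  |   : cursor is past this extent */
		if (cur >= entry_end) {
			i++;
			continue;
		}
		/* |   |<-- extent -->|  : jump to the extent start */
		if (cur < ents[i].offset) {
			cur = ents[i].offset;
			continue;
		}
		/* Inside the extent: clamp to extent end or range end */
		uint64_t end = entry_end < file_offset + num_bytes ?
			       entry_end : file_offset + num_bytes;
		total += end - cur;
		cur = end;
	}
	return total;
}
```

In the kernel version each visited sub-range additionally tests and clears the per-sector Ordered (Private2) bit and decrements `entry->bytes_left`, which is what makes the walk subpage-safe.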
+6 -4
fs/btrfs/ordered-data.h
··· 172 172 void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); 173 173 void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, 174 174 struct btrfs_ordered_extent *entry); 175 + void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, 176 + struct page *page, u64 file_offset, 177 + u64 num_bytes, btrfs_func_t finish_func, 178 + bool uptodate); 175 179 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, 176 180 struct btrfs_ordered_extent **cached, 177 181 u64 file_offset, u64 io_size, int uptodate); 178 - bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, 179 - struct btrfs_ordered_extent **finished_ret, 180 - u64 *file_offset, u64 io_size, 181 - int uptodate); 182 182 int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, 183 183 u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, 184 184 int type); ··· 196 196 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 197 197 struct btrfs_ordered_extent * 198 198 btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); 199 + struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( 200 + struct btrfs_inode *inode, u64 file_offset, u64 len); 199 201 struct btrfs_ordered_extent *btrfs_lookup_ordered_range( 200 202 struct btrfs_inode *inode, 201 203 u64 file_offset,
+15 -1
fs/btrfs/props.c
··· 260 260 if (btrfs_compress_is_valid_type(value, len)) 261 261 return 0; 262 262 263 + if ((len == 2 && strncmp("no", value, 2) == 0) || 264 + (len == 4 && strncmp("none", value, 4) == 0)) 265 + return 0; 266 + 263 267 return -EINVAL; 264 268 } 265 269 ··· 273 269 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 274 270 int type; 275 271 272 + /* Reset to defaults */ 276 273 if (len == 0) { 274 + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; 275 + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; 276 + BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; 277 + return 0; 278 + } 279 + 280 + /* Set NOCOMPRESS flag */ 281 + if ((len == 2 && strncmp("no", value, 2) == 0) || 282 + (len == 4 && strncmp("none", value, 4) == 0)) { 277 283 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 278 284 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; 279 285 BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; ··· 362 348 363 349 /* 364 350 * This is not strictly necessary as the property should be 365 - * valid, but in case it isn't, don't propagate it futher. 351 + * valid, but in case it isn't, don't propagate it further. 366 352 */ 367 353 ret = h->validate(value, strlen(value)); 368 354 if (ret)
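The props.c change above gives the `btrfs.compression` property three distinct cases: an empty value resets to defaults, "no"/"none" explicitly forbids compression, and a known algorithm name enables it. A small sketch of that decision table (the function and enum names here are made up for illustration; the kernel sets inode flags instead of returning an action):

```c
#include <string.h>
#include <stddef.h>

enum comp_action { COMP_RESET, COMP_FORBID, COMP_SET };

/* Returns 0 and fills *action for a valid value, -1 otherwise */
int parse_compression_value(const char *value, size_t len,
			    enum comp_action *action)
{
	if (len == 0) {
		/* Empty value: clear both COMPRESS and NOCOMPRESS */
		*action = COMP_RESET;
		return 0;
	}
	if ((len == 2 && strncmp("no", value, 2) == 0) ||
	    (len == 4 && strncmp("none", value, 4) == 0)) {
		/* Explicitly forbid compression (NOCOMPRESS) */
		*action = COMP_FORBID;
		return 0;
	}
	if ((len == 4 && strncmp("zlib", value, 4) == 0) ||
	    (len == 3 && strncmp("lzo", value, 3) == 0) ||
	    (len == 4 && strncmp("zstd", value, 4) == 0)) {
		*action = COMP_SET;
		return 0;
	}
	return -1;	/* unrecognized value */
}
```

Previously the empty value set NOCOMPRESS, which made "reset to defaults" inexpressible; splitting the cases is what the changelog's "empty value will always mean reset to defaults" refers to.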
+2 -8
fs/btrfs/qgroup.c
··· 2521 2521 int ret = 0; 2522 2522 2523 2523 /* 2524 - * If quotas get disabled meanwhile, the resouces need to be freed and 2524 + * If quotas get disabled meanwhile, the resources need to be freed and 2525 2525 * we can't just exit here. 2526 2526 */ 2527 2527 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) ··· 3545 3545 struct btrfs_trans_handle *trans; 3546 3546 int ret; 3547 3547 3548 - /* 3549 - * Can't hold an open transaction or we run the risk of deadlocking, 3550 - * and can't either be under the context of a send operation (where 3551 - * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that 3552 - * would result in a crash when starting a transaction and does not 3553 - * make sense either (send is a read-only operation). 3554 - */ 3548 + /* Can't hold an open transaction or we run the risk of deadlocking. */ 3555 3549 ASSERT(current->journal_info == NULL); 3556 3550 if (WARN_ON(current->journal_info)) 3557 3551 return 0;
+9 -5
fs/btrfs/reflink.c
··· 7 7 #include "delalloc-space.h" 8 8 #include "reflink.h" 9 9 #include "transaction.h" 10 + #include "subpage.h" 10 11 11 12 #define BTRFS_MAX_DEDUPE_LEN SZ_16M 12 13 ··· 53 52 const u64 datal, 54 53 const u8 comp_type) 55 54 { 56 - const u64 block_size = btrfs_inode_sectorsize(inode); 55 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 56 + const u32 block_size = fs_info->sectorsize; 57 57 const u64 range_end = file_offset + block_size - 1; 58 58 const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); 59 59 char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); ··· 108 106 set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags); 109 107 110 108 if (comp_type == BTRFS_COMPRESS_NONE) { 111 - memcpy_to_page(page, 0, data_start, datal); 109 + memcpy_to_page(page, offset_in_page(file_offset), data_start, 110 + datal); 112 111 flush_dcache_page(page); 113 112 } else { 114 - ret = btrfs_decompress(comp_type, data_start, page, 0, 113 + ret = btrfs_decompress(comp_type, data_start, page, 114 + offset_in_page(file_offset), 115 115 inline_size, datal); 116 116 if (ret) 117 117 goto out_unlock; ··· 137 133 flush_dcache_page(page); 138 134 } 139 135 140 - SetPageUptodate(page); 136 + btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); 141 137 ClearPageChecked(page); 142 - set_page_dirty(page); 138 + btrfs_page_set_dirty(fs_info, page, file_offset, block_size); 143 139 out_unlock: 144 140 if (page) { 145 141 unlock_page(page);
+73 -2
fs/btrfs/relocation.c
··· 2876 2876 } 2877 2877 2878 2878 /* 2879 - * Allow error injection to test balance cancellation 2879 + * Allow error injection to test balance/relocation cancellation 2880 2880 */ 2881 2881 noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) 2882 2882 { 2883 2883 return atomic_read(&fs_info->balance_cancel_req) || 2884 + atomic_read(&fs_info->reloc_cancel_req) || 2884 2885 fatal_signal_pending(current); 2885 2886 } 2886 2887 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); ··· 3781 3780 return inode; 3782 3781 } 3783 3782 3783 + /* 3784 + * Mark start of chunk relocation that is cancellable. Check if the cancellation 3785 + * has been requested meanwhile and don't start in that case. 3786 + * 3787 + * Return: 3788 + * 0 success 3789 + * -EINPROGRESS operation is already in progress, that's probably a bug 3790 + * -ECANCELED cancellation request was set before the operation started 3791 + * -EAGAIN can not start because there are ongoing send operations 3792 + */ 3793 + static int reloc_chunk_start(struct btrfs_fs_info *fs_info) 3794 + { 3795 + spin_lock(&fs_info->send_reloc_lock); 3796 + if (fs_info->send_in_progress) { 3797 + btrfs_warn_rl(fs_info, 3798 + "cannot run relocation while send operations are in progress (%d in progress)", 3799 + fs_info->send_in_progress); 3800 + spin_unlock(&fs_info->send_reloc_lock); 3801 + return -EAGAIN; 3802 + } 3803 + if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { 3804 + /* This should not happen */ 3805 + spin_unlock(&fs_info->send_reloc_lock); 3806 + btrfs_err(fs_info, "reloc already running, cannot start"); 3807 + return -EINPROGRESS; 3808 + } 3809 + spin_unlock(&fs_info->send_reloc_lock); 3810 + 3811 + if (atomic_read(&fs_info->reloc_cancel_req) > 0) { 3812 + btrfs_info(fs_info, "chunk relocation canceled on start"); 3813 + /* 3814 + * On cancel, clear all requests but let the caller mark 3815 + * the end after cleanup operations. 
3816 + */ 3817 + atomic_set(&fs_info->reloc_cancel_req, 0); 3818 + return -ECANCELED; 3819 + } 3820 + return 0; 3821 + } 3822 + 3823 + /* 3824 + * Mark end of chunk relocation that is cancellable and wake any waiters. 3825 + */ 3826 + static void reloc_chunk_end(struct btrfs_fs_info *fs_info) 3827 + { 3828 + /* Requested after start, clear bit first so any waiters can continue */ 3829 + if (atomic_read(&fs_info->reloc_cancel_req) > 0) 3830 + btrfs_info(fs_info, "chunk relocation canceled during operation"); 3831 + spin_lock(&fs_info->send_reloc_lock); 3832 + clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); 3833 + spin_unlock(&fs_info->send_reloc_lock); 3834 + atomic_set(&fs_info->reloc_cancel_req, 0); 3835 + } 3836 + 3784 3837 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) 3785 3838 { 3786 3839 struct reloc_control *rc; ··· 3915 3860 if (!rc) { 3916 3861 btrfs_put_block_group(bg); 3917 3862 return -ENOMEM; 3863 + } 3864 + 3865 + ret = reloc_chunk_start(fs_info); 3866 + if (ret < 0) { 3867 + err = ret; 3868 + goto out_put_bg; 3918 3869 } 3919 3870 3920 3871 rc->extent_root = extent_root; ··· 4013 3952 if (err && rw) 4014 3953 btrfs_dec_block_group_ro(rc->block_group); 4015 3954 iput(rc->data_inode); 4016 - btrfs_put_block_group(rc->block_group); 3955 + out_put_bg: 3956 + btrfs_put_block_group(bg); 3957 + reloc_chunk_end(fs_info); 4017 3958 free_reloc_control(rc); 4018 3959 return err; 4019 3960 } ··· 4136 4073 goto out; 4137 4074 } 4138 4075 4076 + ret = reloc_chunk_start(fs_info); 4077 + if (ret < 0) { 4078 + err = ret; 4079 + goto out_end; 4080 + } 4081 + 4139 4082 rc->extent_root = fs_info->extent_root; 4140 4083 4141 4084 set_reloc_control(rc); ··· 4206 4137 err = ret; 4207 4138 out_unset: 4208 4139 unset_reloc_control(rc); 4140 + out_end: 4141 + reloc_chunk_end(fs_info); 4209 4142 free_reloc_control(rc); 4210 4143 out: 4211 4144 free_reloc_roots(&reloc_roots);
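The `reloc_chunk_start()`/`reloc_chunk_end()` pair above implements a small cancellable-operation protocol: a running flag plus a counter of pending cancel requests, where a cancel requested before start prevents the operation from running at all, and end clears both. A single-threaded sketch of that protocol (plain ints for illustration; the kernel guards the real state with `send_reloc_lock`, an atomic counter, and a wake-up bit):

```c
struct reloc_state {
	int running;	/* is a relocation in progress */
	int cancel_req;	/* pending cancel requests */
};

/* 0 on success, -1 if already running, -2 if canceled before starting */
int reloc_start(struct reloc_state *s)
{
	if (s->running)
		return -1;
	if (s->cancel_req > 0) {
		/* Consume the request; the operation never runs */
		s->cancel_req = 0;
		return -2;
	}
	s->running = 1;
	return 0;
}

/* Polled by the long-running operation, like btrfs_should_cancel_balance */
int reloc_should_cancel(const struct reloc_state *s)
{
	return s->cancel_req > 0;
}

/* Clear the running flag and any cancel requests that raced with us */
void reloc_end(struct reloc_state *s)
{
	s->running = 0;
	s->cancel_req = 0;
}
```

The ioctl cancel path corresponds to bumping `cancel_req` while the exclusive-operation lock pins the state, then waiting for the running flag to clear.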
+111 -48
fs/btrfs/scrub.c
··· 165 165 int readonly; 166 166 int pages_per_rd_bio; 167 167 168 + /* State of IO submission throttling affecting the associated device */ 169 + ktime_t throttle_deadline; 170 + u64 throttle_sent; 171 + 168 172 int is_dev_replace; 169 173 u64 write_pointer; 170 174 ··· 609 605 spin_lock_init(&sctx->list_lock); 610 606 spin_lock_init(&sctx->stat_lock); 611 607 init_waitqueue_head(&sctx->list_wait); 608 + sctx->throttle_deadline = 0; 612 609 613 610 WARN_ON(sctx->wr_curr_bio != NULL); 614 611 mutex_init(&sctx->wr_lock); ··· 631 626 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, 632 627 void *warn_ctx) 633 628 { 634 - u64 isize; 635 629 u32 nlink; 636 630 int ret; 637 631 int i; ··· 666 662 eb = swarn->path->nodes[0]; 667 663 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], 668 664 struct btrfs_inode_item); 669 - isize = btrfs_inode_size(eb, inode_item); 670 665 nlink = btrfs_inode_nlink(eb, inode_item); 671 666 btrfs_release_path(swarn->path); 672 667 ··· 694 691 */ 695 692 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 696 693 btrfs_warn_in_rcu(fs_info, 697 - "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", 694 + "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", 698 695 swarn->errstr, swarn->logical, 699 696 rcu_str_deref(swarn->dev->name), 700 697 swarn->physical, 701 698 root, inum, offset, 702 - min(isize - offset, (u64)PAGE_SIZE), nlink, 699 + fs_info->sectorsize, nlink, 703 700 (char *)(unsigned long)ipath->fspath->val[i]); 704 701 705 702 btrfs_put_root(local_root); ··· 888 885 * read all mirrors one after the other. 
This includes to 889 886 * re-read the extent or metadata block that failed (that was 890 887 * the cause that this fixup code is called) another time, 891 - * page by page this time in order to know which pages 888 + * sector by sector this time in order to know which sectors 892 889 * caused I/O errors and which ones are good (for all mirrors). 893 890 * It is the goal to handle the situation when more than one 894 891 * mirror contains I/O errors, but the errors do not 895 892 * overlap, i.e. the data can be repaired by selecting the 896 - * pages from those mirrors without I/O error on the 897 - * particular pages. One example (with blocks >= 2 * PAGE_SIZE) 898 - * would be that mirror #1 has an I/O error on the first page, 899 - * the second page is good, and mirror #2 has an I/O error on 900 - * the second page, but the first page is good. 901 - * Then the first page of the first mirror can be repaired by 902 - * taking the first page of the second mirror, and the 903 - * second page of the second mirror can be repaired by 904 - * copying the contents of the 2nd page of the 1st mirror. 905 - * One more note: if the pages of one mirror contain I/O 893 + * sectors from those mirrors without I/O error on the 894 + * particular sectors. One example (with blocks >= 2 * sectorsize) 895 + * would be that mirror #1 has an I/O error on the first sector, 896 + * the second sector is good, and mirror #2 has an I/O error on 897 + * the second sector, but the first sector is good. 898 + * Then the first sector of the first mirror can be repaired by 899 + * taking the first sector of the second mirror, and the 900 + * second sector of the second mirror can be repaired by 901 + * copying the contents of the 2nd sector of the 1st mirror. 902 + * One more note: if the sectors of one mirror contain I/O 906 903 * errors, the checksum cannot be verified. 
In order to get 907 904 * the best data for repairing, the first attempt is to find 908 905 * a mirror without I/O errors and with a validated checksum. 909 - * Only if this is not possible, the pages are picked from 906 + * Only if this is not possible, the sectors are picked from 910 907 * mirrors with I/O errors without considering the checksum. 911 908 * If the latter is the case, at the end, the checksum of the 912 909 * repaired area is verified in order to correctly maintain ··· 1063 1060 1064 1061 /* 1065 1062 * In case of I/O errors in the area that is supposed to be 1066 - * repaired, continue by picking good copies of those pages. 1067 - * Select the good pages from mirrors to rewrite bad pages from 1063 + * repaired, continue by picking good copies of those sectors. 1064 + * Select the good sectors from mirrors to rewrite bad sectors from 1068 1065 * the area to fix. Afterwards verify the checksum of the block 1069 1066 * that is supposed to be repaired. This verification step is 1070 1067 * only done for the purpose of statistic counting and for the 1071 1068 * final scrub report, whether errors remain. 1072 1069 * A perfect algorithm could make use of the checksum and try 1073 - * all possible combinations of pages from the different mirrors 1070 + * all possible combinations of sectors from the different mirrors 1074 1071 * until the checksum verification succeeds. For example, when 1075 - * the 2nd page of mirror #1 faces I/O errors, and the 2nd page 1072 + * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector 1076 1073 * of mirror #2 is readable but the final checksum test fails, 1077 - * then the 2nd page of mirror #3 could be tried, whether now 1074 + * then the 2nd sector of mirror #3 could be tried, whether now 1078 1075 * the final checksum succeeds. But this would be a rare 1079 1076 * exception and is therefore not implemented. At least it is 1080 1077 * avoided that the good copy is overwritten. 
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
-	 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
+	 * disks) instead of on sectorsize. Then maybe 512 byte of one
	 * mirror could be repaired by taking 512 byte of a different
-	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
+	 * mirror, even if other 512 byte sectors in the same sectorsize
	 * area are unreadable.
	 */
	success = 1;
···
{
	struct scrub_ctx *sctx = original_sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
-	u64 length = original_sblock->page_count * PAGE_SIZE;
+	u64 length = original_sblock->page_count * fs_info->sectorsize;
	u64 logical = original_sblock->pagev[0]->logical;
	u64 generation = original_sblock->pagev[0]->generation;
	u64 flags = original_sblock->pagev[0]->flags;
···
	 */

	while (length > 0) {
-		sublen = min_t(u64, length, PAGE_SIZE);
+		sublen = min_t(u64, length, fs_info->sectorsize);
		mapped_length = sublen;
		bbio = NULL;

		/*
-		 * with a length of PAGE_SIZE, each returned stripe
-		 * represents one mirror
+		 * With a length of sectorsize, each returned stripe represents
+		 * one mirror
		 */
		btrfs_bio_counter_inc_blocked(fs_info);
		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
···
	bio = btrfs_io_bio_alloc(1);
	bio_set_dev(bio, spage->dev->bdev);

-	bio_add_page(bio, spage->page, PAGE_SIZE, 0);
+	bio_add_page(bio, spage->page, fs_info->sectorsize, 0);
	bio->bi_iter.bi_sector = spage->physical >> 9;
	bio->bi_opf = REQ_OP_READ;
···
	struct scrub_page *spage_bad = sblock_bad->pagev[page_num];
	struct scrub_page *spage_good =
				sblock_good->pagev[page_num];
	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
+	const u32 sectorsize = fs_info->sectorsize;

	BUG_ON(spage_bad->page == NULL);
	BUG_ON(spage_good->page == NULL);
···
		bio->bi_iter.bi_sector = spage_bad->physical >> 9;
		bio->bi_opf = REQ_OP_WRITE;

-		ret = bio_add_page(bio, spage_good->page, PAGE_SIZE, 0);
-		if (PAGE_SIZE != ret) {
+		ret = bio_add_page(bio, spage_good->page, sectorsize, 0);
+		if (ret != sectorsize) {
			bio_put(bio);
			return -EIO;
		}
···
{
	struct scrub_bio *sbio;
	int ret;
+	const u32 sectorsize = sctx->fs_info->sectorsize;

	mutex_lock(&sctx->wr_lock);
again:
···
		bio->bi_iter.bi_sector = sbio->physical >> 9;
		bio->bi_opf = REQ_OP_WRITE;
		sbio->status = 0;
-	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+	} else if (sbio->physical + sbio->page_count * sectorsize !=
		   spage->physical_for_dev_replace ||
-		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   sbio->logical + sbio->page_count * sectorsize !=
		   spage->logical) {
		scrub_wr_submit(sctx);
		goto again;
	}

-	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
-	if (ret != PAGE_SIZE) {
+	ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+	if (ret != sectorsize) {
		if (sbio->page_count < 1) {
			bio_put(sbio->bio);
			sbio->bio = NULL;
···
	btrfsic_submit_bio(sbio->bio);

	if (btrfs_is_zoned(sctx->fs_info))
-		sctx->write_pointer = sbio->physical + sbio->page_count * PAGE_SIZE;
+		sctx->write_pointer = sbio->physical + sbio->page_count *
+			sctx->fs_info->sectorsize;
}

static void scrub_wr_bio_end_io(struct bio *bio)
···
		}
	}
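The repair strategy described in the comments above — prefer a mirror that read cleanly, otherwise stitch the block together sector by sector from whichever mirror read each sector without error — can be sketched as a small standalone function. The name and the error-bitmap representation here are illustrative, not btrfs code:

```c
#include <stdint.h>

/*
 * Hypothetical model of the selection policy described above: for each
 * sector of a block, pick the lowest-numbered mirror whose read of that
 * sector did not report an I/O error. Returns the mirror index, or -1
 * if every mirror failed on that sector (unrecoverable).
 */
static int pick_good_mirror(const uint64_t *io_err_bitmap, int nr_mirrors,
			    int sector)
{
	for (int m = 0; m < nr_mirrors; m++) {
		/* bit set in the bitmap == this mirror failed this sector */
		if (!(io_err_bitmap[m] & (1ULL << sector)))
			return m;
	}
	return -1;	/* all mirrors failed for this sector */
}
```

With the two-mirror example from the comment (mirror #1 bad on the first sector, mirror #2 bad on the second), the function picks mirror #2 for sector 0 and mirror #1 for sector 1, repairing the block even though no single mirror is fully readable.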

+/*
+ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
+ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
+ */
+static void scrub_throttle(struct scrub_ctx *sctx)
+{
+	const int time_slice = 1000;
+	struct scrub_bio *sbio;
+	struct btrfs_device *device;
+	s64 delta;
+	ktime_t now;
+	u32 div;
+	u64 bwlimit;
+
+	sbio = sctx->bios[sctx->curr];
+	device = sbio->dev;
+	bwlimit = READ_ONCE(device->scrub_speed_max);
+	if (bwlimit == 0)
+		return;
+
+	/*
+	 * Slice is divided into intervals when the IO is submitted, adjust by
+	 * bwlimit and maximum of 64 intervals.
+	 */
+	div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
+	div = min_t(u32, 64, div);
+
+	/* Start new epoch, set deadline */
+	now = ktime_get();
+	if (sctx->throttle_deadline == 0) {
+		sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
+		sctx->throttle_sent = 0;
+	}
+
+	/* Still in the time to send?
+	 */
+	if (ktime_before(now, sctx->throttle_deadline)) {
+		/* If current bio is within the limit, send it */
+		sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
+		if (sctx->throttle_sent <= div_u64(bwlimit, div))
+			return;
+
+		/* We're over the limit, sleep until the rest of the slice */
+		delta = ktime_ms_delta(sctx->throttle_deadline, now);
+	} else {
+		/* New request after deadline, start new epoch */
+		delta = 0;
+	}
+
+	if (delta) {
+		long timeout;
+
+		timeout = div_u64(delta * HZ, 1000);
+		schedule_timeout_interruptible(timeout);
+	}
+
+	/* Next call will start the deadline period */
+	sctx->throttle_deadline = 0;
+}
+
static void scrub_submit(struct scrub_ctx *sctx)
{
	struct scrub_bio *sbio;

	if (sctx->curr == -1)
		return;
+
+	scrub_throttle(sctx);

	sbio = sctx->bios[sctx->curr];
	sctx->curr = -1;
···
{
	struct scrub_block *sblock = spage->sblock;
	struct scrub_bio *sbio;
+	const u32 sectorsize = sctx->fs_info->sectorsize;
	int ret;

again:
···
		bio->bi_iter.bi_sector = sbio->physical >> 9;
		bio->bi_opf = REQ_OP_READ;
		sbio->status = 0;
-	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+	} else if (sbio->physical + sbio->page_count * sectorsize !=
		   spage->physical ||
-		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   sbio->logical + sbio->page_count * sectorsize !=
		   spage->logical ||
		   sbio->dev != spage->dev) {
		scrub_submit(sctx);
···
	}

	sbio->pagev[sbio->page_count] = spage;
-	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
-	if (ret != PAGE_SIZE) {
+	ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0);
+	if (ret !=
+	    sectorsize) {
		if (sbio->page_count < 1) {
			bio_put(sbio->bio);
			sbio->bio = NULL;
···
	if (sblock->sparity && corrupted && !sblock->data_corrected) {
		u64 start = sblock->pagev[0]->logical;
		u64 end = sblock->pagev[sblock->page_count - 1]->logical +
-			  PAGE_SIZE;
+			  sblock->sctx->fs_info->sectorsize;

		ASSERT(end - start <= U32_MAX);
		scrub_parity_mark_sectors_error(sblock->sparity,
···
 * the csum into @csum.
 *
 * The search source is sctx->csum_list, which is a pre-populated list
- * storing bytenr ordered csum ranges. We're reponsible to cleanup any range
+ * storing bytenr ordered csum ranges. We're responsible to cleanup any range
 * that is before @logical.
 *
 * Return 0 if there is no csum for the range.
···
	physical = map->stripes[num].physical;
	offset = 0;
	nstripes = div64_u64(length, map->stripe_len);
+	mirror_num = 1;
+	increment = map->stripe_len;
	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
		offset = map->stripe_len * num;
		increment = map->stripe_len * map->num_stripes;
-		mirror_num = 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
		int factor = map->num_stripes / map->sub_stripes;
		offset = map->stripe_len * (num / map->sub_stripes);
		increment = map->stripe_len * factor;
		mirror_num = num % map->sub_stripes + 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
-		increment = map->stripe_len;
		mirror_num = num % map->num_stripes + 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		increment = map->stripe_len;
		mirror_num = num % map->num_stripes + 1;
	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		get_raid56_logic_offset(physical, num, map, &offset, NULL);
		increment = map->stripe_len *
			    nr_data_stripes(map);
-		mirror_num = 1;
-	} else {
-		increment = map->stripe_len;
-		mirror_num = 1;
	}

	path = btrfs_alloc_path();
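The bandwidth math in scrub_throttle() above is easy to check in isolation: the one-second timeslice is split into bwlimit/16MiB intervals, clamped to [1, 64], and each interval may send roughly bwlimit/div bytes before the submitter sleeps off the rest of the slice. A standalone sketch of just that arithmetic (helper names are ours, not kernel code):

```c
#include <stdint.h>

/*
 * Mirror of the interval math in scrub_throttle(): the 1s timeslice is
 * divided into bwlimit/16MiB intervals, at least 1 and at most 64, so
 * sleeping gets finer-grained as the configured limit grows.
 */
static uint32_t scrub_throttle_div(uint64_t bwlimit)
{
	uint32_t div = (uint32_t)(bwlimit / (16 * 1024 * 1024));

	if (div < 1)
		div = 1;
	if (div > 64)
		div = 64;
	return div;
}

/* Bytes allowed per interval before the submitter goes to sleep. */
static uint64_t bytes_per_interval(uint64_t bwlimit)
{
	return bwlimit / scrub_throttle_div(bwlimit);
}
```

For example, a 160 MiB/s limit gives 10 intervals of 100 ms, each allowed 16 MiB, while any limit above 1 GiB/s caps out at 64 intervals.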
+26 -21
fs/btrfs/send.c
···
}

/*
- * Removes the entry from the list and adds it back to the end. This marks the
- * entry as recently used so that name_cache_clean_unused does not remove it.
- */
-static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
-{
-	list_del(&nce->list);
-	list_add_tail(&nce->list, &sctx->name_cache_list);
-}
-
-/*
 * Remove some entries from the beginning of name_cache_list.
 */
static void name_cache_clean_unused(struct send_ctx *sctx)
···
		kfree(nce);
		nce = NULL;
	} else {
-		name_cache_used(sctx, nce);
+		/*
+		 * Removes the entry from the list and adds it back to
+		 * the end. This marks the entry as recently used so
+		 * that name_cache_clean_unused does not remove it.
+		 */
+		list_move_tail(&nce->list, &sctx->name_cache_list);
+
		*parent_ino = nce->parent_ino;
		*parent_gen = nce->parent_gen;
		ret = fs_path_add(dest, nce->name, nce->name_len);
···
		if (ret < 0)
			goto out;
	} else {
+		/*
+		 * If we previously orphanized a directory that
+		 * collided with a new reference that we already
+		 * processed, recompute the current path because
+		 * that directory may be part of the path.
+		 */
+		if (orphanized_dir) {
+			ret = refresh_ref_path(sctx, cur);
+			if (ret < 0)
+				goto out;
+		}
		ret = send_unlink(sctx, cur->full_path);
		if (ret < 0)
			goto out;
···
	 * updates the inode item, but it only changes the iversion (sequence
	 * field in the inode item) of the inode, so if a file is deduplicated
	 * the same amount of times in both the parent and send snapshots, its
-	 * iversion becames the same in both snapshots, whence the inode item is
+	 * iversion becomes the same in both snapshots, whence the inode item is
	 * the same on both snapshots.
	 */
	if (sctx->cur_ino != sctx->cmp_key->objectid)
···
	if (ret)
		goto out;

-	mutex_lock(&fs_info->balance_mutex);
-	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
-		mutex_unlock(&fs_info->balance_mutex);
+	spin_lock(&fs_info->send_reloc_lock);
+	if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
+		spin_unlock(&fs_info->send_reloc_lock);
		btrfs_warn_rl(fs_info,
-			      "cannot run send because a balance operation is in progress");
+			      "cannot run send because a relocation operation is in progress");
		ret = -EAGAIN;
		goto out;
	}
	fs_info->send_in_progress++;
-	mutex_unlock(&fs_info->balance_mutex);
+	spin_unlock(&fs_info->send_reloc_lock);

-	current->journal_info = BTRFS_SEND_TRANS_STUB;
	ret = send_subvol(sctx);
-	current->journal_info = NULL;
-	mutex_lock(&fs_info->balance_mutex);
+	spin_lock(&fs_info->send_reloc_lock);
	fs_info->send_in_progress--;
-	mutex_unlock(&fs_info->balance_mutex);
+	spin_unlock(&fs_info->send_reloc_lock);
	if (ret < 0)
		goto out;
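The name-cache change above replaces an open-coded list_del() + list_add_tail() pair with list_move_tail(), which re-queues an entry at the tail so the LRU cleanup in name_cache_clean_unused() skips it. A minimal userspace model of that intrusive-list operation (this is just the same pointer surgery, not the kernel's list.h):

```c
#include <stddef.h>

/* Minimal intrusive circular doubly linked list, modeled on list_head. */
struct list_node {
	struct list_node *prev, *next;
};

static void list_init(struct list_node *head)
{
	head->prev = head->next = head;
}

/*
 * Equivalent of list_move_tail(): unlink the node from wherever it is
 * and re-insert it just before the head, i.e. at the tail. For a
 * self-linked (detached) node the unlink is a no-op, so this also
 * serves as a plain tail append here.
 */
static void list_move_tail_sketch(struct list_node *node,
				  struct list_node *head)
{
	/* unlink */
	node->prev->next = node->next;
	node->next->prev = node->prev;
	/* insert before head == append at tail (most recently used) */
	node->prev = head->prev;
	node->next = head;
	head->prev->next = node;
	head->prev = node;
}
```

An LRU built this way evicts from head->next (least recently used), and touching an entry is a single list_move_tail, exactly what the send name cache now does.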
+50 -183
fs/btrfs/space-info.c
···
 *   operations, however they won't be usable until the transaction commits.
 *
 * COMMIT_TRANS
- *   may_commit_transaction() is the ultimate arbiter on whether we commit the
- *   transaction or not. In order to avoid constantly churning we do all the
- *   above flushing first and then commit the transaction as the last resort.
- *   However we need to take into account things like pinned space that would
- *   be freed, plus any delayed work we may not have gotten rid of in the case
- *   of metadata.
- *
- * FORCE_COMMIT_TRANS
- *   For use by the preemptive flusher. We use this to bypass the ticketing
- *   checks in may_commit_transaction, as we have more information about the
- *   overall state of the system and may want to commit the transaction ahead
- *   of actual ENOSPC conditions.
+ *   This will commit the transaction. Historically we had a lot of logic
+ *   surrounding whether or not we'd commit the transaction, but this was born
+ *   out of a pre-tickets era where we could end up committing the transaction
+ *   thousands of times in a row without making progress. Now thanks to our
+ *   ticketing system we know if we're not making progress and can error
+ *   everybody out after a few commits rather than burning the disk hoping for
+ *   a different answer.
 *
 * OVERCOMMIT
 *
···
	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;
-
-	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
-				  GFP_KERNEL);
-	if (ret) {
-		kfree(space_info);
-		return ret;
-	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
···

	ticket = list_first_entry(head, struct reserve_ticket, list);

-	/* Check and see if our ticket can be satisified now.
-	 */
+	/* Check and see if our ticket can be satisfied now. */
	if ((used + ticket->bytes <= space_info->total_bytes) ||
	    btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
				 flush)) {
···
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *space_info,
-			    u64 to_reclaim, bool wait_ordered)
+			    u64 to_reclaim, bool wait_ordered,
+			    bool for_preempt)
{
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
···
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
-	if (ordered_bytes > delalloc_bytes)
+	if (ordered_bytes > delalloc_bytes && !for_preempt)
		wait_ordered = true;

	loops = 0;
···
			break;
		}

+		/*
+		 * If we are for preemption we just want a one-shot of delalloc
+		 * flushing so we can stop flushing if we decide we don't need
+		 * to anymore.
+		 */
+		if (for_preempt)
+			break;
+
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
···
		ordered_bytes = percpu_counter_sum_positive(
						&fs_info->ordered_bytes);
	}
-}
-
-/**
- * Possibly commit the transaction if its ok to
- *
- * @fs_info:    the filesystem
- * @space_info: space_info we are checking for commit, either data or metadata
- *
- * This will check to make sure that committing the transaction will actually
- * get us somewhere and then commit the transaction if it does. Otherwise it
- * will return -ENOSPC.
- */
-static int may_commit_transaction(struct btrfs_fs_info *fs_info,
-				  struct btrfs_space_info *space_info)
-{
-	struct reserve_ticket *ticket = NULL;
-	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
-	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
-	struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
-	struct btrfs_trans_handle *trans;
-	u64 reclaim_bytes = 0;
-	u64 bytes_needed = 0;
-	u64 cur_free_bytes = 0;
-
-	trans = (struct btrfs_trans_handle *)current->journal_info;
-	if (trans)
-		return -EAGAIN;
-
-	spin_lock(&space_info->lock);
-	cur_free_bytes = btrfs_space_info_used(space_info, true);
-	if (cur_free_bytes < space_info->total_bytes)
-		cur_free_bytes = space_info->total_bytes - cur_free_bytes;
-	else
-		cur_free_bytes = 0;
-
-	if (!list_empty(&space_info->priority_tickets))
-		ticket = list_first_entry(&space_info->priority_tickets,
-					  struct reserve_ticket, list);
-	else if (!list_empty(&space_info->tickets))
-		ticket = list_first_entry(&space_info->tickets,
-					  struct reserve_ticket, list);
-	if (ticket)
-		bytes_needed = ticket->bytes;
-
-	if (bytes_needed > cur_free_bytes)
-		bytes_needed -= cur_free_bytes;
-	else
-		bytes_needed = 0;
-	spin_unlock(&space_info->lock);
-
-	if (!bytes_needed)
-		return 0;
-
-	trans = btrfs_join_transaction(fs_info->extent_root);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
-
-	/*
-	 * See if there is enough pinned space to make this reservation, or if
-	 * we have block groups that are going to be freed, allowing us to
-	 * possibly do a chunk allocation the next loop through.
-	 */
-	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
-	    __percpu_counter_compare(&space_info->total_bytes_pinned,
-				     bytes_needed,
-				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
-		goto commit;
-
-	/*
-	 * See if there is some space in the delayed insertion reserve for this
-	 * reservation. If the space_info's don't match (like for DATA or
-	 * SYSTEM) then just go enospc, reclaiming this space won't recover any
-	 * space to satisfy those reservations.
-	 */
-	if (space_info != delayed_rsv->space_info)
-		goto enospc;
-
-	spin_lock(&delayed_rsv->lock);
-	reclaim_bytes += delayed_rsv->reserved;
-	spin_unlock(&delayed_rsv->lock);
-
-	spin_lock(&delayed_refs_rsv->lock);
-	reclaim_bytes += delayed_refs_rsv->reserved;
-	spin_unlock(&delayed_refs_rsv->lock);
-
-	spin_lock(&trans_rsv->lock);
-	reclaim_bytes += trans_rsv->reserved;
-	spin_unlock(&trans_rsv->lock);
-
-	if (reclaim_bytes >= bytes_needed)
-		goto commit;
-	bytes_needed -= reclaim_bytes;
-
-	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
-				     bytes_needed,
-				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
-		goto enospc;
-
-commit:
-	return btrfs_commit_transaction(trans);
-enospc:
-	btrfs_end_transaction(trans);
-	return -ENOSPC;
}
···
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, space_info, num_bytes,
-				state == FLUSH_DELALLOC_WAIT);
+				state == FLUSH_DELALLOC_WAIT, for_preempt);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
···
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
-		ret = may_commit_transaction(fs_info, space_info);
-		break;
-	case FORCE_COMMIT_TRANS:
+		ASSERT(current->journal_info == NULL);
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
···
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info)
{
+	u64 global_rsv_size = fs_info->global_block_rsv.reserved;
	u64 ordered, delalloc;
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
	u64 used;

	/* If we're just plain full then async reclaim just slows us down. */
-	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
+	if ((space_info->bytes_used + space_info->bytes_reserved +
+	     global_rsv_size) >= thresh)
		return false;

	/*
···
	thresh = calc_available_free_space(fs_info, space_info,
					   BTRFS_RESERVE_FLUSH_ALL);
-	thresh += (space_info->total_bytes - space_info->bytes_used -
-		   space_info->bytes_reserved - space_info->bytes_readonly);
+	used = space_info->bytes_used + space_info->bytes_reserved +
+	       space_info->bytes_readonly + global_rsv_size;
+	if (used < space_info->total_bytes)
+		thresh += space_info->total_bytes - used;
	thresh >>= space_info->clamp;

	used = space_info->bytes_pinned;
···
	 * clearly be heavy enough to warrant preemptive flushing. In the case
	 * of heavy DIO or ordered reservations, preemptive flushing will just
	 * waste time and cause us to slow down.
+	 *
+	 * We want to make sure we truly are maxed out on ordered however, so
+	 * cut ordered in half, and if it's still higher than delalloc then we
+	 * can keep flushing. This is to avoid the case where we start
+	 * flushing, and now delalloc == ordered and we stop preemptively
+	 * flushing when we could still have several gigs of delalloc to flush.
	 */
-	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
+	ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
	delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
	if (ordered >= delalloc)
		used += fs_info->delayed_refs_rsv.reserved +
			fs_info->delayed_block_rsv.reserved;
	else
-		used += space_info->bytes_may_use;
+		used += space_info->bytes_may_use - global_rsv_size;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
···
{
	struct reserve_ticket *ticket;
	u64 tickets_id = space_info->tickets_id;
-	u64 first_ticket_bytes = 0;

	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
···

		if (ticket->steal &&
		    steal_from_global_rsv(fs_info, space_info, ticket))
-			return true;
-
-		/*
-		 * may_commit_transaction will avoid committing the transaction
-		 * if it doesn't feel like the space reclaimed by the commit
-		 * would result in the ticket succeeding. However if we have a
-		 * smaller ticket in the queue it may be small enough to be
-		 * satisified by committing the transaction, so if any
-		 * subsequent ticket is smaller than the first ticket go ahead
-		 * and send us back for another loop through the enospc flushing
-		 * code.
-		 */
-		if (first_ticket_bytes == 0)
-			first_ticket_bytes = ticket->bytes;
-		else if (first_ticket_bytes > ticket->bytes)
			return true;

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
···
		   (delayed_block_rsv->reserved +
		    delayed_refs_rsv->reserved)) {
		to_reclaim = space_info->bytes_pinned;
-		flush = FORCE_COMMIT_TRANS;
+		flush = COMMIT_TRANS;
	} else if (delayed_block_rsv->reserved >
		   delayed_refs_rsv->reserved) {
		to_reclaim = delayed_block_rsv->reserved;
···
 * immediately re-usable, it comes in the form of a delayed ref, which must be
 * run and then the transaction must be committed.
 *
- * FLUSH_DELAYED_REFS
- *   The above two cases generate delayed refs that will affect
- *   ->total_bytes_pinned. However this counter can be inconsistent with
- *   reality if there are outstanding delayed refs. This is because we adjust
- *   the counter based solely on the current set of delayed refs and disregard
- *   any on-disk state which might include more refs. So for example, if we
- *   have an extent with 2 references, but we only drop 1, we'll see that there
- *   is a negative delayed ref count for the extent and assume that the space
- *   will be freed, and thus increase ->total_bytes_pinned.
- *
- *   Running the delayed refs gives us the actual real view of what will be
- *   freed at the transaction commit time. This stage will not actually free
- *   space for us, it just makes sure that may_commit_transaction() has all of
- *   the information it needs to make the right decision.
- *
 * COMMIT_TRANS
- *   This is where we reclaim all of the pinned space generated by the previous
- *   two stages.
- *   We will not commit the transaction if we don't think we're
- *   likely to satisfy our request, which means if our current free space +
- *   total_bytes_pinned < reservation we will not commit. This is why the
- *   previous states are actually important, to make sure we know for sure
- *   whether committing the transaction will allow us to make progress.
+ *   This is where we reclaim all of the pinned space generated by running the
+ *   iputs
 *
 * ALLOC_CHUNK_FORCE
 *   For data we start with alloc chunk force, however we could have been full
···
static const enum btrfs_flush_state data_flush_states[] = {
	FLUSH_DELALLOC_WAIT,
	RUN_DELAYED_IPUTS,
-	FLUSH_DELAYED_REFS,
	COMMIT_TRANS,
	ALLOC_CHUNK_FORCE,
};
···
		    flush == BTRFS_RESERVE_FLUSH_DATA) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
+				/*
+				 * We were forced to add a reserve ticket, so
+				 * our preemptive flushing is unable to keep
+				 * up. Clamp down on the threshold for the
+				 * preemptive flushing in order to keep up with
+				 * the workload.
+				 */
+				maybe_clamp_preempt(fs_info, space_info);
+
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
···
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
-
-		/*
-		 * We were forced to add a reserve ticket, so our preemptive
-		 * flushing is unable to keep up. Clamp down on the threshold
-		 * for the preemptive flushing in order to keep up with the
-		 * workload.
-		 */
-		maybe_clamp_preempt(fs_info, space_info);
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
···
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
-		    need_preemptive_reclaim(fs_info, space_info) &&
-		    !work_busy(&fs_info->preempt_reclaim_work)) {
+		    !work_busy(&fs_info->preempt_reclaim_work) &&
+		    need_preemptive_reclaim(fs_info, space_info)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
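The halved-ordered comparison added to need_preemptive_reclaim() above can be modeled in isolation: ordered bytes are cut in half before being compared with delalloc, so the flusher keeps attributing pressure to delalloc until ordered truly dominates, instead of stopping as soon as the two counters meet. A standalone sketch (the function name is ours):

```c
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the heuristic above: return true while the delalloc side
 * should still be counted as the dominant source of reclaimable space.
 * Halving ordered means that at ordered == delalloc we keep flushing
 * delalloc, and only once ordered is at least twice delalloc do we
 * switch to counting the delayed refs/block reserves instead.
 */
static bool count_delalloc_side(uint64_t ordered_bytes,
				uint64_t delalloc_bytes)
{
	return (ordered_bytes >> 1) < delalloc_bytes;
}
```

For example, with 4 GiB ordered and 3 GiB delalloc the halved value (2 GiB) is still below delalloc, so preemptive flushing continues on the delalloc side; with only 1 GiB of delalloc left, ordered dominates and the accounting switches over.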
-30
fs/btrfs/space-info.h
···

	u64 flags;

-	/*
-	 * bytes_pinned is kept in line with what is actually pinned, as in
-	 * we've called update_block_group and dropped the bytes_used counter
-	 * and increased the bytes_pinned counter. However this means that
-	 * bytes_pinned does not reflect the bytes that will be pinned once the
-	 * delayed refs are flushed, so this counter is inc'ed every time we
-	 * call btrfs_free_extent so it is a realtime count of what will be
-	 * freed once the transaction is committed. It will be zeroed every
-	 * time the transaction commits.
-	 */
-	struct percpu_counter total_bytes_pinned;
-
	struct list_head list;
	/* Protected by the spinlock 'lock'. */
	struct list_head ro_bgs;
···
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
			     enum btrfs_reserve_flush_enum flush);
-
-static inline void __btrfs_mod_total_bytes_pinned(
-					struct btrfs_space_info *space_info,
-					s64 mod)
-{
-	percpu_counter_add_batch(&space_info->total_bytes_pinned, mod,
-				 BTRFS_TOTAL_BYTES_PINNED_BATCH);
-}
-
-static inline void btrfs_mod_total_bytes_pinned(struct btrfs_fs_info *fs_info,
-						u64 flags, s64 mod)
-{
-	struct btrfs_space_info *space_info = btrfs_find_space_info(fs_info, flags);
-
-	ASSERT(space_info);
-	__btrfs_mod_total_bytes_pinned(space_info, mod);
-}
-
#endif /* BTRFS_SPACE_INFO_H */
+149 -6
fs/btrfs/subpage.c
··· 3 3 #include <linux/slab.h> 4 4 #include "ctree.h" 5 5 #include "subpage.h" 6 + #include "btrfs_inode.h" 6 7 7 8 /* 8 9 * Subpage (sectorsize < PAGE_SIZE) support overview: ··· 111 110 if (!*ret) 112 111 return -ENOMEM; 113 112 spin_lock_init(&(*ret)->lock); 114 - if (type == BTRFS_SUBPAGE_METADATA) 113 + if (type == BTRFS_SUBPAGE_METADATA) { 115 114 atomic_set(&(*ret)->eb_refs, 0); 116 - else 115 + } else { 117 116 atomic_set(&(*ret)->readers, 0); 117 + atomic_set(&(*ret)->writers, 0); 118 + } 118 119 return 0; 119 120 } 120 121 ··· 186 183 { 187 184 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 188 185 const int nbits = len >> fs_info->sectorsize_bits; 189 - int ret; 190 186 191 187 btrfs_subpage_assert(fs_info, page, start, len); 192 188 193 - ret = atomic_add_return(nbits, &subpage->readers); 194 - ASSERT(ret == nbits); 189 + atomic_add(nbits, &subpage->readers); 195 190 } 196 191 197 192 void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, ··· 197 196 { 198 197 struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 199 198 const int nbits = len >> fs_info->sectorsize_bits; 199 + bool is_data; 200 + bool last; 200 201 201 202 btrfs_subpage_assert(fs_info, page, start, len); 203 + is_data = is_data_inode(page->mapping->host); 202 204 ASSERT(atomic_read(&subpage->readers) >= nbits); 203 - if (atomic_sub_and_test(nbits, &subpage->readers)) 205 + last = atomic_sub_and_test(nbits, &subpage->readers); 206 + 207 + /* 208 + * For data we need to unlock the page if the last read has finished. 209 + * 210 + * And please don't replace @last with atomic_sub_and_test() call 211 + * inside if () condition. 212 + * As we want the atomic_sub_and_test() to be always executed. 
+	 */
+	if (is_data && last)
+		unlock_page(page);
+}
+
+static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+{
+	u64 orig_start = *start;
+	u32 orig_len = *len;
+
+	*start = max_t(u64, page_offset(page), orig_start);
+	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
+		     orig_start + orig_len) - *start;
+}
+
+void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
+				struct page *page, u64 start, u32 len)
+{
+	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+	const int nbits = (len >> fs_info->sectorsize_bits);
+	int ret;
+
+	btrfs_subpage_assert(fs_info, page, start, len);
+
+	ASSERT(atomic_read(&subpage->readers) == 0);
+	ret = atomic_add_return(nbits, &subpage->writers);
+	ASSERT(ret == nbits);
+}
+
+bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
+				       struct page *page, u64 start, u32 len)
+{
+	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+	const int nbits = (len >> fs_info->sectorsize_bits);
+
+	btrfs_subpage_assert(fs_info, page, start, len);
+
+	ASSERT(atomic_read(&subpage->writers) >= nbits);
+	return atomic_sub_and_test(nbits, &subpage->writers);
+}
+
+/*
+ * Lock a page for delalloc page writeback.
+ *
+ * Return -EAGAIN if the page is not properly initialized.
+ * Return 0 with the page locked, and writer counter updated.
+ *
+ * Even with 0 returned, the page still need extra check to make sure
+ * it's really the correct page, as the caller is using
+ * find_get_pages_contig(), which can race with page invalidating.
263 + */ 264 + int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, 265 + struct page *page, u64 start, u32 len) 266 + { 267 + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { 268 + lock_page(page); 269 + return 0; 270 + } 271 + lock_page(page); 272 + if (!PagePrivate(page) || !page->private) { 273 + unlock_page(page); 274 + return -EAGAIN; 275 + } 276 + btrfs_subpage_clamp_range(page, &start, &len); 277 + btrfs_subpage_start_writer(fs_info, page, start, len); 278 + return 0; 279 + } 280 + 281 + void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, 282 + struct page *page, u64 start, u32 len) 283 + { 284 + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) 285 + return unlock_page(page); 286 + btrfs_subpage_clamp_range(page, &start, &len); 287 + if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) 204 288 unlock_page(page); 205 289 } 206 290 ··· 440 354 spin_unlock_irqrestore(&subpage->lock, flags); 441 355 } 442 356 357 + void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, 358 + struct page *page, u64 start, u32 len) 359 + { 360 + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 361 + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 362 + unsigned long flags; 363 + 364 + spin_lock_irqsave(&subpage->lock, flags); 365 + subpage->ordered_bitmap |= tmp; 366 + SetPageOrdered(page); 367 + spin_unlock_irqrestore(&subpage->lock, flags); 368 + } 369 + 370 + void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, 371 + struct page *page, u64 start, u32 len) 372 + { 373 + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; 374 + const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); 375 + unsigned long flags; 376 + 377 + spin_lock_irqsave(&subpage->lock, flags); 378 + subpage->ordered_bitmap &= ~tmp; 379 + if (subpage->ordered_bitmap == 0) 380 + ClearPageOrdered(page); 381 + 
spin_unlock_irqrestore(&subpage->lock, flags); 382 + } 443 383 /* 444 384 * Unlike set/clear which is dependent on each page status, for test all bits 445 385 * are tested in the same way. ··· 488 376 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); 489 377 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); 490 378 IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); 379 + IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); 491 380 492 381 /* 493 382 * Note that, in selftests (extent-io-tests), we can have empty fs_info passed ··· 521 408 if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ 522 409 return test_page_func(page); \ 523 410 return btrfs_subpage_test_##name(fs_info, page, start, len); \ 411 + } \ 412 + void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ 413 + struct page *page, u64 start, u32 len) \ 414 + { \ 415 + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 416 + set_page_func(page); \ 417 + return; \ 418 + } \ 419 + btrfs_subpage_clamp_range(page, &start, &len); \ 420 + btrfs_subpage_set_##name(fs_info, page, start, len); \ 421 + } \ 422 + void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ 423 + struct page *page, u64 start, u32 len) \ 424 + { \ 425 + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ 426 + clear_page_func(page); \ 427 + return; \ 428 + } \ 429 + btrfs_subpage_clamp_range(page, &start, &len); \ 430 + btrfs_subpage_clear_##name(fs_info, page, start, len); \ 431 + } \ 432 + bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ 433 + struct page *page, u64 start, u32 len) \ 434 + { \ 435 + if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ 436 + return test_page_func(page); \ 437 + btrfs_subpage_clamp_range(page, &start, &len); \ 438 + return btrfs_subpage_test_##name(fs_info, page, start, len); \ 524 439 } 525 440 IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, 526 441 PageUptodate); ··· 557 416 PageDirty); 558 417 
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, 559 418 PageWriteback); 419 + IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, 420 + PageOrdered);
+32 -1
fs/btrfs/subpage.h
··· 22 22 u16 error_bitmap; 23 23 u16 dirty_bitmap; 24 24 u16 writeback_bitmap; 25 + /* 26 + * Both data and metadata need to track how many readers there are 27 + * for the page. 28 + * Data relies on @readers to unlock the page when the last reader has 29 + * finished. While metadata doesn't need page unlock, it needs to prevent 30 + * page::private from being cleared before the last end_page_read(). 31 + */ 32 + atomic_t readers; 25 33 union { 26 34 /* 27 35 * Structures only used by metadata ··· 40 32 atomic_t eb_refs; 41 33 /* Structures only used by data */ 42 34 struct { 43 - atomic_t readers; 35 + atomic_t writers; 36 + 37 + /* Track pending ordered extents in this sector */ 38 + u16 ordered_bitmap; 44 39 }; 45 40 }; 46 41 }; ··· 74 63 void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, 75 64 struct page *page, u64 start, u32 len); 76 65 66 + void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, 67 + struct page *page, u64 start, u32 len); 68 + bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, 69 + struct page *page, u64 start, u32 len); 70 + int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, 71 + struct page *page, u64 start, u32 len); 72 + void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, 73 + struct page *page, u64 start, u32 len); 74 + 77 75 /* 78 76 * Template for subpage related operations. 79 77 * ··· 92 72 * btrfs_page_*() are for call sites where the page can either be subpage 93 73 * specific or regular page. The function will handle both cases. 94 74 * But the range still needs to be inside the page. 75 * 76 + * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't 77 + * need to be inside the page. Those functions will truncate the range 78 + * automatically. 
95 79 */ 96 80 #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ 97 81 void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ ··· 109 85 void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ 110 86 struct page *page, u64 start, u32 len); \ 111 87 bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ 88 + struct page *page, u64 start, u32 len); \ 89 + void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ 90 + struct page *page, u64 start, u32 len); \ 91 + void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ 92 + struct page *page, u64 start, u32 len); \ 93 + bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ 112 94 struct page *page, u64 start, u32 len); 113 95 114 96 DECLARE_BTRFS_SUBPAGE_OPS(uptodate); 115 97 DECLARE_BTRFS_SUBPAGE_OPS(error); 116 98 DECLARE_BTRFS_SUBPAGE_OPS(dirty); 117 99 DECLARE_BTRFS_SUBPAGE_OPS(writeback); 100 + DECLARE_BTRFS_SUBPAGE_OPS(ordered); 118 101 119 102 bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, 120 103 struct page *page, u64 start, u32 len);
+2 -14
fs/btrfs/super.c
··· 299 299 struct btrfs_fs_info *fs_info = trans->fs_info; 300 300 301 301 WRITE_ONCE(trans->aborted, errno); 302 - /* Nothing used. The other threads that have joined this 303 - * transaction may be able to continue. */ 304 - if (!trans->dirty && list_empty(&trans->new_bgs)) { 305 - const char *errstr; 306 - 307 - errstr = btrfs_decode_error(errno); 308 - btrfs_warn(fs_info, 309 - "%s:%d: Aborting unused transaction(%s).", 310 - function, line, errstr); 311 - return; 312 - } 313 302 WRITE_ONCE(trans->transaction->aborted, errno); 314 303 /* Wake up anybody who may be waiting on this transaction */ 315 304 wake_up(&fs_info->transaction_wait); ··· 934 945 case Opt_check_integrity_including_extent_data: 935 946 btrfs_info(info, 936 947 "enabling check integrity including extent data"); 937 - btrfs_set_opt(info->mount_opt, 938 - CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); 948 + btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); 939 949 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); 940 950 break; 941 951 case Opt_check_integrity: ··· 1515 1527 if (btrfs_test_opt(info, SKIP_BALANCE)) 1516 1528 seq_puts(seq, ",skip_balance"); 1517 1529 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 1518 - if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) 1530 + if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA)) 1519 1531 seq_puts(seq, ",check_int_data"); 1520 1532 else if (btrfs_test_opt(info, CHECK_INTEGRITY)) 1521 1533 seq_puts(seq, ",check_int");
+59 -15
fs/btrfs/sysfs.c
··· 429 429 { 430 430 struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); 431 431 432 - return scnprintf(buf, PAGE_SIZE, "%lld\n", 432 + return scnprintf(buf, PAGE_SIZE, "%llu\n", 433 433 fs_info->discard_ctl.discard_bitmap_bytes); 434 434 } 435 435 BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); ··· 451 451 { 452 452 struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); 453 453 454 - return scnprintf(buf, PAGE_SIZE, "%lld\n", 454 + return scnprintf(buf, PAGE_SIZE, "%llu\n", 455 455 fs_info->discard_ctl.discard_extent_bytes); 456 456 } 457 457 BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); ··· 665 665 } \ 666 666 BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) 667 667 668 - static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, 669 - struct kobj_attribute *a, 670 - char *buf) 671 - { 672 - struct btrfs_space_info *sinfo = to_space_info(kobj); 673 - s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned); 674 - return scnprintf(buf, PAGE_SIZE, "%lld\n", val); 675 - } 676 - 677 668 SPACE_INFO_ATTR(flags); 678 669 SPACE_INFO_ATTR(total_bytes); 679 670 SPACE_INFO_ATTR(bytes_used); ··· 675 684 SPACE_INFO_ATTR(bytes_zone_unusable); 676 685 SPACE_INFO_ATTR(disk_used); 677 686 SPACE_INFO_ATTR(disk_total); 678 - BTRFS_ATTR(space_info, total_bytes_pinned, 679 - btrfs_space_info_show_total_bytes_pinned); 680 687 681 688 static struct attribute *space_info_attrs[] = { 682 689 BTRFS_ATTR_PTR(space_info, flags), ··· 687 698 BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), 688 699 BTRFS_ATTR_PTR(space_info, disk_used), 689 700 BTRFS_ATTR_PTR(space_info, disk_total), 690 - BTRFS_ATTR_PTR(space_info, total_bytes_pinned), 691 701 NULL, 692 702 }; 693 703 ATTRIBUTE_GROUPS(space_info); ··· 694 706 static void space_info_release(struct kobject *kobj) 695 707 { 696 708 struct btrfs_space_info *sinfo = to_space_info(kobj); 697 - percpu_counter_destroy(&sinfo->total_bytes_pinned); 698 
709 kfree(sinfo); 699 710 } 700 711 ··· 1442 1455 } 1443 1456 BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); 1444 1457 1458 + static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, 1459 + struct kobj_attribute *a, 1460 + char *buf) 1461 + { 1462 + struct btrfs_device *device = container_of(kobj, struct btrfs_device, 1463 + devid_kobj); 1464 + 1465 + return scnprintf(buf, PAGE_SIZE, "%llu\n", 1466 + READ_ONCE(device->scrub_speed_max)); 1467 + } 1468 + 1469 + static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, 1470 + struct kobj_attribute *a, 1471 + const char *buf, size_t len) 1472 + { 1473 + struct btrfs_device *device = container_of(kobj, struct btrfs_device, 1474 + devid_kobj); 1475 + char *endptr; 1476 + unsigned long long limit; 1477 + 1478 + limit = memparse(buf, &endptr); 1479 + WRITE_ONCE(device->scrub_speed_max, limit); 1480 + return len; 1481 + } 1482 + BTRFS_ATTR_RW(devid, scrub_speed_max, btrfs_devinfo_scrub_speed_max_show, 1483 + btrfs_devinfo_scrub_speed_max_store); 1484 + 1445 1485 static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, 1446 1486 struct kobj_attribute *a, char *buf) 1447 1487 { ··· 1482 1468 } 1483 1469 BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); 1484 1470 1471 + static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, 1472 + struct kobj_attribute *a, char *buf) 1473 + { 1474 + struct btrfs_device *device = container_of(kobj, struct btrfs_device, 1475 + devid_kobj); 1476 + 1477 + if (!device->dev_stats_valid) 1478 + return scnprintf(buf, PAGE_SIZE, "invalid\n"); 1479 + 1480 + /* 1481 + * Print all at once so we get a snapshot of all values from the same 1482 + * time. Keep them in sync and in order of definition of 1483 + * btrfs_dev_stat_values. 
1484 + */ 1485 + return scnprintf(buf, PAGE_SIZE, 1486 + "write_errs %d\n" 1487 + "read_errs %d\n" 1488 + "flush_errs %d\n" 1489 + "corruption_errs %d\n" 1490 + "generation_errs %d\n", 1491 + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS), 1492 + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_READ_ERRS), 1493 + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_FLUSH_ERRS), 1494 + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_CORRUPTION_ERRS), 1495 + btrfs_dev_stat_read(device, BTRFS_DEV_STAT_GENERATION_ERRS)); 1496 + } 1497 + BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); 1498 + 1485 1499 static struct attribute *devid_attrs[] = { 1500 + BTRFS_ATTR_PTR(devid, error_stats), 1486 1501 BTRFS_ATTR_PTR(devid, in_fs_metadata), 1487 1502 BTRFS_ATTR_PTR(devid, missing), 1488 1503 BTRFS_ATTR_PTR(devid, replace_target), 1504 + BTRFS_ATTR_PTR(devid, scrub_speed_max), 1489 1505 BTRFS_ATTR_PTR(devid, writeable), 1490 1506 NULL 1491 1507 };
+1 -1
fs/btrfs/tests/extent-map-tests.c
··· 557 557 { 558 558 /* 559 559 * Test a chunk with 2 data stripes one of which 560 - * interesects the physical address of the super block 560 + * intersects the physical address of the super block 561 561 * is correctly recognised. 562 562 */ 563 563 .raid_type = BTRFS_BLOCK_GROUP_RAID1,
+13 -48
fs/btrfs/transaction.c
··· 583 583 bool do_chunk_alloc = false; 584 584 int ret; 585 585 586 - /* Send isn't supposed to start transactions. */ 587 - ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); 588 - 589 586 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 590 587 return ERR_PTR(-EROFS); 591 588 ··· 1403 1406 1404 1407 while (1) { 1405 1408 trans = btrfs_start_transaction(root, 0); 1406 - if (IS_ERR(trans)) 1407 - return PTR_ERR(trans); 1409 + if (IS_ERR(trans)) { 1410 + ret = PTR_ERR(trans); 1411 + break; 1412 + } 1408 1413 1409 1414 ret = btrfs_defrag_leaves(trans, root); 1410 1415 ··· 1475 1476 ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); 1476 1477 if (ret) { 1477 1478 btrfs_abort_transaction(trans, ret); 1478 - goto out; 1479 + return ret; 1479 1480 } 1480 1481 1481 1482 /* ··· 1868 1869 } 1869 1870 1870 1871 /* 1871 - * wait for the current transaction commit to start and block subsequent 1872 - * transaction joins 1873 - */ 1874 - static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info, 1875 - struct btrfs_transaction *trans) 1876 - { 1877 - wait_event(fs_info->transaction_blocked_wait, 1878 - trans->state >= TRANS_STATE_COMMIT_START || 1879 - TRANS_ABORTED(trans)); 1880 - } 1881 - 1882 - /* 1883 - * wait for the current transaction to start and then become unblocked. 1884 - * caller holds ref. 1885 - */ 1886 - static void wait_current_trans_commit_start_and_unblock( 1887 - struct btrfs_fs_info *fs_info, 1888 - struct btrfs_transaction *trans) 1889 - { 1890 - wait_event(fs_info->transaction_wait, 1891 - trans->state >= TRANS_STATE_UNBLOCKED || 1892 - TRANS_ABORTED(trans)); 1893 - } 1894 - 1895 - /* 1896 1872 * commit transactions asynchronously. once btrfs_commit_transaction_async 1897 1873 * returns, any subsequent transaction will not be allowed to join. 
1898 1874 */ ··· 1894 1920 kfree(ac); 1895 1921 } 1896 1922 1897 - int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, 1898 - int wait_for_unblock) 1923 + int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) 1899 1924 { 1900 1925 struct btrfs_fs_info *fs_info = trans->fs_info; 1901 1926 struct btrfs_async_commit *ac; ··· 1926 1953 __sb_writers_release(fs_info->sb, SB_FREEZE_FS); 1927 1954 1928 1955 schedule_work(&ac->work); 1929 - 1930 - /* wait for transaction to start and unblock */ 1931 - if (wait_for_unblock) 1932 - wait_current_trans_commit_start_and_unblock(fs_info, cur_trans); 1933 - else 1934 - wait_current_trans_commit_start(fs_info, cur_trans); 1935 - 1956 + /* 1957 + * Wait for the current transaction commit to start and block 1958 + * subsequent transaction joins 1959 + */ 1960 + wait_event(fs_info->transaction_blocked_wait, 1961 + cur_trans->state >= TRANS_STATE_COMMIT_START || 1962 + TRANS_ABORTED(cur_trans)); 1936 1963 if (current->journal_info == trans) 1937 1964 current->journal_info = NULL; 1938 1965 ··· 2046 2073 int ret; 2047 2074 2048 2075 ASSERT(refcount_read(&trans->use_count) == 1); 2049 - 2050 - /* 2051 - * Some places just start a transaction to commit it. We need to make 2052 - * sure that if this commit fails that the abort code actually marks the 2053 - * transaction as failed, so set trans->dirty to make the abort code do 2054 - * the right thing. 2055 - */ 2056 - trans->dirty = true; 2057 2076 2058 2077 /* Stop the commit early if ->aborted is set */ 2059 2078 if (TRANS_ABORTED(cur_trans)) {
+1 -5
fs/btrfs/transaction.h
··· 122 122 123 123 #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) 124 124 125 - #define BTRFS_SEND_TRANS_STUB ((void *)1) 126 - 127 125 struct btrfs_trans_handle { 128 126 u64 transid; 129 127 u64 bytes_reserved; ··· 141 143 bool allocating_chunk; 142 144 bool can_flush_pending_bgs; 143 145 bool reloc_reserved; 144 - bool dirty; 145 146 bool in_fsync; 146 147 struct btrfs_root *root; 147 148 struct btrfs_fs_info *fs_info; ··· 224 227 int btrfs_defrag_root(struct btrfs_root *root); 225 228 int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); 226 229 int btrfs_commit_transaction(struct btrfs_trans_handle *trans); 227 - int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, 228 - int wait_for_unblock); 230 + int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans); 229 231 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); 230 232 bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans); 231 233 void btrfs_throttle(struct btrfs_fs_info *fs_info);
+19 -7
fs/btrfs/tree-log.c
··· 4468 4468 ret = btrfs_truncate_inode_items(trans, 4469 4469 root->log_root, 4470 4470 inode, truncate_offset, 4471 - BTRFS_EXTENT_DATA_KEY); 4471 + BTRFS_EXTENT_DATA_KEY, 4472 + NULL); 4472 4473 } while (ret == -EAGAIN); 4473 4474 if (ret) 4474 4475 goto out; ··· 5417 5416 &inode->runtime_flags); 5418 5417 while(1) { 5419 5418 ret = btrfs_truncate_inode_items(trans, 5420 - log, inode, 0, 0); 5419 + log, inode, 0, 0, NULL); 5421 5420 if (ret != -EAGAIN) 5422 5421 break; 5423 5422 } ··· 5467 5466 btrfs_release_path(dst_path); 5468 5467 if (need_log_inode_item) { 5469 5468 err = log_inode_item(trans, log, dst_path, inode); 5470 - if (!err && !xattrs_logged) { 5471 - err = btrfs_log_all_xattrs(trans, root, inode, path, 5472 - dst_path); 5473 - btrfs_release_path(path); 5474 - } 5475 5469 if (err) 5476 5470 goto out_unlock; 5471 + /* 5472 + * If we are doing a fast fsync and the inode was logged before 5473 + * in this transaction, we don't need to log the xattrs because 5474 + * they were logged before. If xattrs were added, changed or 5475 + * deleted since the last time we logged the inode, then we have 5476 + * already logged them because the inode had the runtime flag 5477 + * BTRFS_INODE_COPY_EVERYTHING set. 5478 + */ 5479 + if (!xattrs_logged && inode->logged_trans < trans->transid) { 5480 + err = btrfs_log_all_xattrs(trans, root, inode, path, 5481 + dst_path); 5482 + if (err) 5483 + goto out_unlock; 5484 + btrfs_release_path(path); 5485 + } 5477 5486 } 5478 5487 if (fast_search) { 5479 5488 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, ··· 6382 6371 error: 6383 6372 if (wc.trans) 6384 6373 btrfs_end_transaction(wc.trans); 6374 + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); 6385 6375 btrfs_free_path(path); 6386 6376 return ret; 6387 6377 }
+7 -17
fs/btrfs/volumes.c
··· 717 717 718 718 /* 719 719 * Handles the case where scanned device is part of an fs that had 720 - * multiple successful changes of FSID but curently device didn't 720 + * multiple successful changes of FSID but currently device didn't 721 721 * observe it. Meaning our fsid will be different than theirs. We need 722 722 * to handle two subcases : 723 723 * 1 - The fs still continues to have different METADATA/FSID uuids. ··· 1550 1550 * check to ensure dev extents are not double allocated. 1551 1551 * This makes the function safe to allocate dev extents but may not report 1552 1552 * correct usable device space, as device extent freed in current transaction 1553 - * is not reported as avaiable. 1553 + * is not reported as available. 1554 1554 */ 1555 1555 static int find_free_dev_extent_start(struct btrfs_device *device, 1556 1556 u64 num_bytes, u64 search_start, u64 *start, ··· 4217 4217 btrfs_bg_type_to_raid_name(data_target)); 4218 4218 } 4219 4219 4220 - if (fs_info->send_in_progress) { 4221 - btrfs_warn_rl(fs_info, 4222 - "cannot run balance while send operations are in progress (%d in progress)", 4223 - fs_info->send_in_progress); 4224 - ret = -EAGAIN; 4225 - goto out; 4226 - } 4227 - 4228 4220 ret = insert_balance_item(fs_info, bctl); 4229 4221 if (ret && ret != -EEXIST) 4230 4222 goto out; ··· 6119 6127 * @em: mapping containing the logical extent 6120 6128 * @op: type of operation - write or read 6121 6129 * @logical: address that we want to figure out the geometry of 6122 - * @len: the length of IO we are going to perform, starting at @logical 6123 6130 * @io_geom: pointer used to return values 6124 6131 * 6125 6132 * Returns < 0 in case a chunk for the given logical address cannot be found, 6126 6133 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 
6127 6134 */ 6128 6135 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 6129 - enum btrfs_map_op op, u64 logical, u64 len, 6136 + enum btrfs_map_op op, u64 logical, 6130 6137 struct btrfs_io_geometry *io_geom) 6131 6138 { 6132 6139 struct map_lookup *map; 6140 + u64 len; 6133 6141 u64 offset; 6134 6142 u64 stripe_offset; 6135 6143 u64 stripe_nr; ··· 6144 6152 offset = logical - em->start; 6145 6153 /* Len of a stripe in a chunk */ 6146 6154 stripe_len = map->stripe_len; 6147 - /* Stripe wher this block falls in */ 6155 + /* Stripe where this block falls in */ 6148 6156 stripe_nr = div64_u64(offset, stripe_len); 6149 6157 /* Offset of stripe in the chunk */ 6150 6158 stripe_offset = stripe_nr * stripe_len; ··· 6235 6243 em = btrfs_get_chunk_map(fs_info, logical, *length); 6236 6244 ASSERT(!IS_ERR(em)); 6237 6245 6238 - ret = btrfs_get_io_geometry(fs_info, em, op, logical, *length, &geom); 6246 + ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6239 6247 if (ret < 0) 6240 6248 return ret; 6241 6249 ··· 6662 6670 * 6663 6671 * If devid and uuid are both specified, the match must be exact, otherwise 6664 6672 * only devid is used. 6665 - * 6666 - * If @seed is true, traverse through the seed devices. 6667 6673 */ 6668 6674 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, 6669 6675 u64 devid, u8 *uuid, u8 *fsid) ··· 7855 7865 ret = -EUCLEAN; 7856 7866 } 7857 7867 7858 - /* Make sure no dev extent is beyond device bondary */ 7868 + /* Make sure no dev extent is beyond device boundary */ 7859 7869 dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); 7860 7870 if (!dev) { 7861 7871 btrfs_err(fs_info, "failed to find devid %llu", devid);
+4 -1
fs/btrfs/volumes.h
··· 143 143 struct completion kobj_unregister; 144 144 /* For sysfs/FSID/devinfo/devid/ */ 145 145 struct kobject devid_kobj; 146 + 147 + /* Bandwidth limit for scrub, in bytes */ 148 + u64 scrub_speed_max; 146 149 }; 147 150 148 151 /* ··· 446 443 u64 logical, u64 *length, 447 444 struct btrfs_bio **bbio_ret); 448 445 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, 449 - enum btrfs_map_op op, u64 logical, u64 len, 446 + enum btrfs_map_op op, u64 logical, 450 447 struct btrfs_io_geometry *io_geom); 451 448 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); 452 449 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
+41 -2
fs/btrfs/zoned.c
··· 81 81 * *: Special case, no superblock is written 82 82 * 0: Use write pointer of zones[0] 83 83 * 1: Use write pointer of zones[1] 84 - * C: Compare super blcoks from zones[0] and zones[1], use the latest 84 + * C: Compare super blocks from zones[0] and zones[1], use the latest 85 85 * one determined by generation 86 86 * x: Invalid state 87 87 */ ··· 433 433 } 434 434 435 435 /* 436 - * If zones[0] is conventional, always use the beggining of the 436 + * If zones[0] is conventional, always use the beginning of the 437 437 * zone to record superblock. No need to validate in that case. 438 438 */ 439 439 if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == ··· 1140 1140 } 1141 1141 1142 1142 if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { 1143 + btrfs_err_in_rcu(fs_info, 1144 + "zoned: unexpected conventional zone %llu on device %s (devid %llu)", 1145 + zone.start << SECTOR_SHIFT, 1146 + rcu_str_deref(device->name), device->devid); 1143 1147 ret = -EIO; 1144 1148 goto out; 1145 1149 } ··· 1204 1200 1205 1201 switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 1206 1202 case 0: /* single */ 1203 + if (alloc_offsets[0] == WP_MISSING_DEV) { 1204 + btrfs_err(fs_info, 1205 + "zoned: cannot recover write pointer for zone %llu", 1206 + physical); 1207 + ret = -EIO; 1208 + goto out; 1209 + } 1207 1210 cache->alloc_offset = alloc_offsets[0]; 1208 1211 break; 1209 1212 case BTRFS_BLOCK_GROUP_DUP: ··· 1228 1217 } 1229 1218 1230 1219 out: 1220 + if (cache->alloc_offset > fs_info->zone_size) { 1221 + btrfs_err(fs_info, 1222 + "zoned: invalid write pointer %llu in block group %llu", 1223 + cache->alloc_offset, cache->start); 1224 + ret = -EIO; 1225 + } 1226 + 1231 1227 /* An extent is allocated after the write pointer */ 1232 1228 if (!ret && num_conventional && last_alloc > cache->alloc_offset) { 1233 1229 btrfs_err(fs_info, ··· 1532 1514 1533 1515 length = wp - physical_pos; 1534 1516 return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); 1517 + } 1518 + 
1519 + struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, 1520 + u64 logical, u64 length) 1521 + { 1522 + struct btrfs_device *device; 1523 + struct extent_map *em; 1524 + struct map_lookup *map; 1525 + 1526 + em = btrfs_get_chunk_map(fs_info, logical, length); 1527 + if (IS_ERR(em)) 1528 + return ERR_CAST(em); 1529 + 1530 + map = em->map_lookup; 1531 + /* We only support single profile for now */ 1532 + ASSERT(map->num_stripes == 1); 1533 + device = map->stripes[0].dev; 1534 + 1535 + free_extent_map(em); 1536 + 1537 + return device; 1535 1538 }
+9
fs/btrfs/zoned.h
··· 65 65 int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); 66 66 int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, 67 67 u64 physical_start, u64 physical_pos); 68 + struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, 69 + u64 logical, u64 length); 68 70 #else /* CONFIG_BLK_DEV_ZONED */ 69 71 static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, 70 72 struct blk_zone *zone) ··· 191 189 u64 physical_pos) 192 190 { 193 191 return -EOPNOTSUPP; 192 + } 193 + 194 + static inline struct btrfs_device *btrfs_zoned_get_device( 195 + struct btrfs_fs_info *fs_info, 196 + u64 logical, u64 length) 197 + { 198 + return ERR_PTR(-EOPNOTSUPP); 194 199 } 195 200 196 201 #endif
+9 -14
include/trace/events/btrfs.h
··· 99 99 EM( ALLOC_CHUNK, "ALLOC_CHUNK") \ 100 100 EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \ 101 101 EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \ 102 - EM( COMMIT_TRANS, "COMMIT_TRANS") \ 103 - EMe(FORCE_COMMIT_TRANS, "FORCE_COMMIT_TRANS") 102 + EMe(COMMIT_TRANS, "COMMIT_TRANS") 104 103 105 104 /* 106 105 * First define the enums in the above macros to be exported to userspace via ··· 653 654 654 655 TRACE_EVENT(btrfs_writepage_end_io_hook, 655 656 656 - TP_PROTO(const struct page *page, u64 start, u64 end, int uptodate), 657 + TP_PROTO(const struct btrfs_inode *inode, u64 start, u64 end, 658 + int uptodate), 657 659 658 - TP_ARGS(page, start, end, uptodate), 660 + TP_ARGS(inode, start, end, uptodate), 659 661 660 662 TP_STRUCT__entry_btrfs( 661 663 __field( u64, ino ) 662 - __field( unsigned long, index ) 663 664 __field( u64, start ) 664 665 __field( u64, end ) 665 666 __field( int, uptodate ) 666 667 __field( u64, root_objectid ) 667 668 ), 668 669 669 - TP_fast_assign_btrfs(btrfs_sb(page->mapping->host->i_sb), 670 - __entry->ino = btrfs_ino(BTRFS_I(page->mapping->host)); 671 - __entry->index = page->index; 670 + TP_fast_assign_btrfs(inode->root->fs_info, 671 + __entry->ino = btrfs_ino(inode); 672 672 __entry->start = start; 673 673 __entry->end = end; 674 674 __entry->uptodate = uptodate; 675 - __entry->root_objectid = 676 - BTRFS_I(page->mapping->host)->root->root_key.objectid; 675 + __entry->root_objectid = inode->root->root_key.objectid; 677 676 ), 678 677 679 - TP_printk_btrfs("root=%llu(%s) ino=%llu page_index=%lu start=%llu " 680 - "end=%llu uptodate=%d", 678 + TP_printk_btrfs("root=%llu(%s) ino=%llu start=%llu end=%llu uptodate=%d", 681 679 show_root_type(__entry->root_objectid), 682 - __entry->ino, __entry->index, 683 - __entry->start, 680 + __entry->ino, __entry->start, 684 681 __entry->end, __entry->uptodate) 685 682 ); 686 683
+2 -2
include/uapi/linux/btrfs.h
··· 154 154 __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ 155 155 __u64 read_errors; /* # of read errors encountered (EIO) */ 156 156 __u64 csum_errors; /* # of failed csum checks */ 157 - __u64 verify_errors; /* # of occurences, where the metadata 157 + __u64 verify_errors; /* # of occurrences, where the metadata 158 158 * of a tree block did not match the 159 159 * expected values, like generation or 160 160 * logical */ ··· 174 174 __u64 last_physical; /* last physical address scrubbed. In 175 175 * case a scrub was aborted, this can 176 176 * be used to restart the scrub */ 177 - __u64 unverified_errors; /* # of occurences where a read for a 177 + __u64 unverified_errors; /* # of occurrences where a read for a 178 178 * full (64k) bio failed, but the re- 179 179 * check succeeded for each 4k piece. 180 180 * Intermittent error. */
+2 -2
include/uapi/linux/btrfs_tree.h
··· 59 59 /* for storing balance parameters in the root tree */ 60 60 #define BTRFS_BALANCE_OBJECTID -4ULL 61 61 62 - /* orhpan objectid for tracking unlinked/truncated files */ 62 + /* orphan objectid for tracking unlinked/truncated files */ 63 63 #define BTRFS_ORPHAN_OBJECTID -5ULL 64 64 65 65 /* does write ahead logging to speed up fsyncs */ ··· 275 275 #define BTRFS_PERSISTENT_ITEM_KEY 249 276 276 277 277 /* 278 - * Persistantly stores the device replace state in the device tree. 278 + * Persistently stores the device replace state in the device tree. 279 279 * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). 280 280 */ 281 281 #define BTRFS_DEV_REPLACE_KEY 250