Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: RAID5 and RAID6

This builds on David Woodhouse's original Btrfs raid5/6 implementation.
The code has changed quite a bit, blame Chris Mason for any bugs.

Read/modify/write is done after the higher levels of the filesystem have
prepared a given bio. This means the higher layers are not responsible
for building full stripes, and they don't need to query for the topology
of the extents that may get allocated during delayed allocation runs.
It also means different files can easily share the same stripe.

But, it does expose us to incorrect parity if we crash or lose power
while doing a read/modify/write cycle. This will be addressed in a
later commit.

Scrub is unable to repair crc errors on raid5/6 chunks.

Discard does not work on raid5/6 (yet).

The stripe size is fixed at 64KiB per disk. This will be tunable
in a later commit.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>

Authored by David Woodhouse and committed by Chris Mason.
53b381b3 64a16701

+2283 -102
+2
fs/btrfs/Kconfig
··· 6 6 select ZLIB_DEFLATE 7 7 select LZO_COMPRESS 8 8 select LZO_DECOMPRESS 9 + select RAID6_PQ 10 + 9 11 help 10 12 Btrfs is a new filesystem with extents, writable snapshotting, 11 13 support for multiple devices and many more features.
+1 -1
fs/btrfs/Makefile
··· 8 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 10 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 11 - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o 11 + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o 12 12 13 13 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 14 14 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+34 -1
fs/btrfs/ctree.h
··· 502 502 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 503 503 504 504 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) 505 + #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) 505 506 506 507 #define BTRFS_FEATURE_COMPAT_SUPP 0ULL 507 508 #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL ··· 512 511 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 513 512 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 514 513 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ 514 + BTRFS_FEATURE_INCOMPAT_RAID56 | \ 515 515 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) 516 516 517 517 /* ··· 954 952 #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) 955 953 #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) 956 954 #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 955 + #define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) 956 + #define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) 957 957 #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 958 - #define BTRFS_NR_RAID_TYPES 5 958 + #define BTRFS_NR_RAID_TYPES 7 959 959 960 960 #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ 961 961 BTRFS_BLOCK_GROUP_SYSTEM | \ ··· 965 961 966 962 #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 967 963 BTRFS_BLOCK_GROUP_RAID1 | \ 964 + BTRFS_BLOCK_GROUP_RAID5 | \ 965 + BTRFS_BLOCK_GROUP_RAID6 | \ 968 966 BTRFS_BLOCK_GROUP_DUP | \ 969 967 BTRFS_BLOCK_GROUP_RAID10) 970 968 /* ··· 1191 1185 u64 flags; 1192 1186 u64 sectorsize; 1193 1187 u64 cache_generation; 1188 + 1189 + /* for raid56, this is a full stripe, without parity */ 1190 + unsigned long full_stripe_len; 1191 + 1194 1192 unsigned int ro:1; 1195 1193 unsigned int dirty:1; 1196 1194 unsigned int iref:1; ··· 1234 1224 struct list_head list; 1235 1225 u64 seq; 1236 1226 }; 1227 + 1228 + /* used by the raid56 code to lock stripes for read/modify/write */ 1229 + struct btrfs_stripe_hash { 1230 + struct list_head hash_list; 1231 + wait_queue_head_t wait; 1232 + spinlock_t lock; 1233 + }; 1234 + 1235 + /* used by the raid56 code to lock stripes for read/modify/write */ 
1236 + struct btrfs_stripe_hash_table { 1237 + struct btrfs_stripe_hash *table; 1238 + }; 1239 + 1240 + #define BTRFS_STRIPE_HASH_TABLE_BITS 11 1237 1241 1238 1242 /* fs_info */ 1239 1243 struct reloc_control; ··· 1331 1307 struct mutex cleaner_mutex; 1332 1308 struct mutex chunk_mutex; 1333 1309 struct mutex volume_mutex; 1310 + 1311 + /* this is used during read/modify/write to make sure 1312 + * no two ios are trying to mod the same stripe at the same 1313 + * time 1314 + */ 1315 + struct btrfs_stripe_hash_table *stripe_hash_table; 1316 + 1334 1317 /* 1335 1318 * this protects the ordered operations list only while we are 1336 1319 * processing all of the entries on it. This way we make ··· 1426 1395 struct btrfs_workers flush_workers; 1427 1396 struct btrfs_workers endio_workers; 1428 1397 struct btrfs_workers endio_meta_workers; 1398 + struct btrfs_workers endio_raid56_workers; 1399 + struct btrfs_workers rmw_workers; 1429 1400 struct btrfs_workers endio_meta_write_workers; 1430 1401 struct btrfs_workers endio_write_workers; 1431 1402 struct btrfs_workers endio_freespace_worker;
+53 -9
fs/btrfs/disk-io.c
··· 46 46 #include "check-integrity.h" 47 47 #include "rcu-string.h" 48 48 #include "dev-replace.h" 49 + #include "raid56.h" 49 50 50 51 #ifdef CONFIG_X86 51 52 #include <asm/cpufeature.h> ··· 640 639 btree_readahead_hook(root, eb, eb->start, ret); 641 640 } 642 641 643 - if (ret) 642 + if (ret) { 643 + /* 644 + * our io error hook is going to dec the io pages 645 + * again, we have to make sure it has something 646 + * to decrement 647 + */ 648 + atomic_inc(&eb->io_pages); 644 649 clear_extent_buffer_uptodate(eb); 650 + } 645 651 free_extent_buffer(eb); 646 652 out: 647 653 return ret; ··· 662 654 eb = (struct extent_buffer *)page->private; 663 655 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 664 656 eb->read_mirror = failed_mirror; 657 + atomic_dec(&eb->io_pages); 665 658 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 666 659 btree_readahead_hook(root, eb, eb->start, -EIO); 667 660 return -EIO; /* we fixed nothing */ ··· 679 670 end_io_wq->work.flags = 0; 680 671 681 672 if (bio->bi_rw & REQ_WRITE) { 682 - if (end_io_wq->metadata == 1) 673 + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 683 674 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 684 675 &end_io_wq->work); 685 - else if (end_io_wq->metadata == 2) 676 + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 686 677 btrfs_queue_worker(&fs_info->endio_freespace_worker, 678 + &end_io_wq->work); 679 + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 680 + btrfs_queue_worker(&fs_info->endio_raid56_workers, 687 681 &end_io_wq->work); 688 682 else 689 683 btrfs_queue_worker(&fs_info->endio_write_workers, 690 684 &end_io_wq->work); 691 685 } else { 692 - if (end_io_wq->metadata) 686 + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 687 + btrfs_queue_worker(&fs_info->endio_raid56_workers, 688 + &end_io_wq->work); 689 + else if (end_io_wq->metadata) 693 690 btrfs_queue_worker(&fs_info->endio_meta_workers, 694 691 &end_io_wq->work); 695 692 else ··· 710 695 * 0 - if data 711 
696 * 1 - if normal metadta 712 697 * 2 - if writing to the free space cache area 698 + * 3 - raid parity work 713 699 */ 714 700 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 715 701 int metadata) ··· 2181 2165 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2182 2166 init_waitqueue_head(&fs_info->async_submit_wait); 2183 2167 2168 + ret = btrfs_alloc_stripe_hash_table(fs_info); 2169 + if (ret) { 2170 + err = -ENOMEM; 2171 + goto fail_alloc; 2172 + } 2173 + 2184 2174 __setup_root(4096, 4096, 4096, 4096, tree_root, 2185 2175 fs_info, BTRFS_ROOT_TREE_OBJECTID); 2186 2176 ··· 2354 2332 btrfs_init_workers(&fs_info->endio_meta_write_workers, 2355 2333 "endio-meta-write", fs_info->thread_pool_size, 2356 2334 &fs_info->generic_worker); 2335 + btrfs_init_workers(&fs_info->endio_raid56_workers, 2336 + "endio-raid56", fs_info->thread_pool_size, 2337 + &fs_info->generic_worker); 2338 + btrfs_init_workers(&fs_info->rmw_workers, 2339 + "rmw", fs_info->thread_pool_size, 2340 + &fs_info->generic_worker); 2357 2341 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 2358 2342 fs_info->thread_pool_size, 2359 2343 &fs_info->generic_worker); ··· 2378 2350 */ 2379 2351 fs_info->endio_workers.idle_thresh = 4; 2380 2352 fs_info->endio_meta_workers.idle_thresh = 4; 2353 + fs_info->endio_raid56_workers.idle_thresh = 4; 2354 + fs_info->rmw_workers.idle_thresh = 2; 2381 2355 2382 2356 fs_info->endio_write_workers.idle_thresh = 2; 2383 2357 fs_info->endio_meta_write_workers.idle_thresh = 2; ··· 2396 2366 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2397 2367 ret |= btrfs_start_workers(&fs_info->endio_workers); 2398 2368 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2369 + ret |= btrfs_start_workers(&fs_info->rmw_workers); 2370 + ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); 2399 2371 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2400 2372 ret |= 
btrfs_start_workers(&fs_info->endio_write_workers); 2401 2373 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); ··· 2742 2710 btrfs_stop_workers(&fs_info->workers); 2743 2711 btrfs_stop_workers(&fs_info->endio_workers); 2744 2712 btrfs_stop_workers(&fs_info->endio_meta_workers); 2713 + btrfs_stop_workers(&fs_info->endio_raid56_workers); 2714 + btrfs_stop_workers(&fs_info->rmw_workers); 2745 2715 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2746 2716 btrfs_stop_workers(&fs_info->endio_write_workers); 2747 2717 btrfs_stop_workers(&fs_info->endio_freespace_worker); ··· 2762 2728 fail_srcu: 2763 2729 cleanup_srcu_struct(&fs_info->subvol_srcu); 2764 2730 fail: 2731 + btrfs_free_stripe_hash_table(fs_info); 2765 2732 btrfs_close_devices(fs_info->fs_devices); 2766 2733 return err; 2767 2734 ··· 3111 3076 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) 3112 3077 == 0))) 3113 3078 num_tolerated_disk_barrier_failures = 0; 3114 - else if (num_tolerated_disk_barrier_failures > 1 3115 - && 3116 - (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3117 - BTRFS_BLOCK_GROUP_RAID10))) 3118 - num_tolerated_disk_barrier_failures = 1; 3079 + else if (num_tolerated_disk_barrier_failures > 1) { 3080 + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3081 + BTRFS_BLOCK_GROUP_RAID5 | 3082 + BTRFS_BLOCK_GROUP_RAID10)) { 3083 + num_tolerated_disk_barrier_failures = 1; 3084 + } else if (flags & 3085 + BTRFS_BLOCK_GROUP_RAID5) { 3086 + num_tolerated_disk_barrier_failures = 2; 3087 + } 3088 + } 3119 3089 } 3120 3090 } 3121 3091 up_read(&sinfo->groups_sem); ··· 3424 3384 btrfs_stop_workers(&fs_info->workers); 3425 3385 btrfs_stop_workers(&fs_info->endio_workers); 3426 3386 btrfs_stop_workers(&fs_info->endio_meta_workers); 3387 + btrfs_stop_workers(&fs_info->endio_raid56_workers); 3388 + btrfs_stop_workers(&fs_info->rmw_workers); 3427 3389 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 3428 3390 btrfs_stop_workers(&fs_info->endio_write_workers); 3429 3391 
btrfs_stop_workers(&fs_info->endio_freespace_worker); ··· 3445 3403 3446 3404 bdi_destroy(&fs_info->bdi); 3447 3405 cleanup_srcu_struct(&fs_info->subvol_srcu); 3406 + 3407 + btrfs_free_stripe_hash_table(fs_info); 3448 3408 3449 3409 return 0; 3450 3410 }
+7
fs/btrfs/disk-io.h
··· 25 25 #define BTRFS_SUPER_MIRROR_MAX 3 26 26 #define BTRFS_SUPER_MIRROR_SHIFT 12 27 27 28 + enum { 29 + BTRFS_WQ_ENDIO_DATA = 0, 30 + BTRFS_WQ_ENDIO_METADATA = 1, 31 + BTRFS_WQ_ENDIO_FREE_SPACE = 2, 32 + BTRFS_WQ_ENDIO_RAID56 = 3, 33 + }; 34 + 28 35 static inline u64 btrfs_sb_offset(int mirror) 29 36 { 30 37 u64 start = 16 * 1024;
+59 -29
fs/btrfs/extent-tree.c
··· 31 31 #include "print-tree.h" 32 32 #include "transaction.h" 33 33 #include "volumes.h" 34 + #include "raid56.h" 34 35 #include "locking.h" 35 36 #include "free-space-cache.h" 36 37 #include "math.h" ··· 1853 1852 *actual_bytes = discarded_bytes; 1854 1853 1855 1854 1855 + if (ret == -EOPNOTSUPP) 1856 + ret = 0; 1856 1857 return ret; 1857 1858 } 1858 1859 ··· 3279 3276 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3280 3277 root->fs_info->fs_devices->missing_devices; 3281 3278 u64 target; 3279 + u64 tmp; 3282 3280 3283 3281 /* 3284 3282 * see if restripe for this chunk_type is in progress, if so ··· 3296 3292 } 3297 3293 spin_unlock(&root->fs_info->balance_lock); 3298 3294 3295 + /* First, mask out the RAID levels which aren't possible */ 3299 3296 if (num_devices == 1) 3300 - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3297 + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3298 + BTRFS_BLOCK_GROUP_RAID5); 3299 + if (num_devices < 3) 3300 + flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3301 3301 if (num_devices < 4) 3302 3302 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3303 3303 3304 - if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3305 - (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3306 - BTRFS_BLOCK_GROUP_RAID10))) { 3307 - flags &= ~BTRFS_BLOCK_GROUP_DUP; 3308 - } 3304 + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3305 + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3306 + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3307 + flags &= ~tmp; 3309 3308 3310 - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && 3311 - (flags & BTRFS_BLOCK_GROUP_RAID10)) { 3312 - flags &= ~BTRFS_BLOCK_GROUP_RAID1; 3313 - } 3309 + if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3310 + tmp = BTRFS_BLOCK_GROUP_RAID6; 3311 + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3312 + tmp = BTRFS_BLOCK_GROUP_RAID5; 3313 + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3314 + tmp = BTRFS_BLOCK_GROUP_RAID10; 3315 + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3316 + tmp = 
BTRFS_BLOCK_GROUP_RAID1; 3317 + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3318 + tmp = BTRFS_BLOCK_GROUP_RAID0; 3314 3319 3315 - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3316 - ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3317 - (flags & BTRFS_BLOCK_GROUP_RAID10) | 3318 - (flags & BTRFS_BLOCK_GROUP_DUP))) { 3319 - flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3320 - } 3321 - 3322 - return extended_to_chunk(flags); 3320 + return extended_to_chunk(flags | tmp); 3323 3321 } 3324 3322 3325 3323 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) ··· 3339 3333 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3340 3334 { 3341 3335 u64 flags; 3336 + u64 ret; 3342 3337 3343 3338 if (data) 3344 3339 flags = BTRFS_BLOCK_GROUP_DATA; ··· 3348 3341 else 3349 3342 flags = BTRFS_BLOCK_GROUP_METADATA; 3350 3343 3351 - return get_alloc_profile(root, flags); 3344 + ret = get_alloc_profile(root, flags); 3345 + return ret; 3352 3346 } 3353 3347 3354 3348 /* ··· 3524 3516 { 3525 3517 u64 num_dev; 3526 3518 3527 - if (type & BTRFS_BLOCK_GROUP_RAID10 || 3528 - type & BTRFS_BLOCK_GROUP_RAID0) 3519 + if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3520 + BTRFS_BLOCK_GROUP_RAID0 | 3521 + BTRFS_BLOCK_GROUP_RAID5 | 3522 + BTRFS_BLOCK_GROUP_RAID6)) 3529 3523 num_dev = root->fs_info->fs_devices->rw_devices; 3530 3524 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3531 3525 num_dev = 2; ··· 3677 3667 3678 3668 /* 3679 3669 * If we have dup, raid1 or raid10 then only half of the free 3680 - * space is actually useable. 3670 + * space is actually useable. 
For raid56, the space info used 3671 + * doesn't include the parity drive, so we don't have to 3672 + * change the math 3681 3673 */ 3682 3674 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3683 3675 BTRFS_BLOCK_GROUP_RAID1 | ··· 5467 5455 return ret; 5468 5456 } 5469 5457 5470 - static u64 stripe_align(struct btrfs_root *root, u64 val) 5458 + static u64 stripe_align(struct btrfs_root *root, 5459 + struct btrfs_block_group_cache *cache, 5460 + u64 val, u64 num_bytes) 5471 5461 { 5472 - u64 mask = ((u64)root->stripesize - 1); 5473 - u64 ret = (val + mask) & ~mask; 5462 + u64 mask; 5463 + u64 ret; 5464 + mask = ((u64)root->stripesize - 1); 5465 + ret = (val + mask) & ~mask; 5474 5466 return ret; 5475 5467 } 5476 5468 ··· 5535 5519 index = 2; 5536 5520 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5537 5521 index = 3; 5522 + else if (flags & BTRFS_BLOCK_GROUP_RAID5) 5523 + index = 5; 5524 + else if (flags & BTRFS_BLOCK_GROUP_RAID6) 5525 + index = 6; 5538 5526 else 5539 - index = 4; 5540 - 5527 + index = 4; /* BTRFS_BLOCK_GROUP_SINGLE */ 5541 5528 return index; 5542 5529 } 5543 5530 ··· 5684 5665 if (!block_group_bits(block_group, data)) { 5685 5666 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5686 5667 BTRFS_BLOCK_GROUP_RAID1 | 5668 + BTRFS_BLOCK_GROUP_RAID5 | 5669 + BTRFS_BLOCK_GROUP_RAID6 | 5687 5670 BTRFS_BLOCK_GROUP_RAID10; 5688 5671 5689 5672 /* ··· 5856 5835 goto loop; 5857 5836 } 5858 5837 checks: 5859 - search_start = stripe_align(root, offset); 5838 + search_start = stripe_align(root, used_block_group, 5839 + offset, num_bytes); 5860 5840 5861 5841 /* move on to the next group */ 5862 5842 if (search_start + num_bytes > ··· 7225 7203 root->fs_info->fs_devices->missing_devices; 7226 7204 7227 7205 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7206 + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 7228 7207 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7229 7208 7230 7209 if (num_devices == 1) { ··· 7777 7754 btrfs_release_path(path); 7778 7755 cache->flags = 
btrfs_block_group_flags(&cache->item); 7779 7756 cache->sectorsize = root->sectorsize; 7780 - 7757 + cache->full_stripe_len = btrfs_full_stripe_len(root, 7758 + &root->fs_info->mapping_tree, 7759 + found_key.objectid); 7781 7760 btrfs_init_free_space_ctl(cache); 7782 7761 7783 7762 /* ··· 7833 7808 if (!(get_alloc_profile(root, space_info->flags) & 7834 7809 (BTRFS_BLOCK_GROUP_RAID10 | 7835 7810 BTRFS_BLOCK_GROUP_RAID1 | 7811 + BTRFS_BLOCK_GROUP_RAID5 | 7812 + BTRFS_BLOCK_GROUP_RAID6 | 7836 7813 BTRFS_BLOCK_GROUP_DUP))) 7837 7814 continue; 7838 7815 /* ··· 7910 7883 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7911 7884 cache->sectorsize = root->sectorsize; 7912 7885 cache->fs_info = root->fs_info; 7886 + cache->full_stripe_len = btrfs_full_stripe_len(root, 7887 + &root->fs_info->mapping_tree, 7888 + chunk_offset); 7913 7889 7914 7890 atomic_set(&cache->count, 1); 7915 7891 spin_lock_init(&cache->lock);
+11 -7
fs/btrfs/extent_io.c
··· 1895 1895 if (ret) 1896 1896 err = ret; 1897 1897 1898 - if (did_repair) { 1899 - ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1900 - rec->start + rec->len - 1, 1901 - EXTENT_DAMAGED, GFP_NOFS); 1902 - if (ret && !err) 1903 - err = ret; 1904 - } 1898 + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1899 + rec->start + rec->len - 1, 1900 + EXTENT_DAMAGED, GFP_NOFS); 1901 + if (ret && !err) 1902 + err = ret; 1905 1903 1906 1904 kfree(rec); 1907 1905 return err; ··· 1930 1932 u64 map_length = 0; 1931 1933 u64 sector; 1932 1934 struct btrfs_bio *bbio = NULL; 1935 + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 1933 1936 int ret; 1934 1937 1935 1938 BUG_ON(!mirror_num); 1939 + 1940 + /* we can't repair anything in raid56 yet */ 1941 + if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) 1942 + return 0; 1936 1943 1937 1944 bio = bio_alloc(GFP_NOFS, 1); 1938 1945 if (!bio) ··· 2055 2052 failrec->failed_mirror); 2056 2053 did_repair = !ret; 2057 2054 } 2055 + ret = 0; 2058 2056 } 2059 2057 2060 2058 out:
+42 -8
fs/btrfs/free-space-cache.c
··· 1463 1463 } 1464 1464 1465 1465 static struct btrfs_free_space * 1466 - find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) 1466 + find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 1467 + unsigned long align) 1467 1468 { 1468 1469 struct btrfs_free_space *entry; 1469 1470 struct rb_node *node; 1471 + u64 ctl_off; 1472 + u64 tmp; 1473 + u64 align_off; 1470 1474 int ret; 1471 1475 1472 1476 if (!ctl->free_space_offset.rb_node) ··· 1485 1481 if (entry->bytes < *bytes) 1486 1482 continue; 1487 1483 1484 + /* make sure the space returned is big enough 1485 + * to match our requested alignment 1486 + */ 1487 + if (*bytes >= align) { 1488 + ctl_off = entry->offset - ctl->start; 1489 + tmp = ctl_off + align - 1;; 1490 + do_div(tmp, align); 1491 + tmp = tmp * align + ctl->start; 1492 + align_off = tmp - entry->offset; 1493 + } else { 1494 + align_off = 0; 1495 + tmp = entry->offset; 1496 + } 1497 + 1498 + if (entry->bytes < *bytes + align_off) 1499 + continue; 1500 + 1488 1501 if (entry->bitmap) { 1489 - ret = search_bitmap(ctl, entry, offset, bytes); 1490 - if (!ret) 1502 + ret = search_bitmap(ctl, entry, &tmp, bytes); 1503 + if (!ret) { 1504 + *offset = tmp; 1491 1505 return entry; 1506 + } 1492 1507 continue; 1493 1508 } 1494 1509 1495 - *offset = entry->offset; 1496 - *bytes = entry->bytes; 1510 + *offset = tmp; 1511 + *bytes = entry->bytes - align_off; 1497 1512 return entry; 1498 1513 } 1499 1514 ··· 2114 2091 struct btrfs_free_space *entry = NULL; 2115 2092 u64 bytes_search = bytes + empty_size; 2116 2093 u64 ret = 0; 2094 + u64 align_gap = 0; 2095 + u64 align_gap_len = 0; 2117 2096 2118 2097 spin_lock(&ctl->tree_lock); 2119 - entry = find_free_space(ctl, &offset, &bytes_search); 2098 + entry = find_free_space(ctl, &offset, &bytes_search, 2099 + block_group->full_stripe_len); 2120 2100 if (!entry) 2121 2101 goto out; 2122 2102 ··· 2129 2103 if (!entry->bytes) 2130 2104 free_bitmap(ctl, entry); 2131 2105 } else { 
2106 + 2132 2107 unlink_free_space(ctl, entry); 2133 - entry->offset += bytes; 2134 - entry->bytes -= bytes; 2108 + align_gap_len = offset - entry->offset; 2109 + align_gap = entry->offset; 2110 + 2111 + entry->offset = offset + bytes; 2112 + WARN_ON(entry->bytes < bytes + align_gap_len); 2113 + 2114 + entry->bytes -= bytes + align_gap_len; 2135 2115 if (!entry->bytes) 2136 2116 kmem_cache_free(btrfs_free_space_cachep, entry); 2137 2117 else ··· 2147 2115 out: 2148 2116 spin_unlock(&ctl->tree_lock); 2149 2117 2118 + if (align_gap_len) 2119 + __btrfs_add_free_space(ctl, align_gap, align_gap_len); 2150 2120 return ret; 2151 2121 } 2152 2122
+13 -5
fs/btrfs/inode.c
··· 39 39 #include <linux/slab.h> 40 40 #include <linux/ratelimit.h> 41 41 #include <linux/mount.h> 42 + #include <linux/blkdev.h> 42 43 #include "compat.h" 43 44 #include "ctree.h" 44 45 #include "disk-io.h" ··· 6387 6386 int async_submit = 0; 6388 6387 6389 6388 map_length = orig_bio->bi_size; 6390 - ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, 6389 + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 6391 6390 &map_length, NULL, 0); 6392 6391 if (ret) { 6393 6392 bio_put(orig_bio); 6394 6393 return -EIO; 6395 6394 } 6396 - 6397 6395 if (map_length >= orig_bio->bi_size) { 6398 6396 bio = orig_bio; 6399 6397 goto submit; 6400 6398 } 6401 6399 6402 - async_submit = 1; 6400 + /* async crcs make it difficult to collect full stripe writes. */ 6401 + if (btrfs_get_alloc_profile(root, 1) & 6402 + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) 6403 + async_submit = 0; 6404 + else 6405 + async_submit = 1; 6406 + 6403 6407 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6404 6408 if (!bio) 6405 6409 return -ENOMEM; ··· 6446 6440 bio->bi_end_io = btrfs_end_dio_bio; 6447 6441 6448 6442 map_length = orig_bio->bi_size; 6449 - ret = btrfs_map_block(root->fs_info, READ, 6443 + ret = btrfs_map_block(root->fs_info, rw, 6450 6444 start_sector << 9, 6451 6445 &map_length, NULL, 0); 6452 6446 if (ret) { ··· 6589 6583 { 6590 6584 struct file *file = iocb->ki_filp; 6591 6585 struct inode *inode = file->f_mapping->host; 6586 + ssize_t ret; 6592 6587 6593 6588 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 6594 6589 offset, nr_segs)) 6595 6590 return 0; 6596 6591 6597 - return __blockdev_direct_IO(rw, iocb, inode, 6592 + ret = __blockdev_direct_IO(rw, iocb, inode, 6598 6593 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 6599 6594 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 6600 6595 btrfs_submit_direct, 0); 6596 + return ret; 6601 6597 } 6602 6598 6603 6599 #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
+1647
fs/btrfs/raid56.c
··· 1 + /* 2 + * Copyright (C) 2012 Fusion-io All rights reserved. 3 + * Copyright (C) 2012 Intel Corp. All rights reserved. 4 + * 5 + * This program is free software; you can redistribute it and/or 6 + * modify it under the terms of the GNU General Public 7 + * License v2 as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope that it will be useful, 10 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 + * General Public License for more details. 13 + * 14 + * You should have received a copy of the GNU General Public 15 + * License along with this program; if not, write to the 16 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 17 + * Boston, MA 021110-1307, USA. 18 + */ 19 + #include <linux/sched.h> 20 + #include <linux/wait.h> 21 + #include <linux/bio.h> 22 + #include <linux/slab.h> 23 + #include <linux/buffer_head.h> 24 + #include <linux/blkdev.h> 25 + #include <linux/random.h> 26 + #include <linux/iocontext.h> 27 + #include <linux/capability.h> 28 + #include <linux/ratelimit.h> 29 + #include <linux/kthread.h> 30 + #include <linux/raid/pq.h> 31 + #include <linux/hash.h> 32 + #include <linux/list_sort.h> 33 + #include <linux/raid/xor.h> 34 + #include <asm/div64.h> 35 + #include "compat.h" 36 + #include "ctree.h" 37 + #include "extent_map.h" 38 + #include "disk-io.h" 39 + #include "transaction.h" 40 + #include "print-tree.h" 41 + #include "volumes.h" 42 + #include "raid56.h" 43 + #include "async-thread.h" 44 + #include "check-integrity.h" 45 + #include "rcu-string.h" 46 + 47 + /* set when additional merges to this rbio are not allowed */ 48 + #define RBIO_RMW_LOCKED_BIT 1 49 + 50 + struct btrfs_raid_bio { 51 + struct btrfs_fs_info *fs_info; 52 + struct btrfs_bio *bbio; 53 + 54 + /* 55 + * logical block numbers for the start of each stripe 56 + * The last one or two are p/q. 
These are sorted, 57 + * so raid_map[0] is the start of our full stripe 58 + */ 59 + u64 *raid_map; 60 + 61 + /* while we're doing rmw on a stripe 62 + * we put it into a hash table so we can 63 + * lock the stripe and merge more rbios 64 + * into it. 65 + */ 66 + struct list_head hash_list; 67 + 68 + /* 69 + * for scheduling work in the helper threads 70 + */ 71 + struct btrfs_work work; 72 + 73 + /* 74 + * bio list and bio_list_lock are used 75 + * to add more bios into the stripe 76 + * in hopes of avoiding the full rmw 77 + */ 78 + struct bio_list bio_list; 79 + spinlock_t bio_list_lock; 80 + 81 + /* 82 + * also protected by the bio_list_lock, the 83 + * stripe locking code uses plug_list to hand off 84 + * the stripe lock to the next pending IO 85 + */ 86 + struct list_head plug_list; 87 + 88 + /* 89 + * flags that tell us if it is safe to 90 + * merge with this bio 91 + */ 92 + unsigned long flags; 93 + 94 + /* size of each individual stripe on disk */ 95 + int stripe_len; 96 + 97 + /* number of data stripes (no p/q) */ 98 + int nr_data; 99 + 100 + /* 101 + * set if we're doing a parity rebuild 102 + * for a read from higher up, which is handled 103 + * differently from a parity rebuild as part of 104 + * rmw 105 + */ 106 + int read_rebuild; 107 + 108 + /* first bad stripe */ 109 + int faila; 110 + 111 + /* second bad stripe (for raid6 use) */ 112 + int failb; 113 + 114 + /* 115 + * number of pages needed to represent the full 116 + * stripe 117 + */ 118 + int nr_pages; 119 + 120 + /* 121 + * size of all the bios in the bio_list. This 122 + * helps us decide if the rbio maps to a full 123 + * stripe or not 124 + */ 125 + int bio_list_bytes; 126 + 127 + atomic_t refs; 128 + 129 + /* 130 + * these are two arrays of pointers. 
We allocate the 131 + * rbio big enough to hold them both and setup their 132 + * locations when the rbio is allocated 133 + */ 134 + 135 + /* pointers to pages that we allocated for 136 + * reading/writing stripes directly from the disk (including P/Q) 137 + */ 138 + struct page **stripe_pages; 139 + 140 + /* 141 + * pointers to the pages in the bio_list. Stored 142 + * here for faster lookup 143 + */ 144 + struct page **bio_pages; 145 + }; 146 + 147 + static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 148 + static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 149 + static void rmw_work(struct btrfs_work *work); 150 + static void read_rebuild_work(struct btrfs_work *work); 151 + static void async_rmw_stripe(struct btrfs_raid_bio *rbio); 152 + static void async_read_rebuild(struct btrfs_raid_bio *rbio); 153 + static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 154 + static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 155 + static void __free_raid_bio(struct btrfs_raid_bio *rbio); 156 + static void index_rbio_pages(struct btrfs_raid_bio *rbio); 157 + static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 158 + 159 + /* 160 + * the stripe hash table is used for locking, and to collect 161 + * bios in hopes of making a full stripe 162 + */ 163 + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 164 + { 165 + struct btrfs_stripe_hash_table *table; 166 + struct btrfs_stripe_hash_table *x; 167 + struct btrfs_stripe_hash *cur; 168 + struct btrfs_stripe_hash *h; 169 + int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 170 + int i; 171 + 172 + if (info->stripe_hash_table) 173 + return 0; 174 + 175 + table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS); 176 + if (!table) 177 + return -ENOMEM; 178 + 179 + table->table = (void *)(table + 1); 180 + h = table->table; 181 + 182 + for (i = 0; i < num_entries; i++) { 183 + cur = h + i; 184 + INIT_LIST_HEAD(&cur->hash_list); 185 + 
spin_lock_init(&cur->lock); 186 + init_waitqueue_head(&cur->wait); 187 + } 188 + 189 + x = cmpxchg(&info->stripe_hash_table, NULL, table); 190 + if (x) 191 + kfree(x); 192 + return 0; 193 + } 194 + 195 + /* 196 + * we hash on the first logical address of the stripe 197 + */ 198 + static int rbio_bucket(struct btrfs_raid_bio *rbio) 199 + { 200 + u64 num = rbio->raid_map[0]; 201 + 202 + /* 203 + * we shift down quite a bit. We're using byte 204 + * addressing, and most of the lower bits are zeros. 205 + * This tends to upset hash_64, and it consistently 206 + * returns just one or two different values. 207 + * 208 + * shifting off the lower bits fixes things. 209 + */ 210 + return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 211 + } 212 + 213 + /* 214 + * merging means we take the bio_list from the victim and 215 + * splice it into the destination. The victim should 216 + * be discarded afterwards. 217 + * 218 + * must be called with dest->rbio_list_lock held 219 + */ 220 + static void merge_rbio(struct btrfs_raid_bio *dest, 221 + struct btrfs_raid_bio *victim) 222 + { 223 + bio_list_merge(&dest->bio_list, &victim->bio_list); 224 + dest->bio_list_bytes += victim->bio_list_bytes; 225 + bio_list_init(&victim->bio_list); 226 + } 227 + 228 + /* 229 + * free the hash table used by unmount 230 + */ 231 + void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 232 + { 233 + if (!info->stripe_hash_table) 234 + return; 235 + kfree(info->stripe_hash_table); 236 + info->stripe_hash_table = NULL; 237 + } 238 + 239 + /* 240 + * helper function to run the xor_blocks api. It is only 241 + * able to do MAX_XOR_BLOCKS at a time, so we need to 242 + * loop through. 
243 + */ 244 + static void run_xor(void **pages, int src_cnt, ssize_t len) 245 + { 246 + int src_off = 0; 247 + int xor_src_cnt = 0; 248 + void *dest = pages[src_cnt]; 249 + 250 + while(src_cnt > 0) { 251 + xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 252 + xor_blocks(xor_src_cnt, len, dest, pages + src_off); 253 + 254 + src_cnt -= xor_src_cnt; 255 + src_off += xor_src_cnt; 256 + } 257 + } 258 + 259 + /* 260 + * returns true if the bio list inside this rbio 261 + * covers an entire stripe (no rmw required). 262 + * Must be called with the bio list lock held, or 263 + * at a time when you know it is impossible to add 264 + * new bios into the list 265 + */ 266 + static int __rbio_is_full(struct btrfs_raid_bio *rbio) 267 + { 268 + unsigned long size = rbio->bio_list_bytes; 269 + int ret = 1; 270 + 271 + if (size != rbio->nr_data * rbio->stripe_len) 272 + ret = 0; 273 + 274 + BUG_ON(size > rbio->nr_data * rbio->stripe_len); 275 + return ret; 276 + } 277 + 278 + static int rbio_is_full(struct btrfs_raid_bio *rbio) 279 + { 280 + unsigned long flags; 281 + int ret; 282 + 283 + spin_lock_irqsave(&rbio->bio_list_lock, flags); 284 + ret = __rbio_is_full(rbio); 285 + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 286 + return ret; 287 + } 288 + 289 + /* 290 + * returns 1 if it is safe to merge two rbios together. 
291 + * The merging is safe if the two rbios correspond to 292 + * the same stripe and if they are both going in the same 293 + * direction (read vs write), and if neither one is 294 + * locked for final IO 295 + * 296 + * The caller is responsible for locking such that 297 + * rmw_locked is safe to test 298 + */ 299 + static int rbio_can_merge(struct btrfs_raid_bio *last, 300 + struct btrfs_raid_bio *cur) 301 + { 302 + if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 303 + test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 304 + return 0; 305 + 306 + if (last->raid_map[0] != 307 + cur->raid_map[0]) 308 + return 0; 309 + 310 + /* reads can't merge with writes */ 311 + if (last->read_rebuild != 312 + cur->read_rebuild) { 313 + return 0; 314 + } 315 + 316 + return 1; 317 + } 318 + 319 + /* 320 + * helper to index into the pstripe 321 + */ 322 + static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 323 + { 324 + index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; 325 + return rbio->stripe_pages[index]; 326 + } 327 + 328 + /* 329 + * helper to index into the qstripe, returns null 330 + * if there is no qstripe 331 + */ 332 + static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 333 + { 334 + if (rbio->nr_data + 1 == rbio->bbio->num_stripes) 335 + return NULL; 336 + 337 + index += ((rbio->nr_data + 1) * rbio->stripe_len) >> 338 + PAGE_CACHE_SHIFT; 339 + return rbio->stripe_pages[index]; 340 + } 341 + 342 + /* 343 + * The first stripe in the table for a logical address 344 + * has the lock. rbios are added in one of three ways: 345 + * 346 + * 1) Nobody has the stripe locked yet. The rbio is given 347 + * the lock and 0 is returned. The caller must start the IO 348 + * themselves. 349 + * 350 + * 2) Someone has the stripe locked, but we're able to merge 351 + * with the lock owner. The rbio is freed and the IO will 352 + * start automatically along with the existing rbio. 1 is returned. 
353 + * 354 + * 3) Someone has the stripe locked, but we're not able to merge. 355 + * The rbio is added to the lock owner's plug list, or merged into 356 + * an rbio already on the plug list. When the lock owner unlocks, 357 + * the next rbio on the list is run and the IO is started automatically. 358 + * 1 is returned 359 + * 360 + * If we return 0, the caller still owns the rbio and must continue with 361 + * IO submission. If we return 1, the caller must assume the rbio has 362 + * already been freed. 363 + */ 364 + static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 365 + { 366 + int bucket = rbio_bucket(rbio); 367 + struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; 368 + struct btrfs_raid_bio *cur; 369 + struct btrfs_raid_bio *pending; 370 + unsigned long flags; 371 + DEFINE_WAIT(wait); 372 + struct btrfs_raid_bio *freeit = NULL; 373 + int ret = 0; 374 + int walk = 0; 375 + 376 + spin_lock_irqsave(&h->lock, flags); 377 + list_for_each_entry(cur, &h->hash_list, hash_list) { 378 + walk++; 379 + if (cur->raid_map[0] == rbio->raid_map[0]) { 380 + spin_lock(&cur->bio_list_lock); 381 + 382 + /* can we merge into the lock owner? */ 383 + if (rbio_can_merge(cur, rbio)) { 384 + merge_rbio(cur, rbio); 385 + spin_unlock(&cur->bio_list_lock); 386 + freeit = rbio; 387 + ret = 1; 388 + goto out; 389 + } 390 + 391 + /* 392 + * we couldn't merge with the running 393 + * rbio, see if we can merge with the 394 + * pending ones. 
We don't have to 395 + * check for rmw_locked because there 396 + * is no way they are inside finish_rmw 397 + * right now 398 + */ 399 + list_for_each_entry(pending, &cur->plug_list, 400 + plug_list) { 401 + if (rbio_can_merge(pending, rbio)) { 402 + merge_rbio(pending, rbio); 403 + spin_unlock(&cur->bio_list_lock); 404 + freeit = rbio; 405 + ret = 1; 406 + goto out; 407 + } 408 + } 409 + 410 + /* no merging, put us on the tail of the plug list, 411 + * our rbio will be started with the currently 412 + * running rbio unlocks 413 + */ 414 + list_add_tail(&rbio->plug_list, &cur->plug_list); 415 + spin_unlock(&cur->bio_list_lock); 416 + ret = 1; 417 + goto out; 418 + } 419 + } 420 + 421 + atomic_inc(&rbio->refs); 422 + list_add(&rbio->hash_list, &h->hash_list); 423 + out: 424 + spin_unlock_irqrestore(&h->lock, flags); 425 + if (freeit) 426 + __free_raid_bio(freeit); 427 + return ret; 428 + } 429 + 430 + /* 431 + * called as rmw or parity rebuild is completed. If the plug list has more 432 + * rbios waiting for this stripe, the next one on the list will be started 433 + */ 434 + static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 435 + { 436 + int bucket; 437 + struct btrfs_stripe_hash *h; 438 + unsigned long flags; 439 + 440 + bucket = rbio_bucket(rbio); 441 + h = rbio->fs_info->stripe_hash_table->table + bucket; 442 + 443 + spin_lock_irqsave(&h->lock, flags); 444 + spin_lock(&rbio->bio_list_lock); 445 + 446 + if (!list_empty(&rbio->hash_list)) { 447 + 448 + list_del_init(&rbio->hash_list); 449 + atomic_dec(&rbio->refs); 450 + 451 + /* 452 + * we use the plug list to hold all the rbios 453 + * waiting for the chance to lock this stripe. 454 + * hand the lock over to one of them. 
455 + */ 456 + if (!list_empty(&rbio->plug_list)) { 457 + struct btrfs_raid_bio *next; 458 + struct list_head *head = rbio->plug_list.next; 459 + 460 + next = list_entry(head, struct btrfs_raid_bio, 461 + plug_list); 462 + 463 + list_del_init(&rbio->plug_list); 464 + 465 + list_add(&next->hash_list, &h->hash_list); 466 + atomic_inc(&next->refs); 467 + spin_unlock(&rbio->bio_list_lock); 468 + spin_unlock_irqrestore(&h->lock, flags); 469 + 470 + if (next->read_rebuild) 471 + async_read_rebuild(next); 472 + else 473 + async_rmw_stripe(next); 474 + 475 + goto done_nolock; 476 + 477 + } else if (waitqueue_active(&h->wait)) { 478 + spin_unlock(&rbio->bio_list_lock); 479 + spin_unlock_irqrestore(&h->lock, flags); 480 + wake_up(&h->wait); 481 + goto done_nolock; 482 + } 483 + } 484 + spin_unlock(&rbio->bio_list_lock); 485 + spin_unlock_irqrestore(&h->lock, flags); 486 + 487 + done_nolock: 488 + return; 489 + } 490 + 491 + static void __free_raid_bio(struct btrfs_raid_bio *rbio) 492 + { 493 + int i; 494 + 495 + WARN_ON(atomic_read(&rbio->refs) < 0); 496 + if (!atomic_dec_and_test(&rbio->refs)) 497 + return; 498 + 499 + WARN_ON(!list_empty(&rbio->hash_list)); 500 + WARN_ON(!bio_list_empty(&rbio->bio_list)); 501 + 502 + for (i = 0; i < rbio->nr_pages; i++) { 503 + if (rbio->stripe_pages[i]) { 504 + __free_page(rbio->stripe_pages[i]); 505 + rbio->stripe_pages[i] = NULL; 506 + } 507 + } 508 + kfree(rbio->raid_map); 509 + kfree(rbio->bbio); 510 + kfree(rbio); 511 + } 512 + 513 + static void free_raid_bio(struct btrfs_raid_bio *rbio) 514 + { 515 + unlock_stripe(rbio); 516 + __free_raid_bio(rbio); 517 + } 518 + 519 + /* 520 + * this frees the rbio and runs through all the bios in the 521 + * bio_list and calls end_io on them 522 + */ 523 + static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) 524 + { 525 + struct bio *cur = bio_list_get(&rbio->bio_list); 526 + struct bio *next; 527 + free_raid_bio(rbio); 528 + 529 + while (cur) { 530 + next = 
cur->bi_next; 531 + cur->bi_next = NULL; 532 + if (uptodate) 533 + set_bit(BIO_UPTODATE, &cur->bi_flags); 534 + bio_endio(cur, err); 535 + cur = next; 536 + } 537 + } 538 + 539 + /* 540 + * end io function used by finish_rmw. When we finally 541 + * get here, we've written a full stripe 542 + */ 543 + static void raid_write_end_io(struct bio *bio, int err) 544 + { 545 + struct btrfs_raid_bio *rbio = bio->bi_private; 546 + 547 + if (err) 548 + fail_bio_stripe(rbio, bio); 549 + 550 + bio_put(bio); 551 + 552 + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 553 + return; 554 + 555 + err = 0; 556 + 557 + /* OK, we have read all the stripes we need to. */ 558 + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 559 + err = -EIO; 560 + 561 + rbio_orig_end_io(rbio, err, 0); 562 + return; 563 + } 564 + 565 + /* 566 + * the read/modify/write code wants to use the original bio for 567 + * any pages it included, and then use the rbio for everything 568 + * else. This function decides if a given index (stripe number) 569 + * and page number in that stripe fall inside the original bio 570 + * or the rbio. 571 + * 572 + * if you set bio_list_only, you'll get a NULL back for any ranges 573 + * that are outside the bio_list 574 + * 575 + * This doesn't take any refs on anything, you get a bare page pointer 576 + * and the caller must bump refs as required. 577 + * 578 + * You must call index_rbio_pages once before you can trust 579 + * the answers from this function. 
580 + */ 581 + static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, 582 + int index, int pagenr, int bio_list_only) 583 + { 584 + int chunk_page; 585 + struct page *p = NULL; 586 + 587 + chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; 588 + 589 + spin_lock_irq(&rbio->bio_list_lock); 590 + p = rbio->bio_pages[chunk_page]; 591 + spin_unlock_irq(&rbio->bio_list_lock); 592 + 593 + if (p || bio_list_only) 594 + return p; 595 + 596 + return rbio->stripe_pages[chunk_page]; 597 + } 598 + 599 + /* 600 + * number of pages we need for the entire stripe across all the 601 + * drives 602 + */ 603 + static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 604 + { 605 + unsigned long nr = stripe_len * nr_stripes; 606 + return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 607 + } 608 + 609 + /* 610 + * allocation and initial setup for the btrfs_raid_bio. Not 611 + * this does not allocate any pages for rbio->pages. 612 + */ 613 + static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, 614 + struct btrfs_bio *bbio, u64 *raid_map, 615 + u64 stripe_len) 616 + { 617 + struct btrfs_raid_bio *rbio; 618 + int nr_data = 0; 619 + int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); 620 + void *p; 621 + 622 + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, 623 + GFP_NOFS); 624 + if (!rbio) { 625 + kfree(raid_map); 626 + kfree(bbio); 627 + return ERR_PTR(-ENOMEM); 628 + } 629 + 630 + bio_list_init(&rbio->bio_list); 631 + INIT_LIST_HEAD(&rbio->plug_list); 632 + spin_lock_init(&rbio->bio_list_lock); 633 + INIT_LIST_HEAD(&rbio->hash_list); 634 + rbio->bbio = bbio; 635 + rbio->raid_map = raid_map; 636 + rbio->fs_info = root->fs_info; 637 + rbio->stripe_len = stripe_len; 638 + rbio->nr_pages = num_pages; 639 + rbio->faila = -1; 640 + rbio->failb = -1; 641 + atomic_set(&rbio->refs, 1); 642 + 643 + /* 644 + * the stripe_pages and bio_pages array point to the extra 645 + * memory we allocated past the end of the 
rbio 646 + */ 647 + p = rbio + 1; 648 + rbio->stripe_pages = p; 649 + rbio->bio_pages = p + sizeof(struct page *) * num_pages; 650 + 651 + if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 652 + nr_data = bbio->num_stripes - 2; 653 + else 654 + nr_data = bbio->num_stripes - 1; 655 + 656 + rbio->nr_data = nr_data; 657 + return rbio; 658 + } 659 + 660 + /* allocate pages for all the stripes in the bio, including parity */ 661 + static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 662 + { 663 + int i; 664 + struct page *page; 665 + 666 + for (i = 0; i < rbio->nr_pages; i++) { 667 + if (rbio->stripe_pages[i]) 668 + continue; 669 + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 670 + if (!page) 671 + return -ENOMEM; 672 + rbio->stripe_pages[i] = page; 673 + ClearPageUptodate(page); 674 + } 675 + return 0; 676 + } 677 + 678 + /* allocate pages for just the p/q stripes */ 679 + static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 680 + { 681 + int i; 682 + struct page *page; 683 + 684 + i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; 685 + 686 + for (; i < rbio->nr_pages; i++) { 687 + if (rbio->stripe_pages[i]) 688 + continue; 689 + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 690 + if (!page) 691 + return -ENOMEM; 692 + rbio->stripe_pages[i] = page; 693 + } 694 + return 0; 695 + } 696 + 697 + /* 698 + * add a single page from a specific stripe into our list of bios for IO 699 + * this will try to merge into existing bios if possible, and returns 700 + * zero if all went well. 
701 + */ 702 + int rbio_add_io_page(struct btrfs_raid_bio *rbio, 703 + struct bio_list *bio_list, 704 + struct page *page, 705 + int stripe_nr, 706 + unsigned long page_index, 707 + unsigned long bio_max_len) 708 + { 709 + struct bio *last = bio_list->tail; 710 + u64 last_end = 0; 711 + int ret; 712 + struct bio *bio; 713 + struct btrfs_bio_stripe *stripe; 714 + u64 disk_start; 715 + 716 + stripe = &rbio->bbio->stripes[stripe_nr]; 717 + disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); 718 + 719 + /* if the device is missing, just fail this stripe */ 720 + if (!stripe->dev->bdev) 721 + return fail_rbio_index(rbio, stripe_nr); 722 + 723 + /* see if we can add this page onto our existing bio */ 724 + if (last) { 725 + last_end = (u64)last->bi_sector << 9; 726 + last_end += last->bi_size; 727 + 728 + /* 729 + * we can't merge these if they are from different 730 + * devices or if they are not contiguous 731 + */ 732 + if (last_end == disk_start && stripe->dev->bdev && 733 + test_bit(BIO_UPTODATE, &last->bi_flags) && 734 + last->bi_bdev == stripe->dev->bdev) { 735 + ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); 736 + if (ret == PAGE_CACHE_SIZE) 737 + return 0; 738 + } 739 + } 740 + 741 + /* put a new bio on the list */ 742 + bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); 743 + if (!bio) 744 + return -ENOMEM; 745 + 746 + bio->bi_size = 0; 747 + bio->bi_bdev = stripe->dev->bdev; 748 + bio->bi_sector = disk_start >> 9; 749 + set_bit(BIO_UPTODATE, &bio->bi_flags); 750 + 751 + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 752 + bio_list_add(bio_list, bio); 753 + return 0; 754 + } 755 + 756 + /* 757 + * while we're doing the read/modify/write cycle, we could 758 + * have errors in reading pages off the disk. This checks 759 + * for errors and if we're not able to read the page it'll 760 + * trigger parity reconstruction. 
The rmw will be finished 761 + * after we've reconstructed the failed stripes 762 + */ 763 + static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 764 + { 765 + if (rbio->faila >= 0 || rbio->failb >= 0) { 766 + BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); 767 + __raid56_parity_recover(rbio); 768 + } else { 769 + finish_rmw(rbio); 770 + } 771 + } 772 + 773 + /* 774 + * these are just the pages from the rbio array, not from anything 775 + * the FS sent down to us 776 + */ 777 + static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) 778 + { 779 + int index; 780 + index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); 781 + index += page; 782 + return rbio->stripe_pages[index]; 783 + } 784 + 785 + /* 786 + * helper function to walk our bio list and populate the bio_pages array with 787 + * the result. This seems expensive, but it is faster than constantly 788 + * searching through the bio list as we setup the IO in finish_rmw or stripe 789 + * reconstruction. 790 + * 791 + * This must be called before you trust the answers from page_in_rbio 792 + */ 793 + static void index_rbio_pages(struct btrfs_raid_bio *rbio) 794 + { 795 + struct bio *bio; 796 + u64 start; 797 + unsigned long stripe_offset; 798 + unsigned long page_index; 799 + struct page *p; 800 + int i; 801 + 802 + spin_lock_irq(&rbio->bio_list_lock); 803 + bio_list_for_each(bio, &rbio->bio_list) { 804 + start = (u64)bio->bi_sector << 9; 805 + stripe_offset = start - rbio->raid_map[0]; 806 + page_index = stripe_offset >> PAGE_CACHE_SHIFT; 807 + 808 + for (i = 0; i < bio->bi_vcnt; i++) { 809 + p = bio->bi_io_vec[i].bv_page; 810 + rbio->bio_pages[page_index + i] = p; 811 + } 812 + } 813 + spin_unlock_irq(&rbio->bio_list_lock); 814 + } 815 + 816 + /* 817 + * this is called from one of two situations. We either 818 + * have a full stripe from the higher layers, or we've read all 819 + * the missing bits off disk. 
820 + * 821 + * This will calculate the parity and then send down any 822 + * changed blocks. 823 + */ 824 + static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 825 + { 826 + struct btrfs_bio *bbio = rbio->bbio; 827 + void *pointers[bbio->num_stripes]; 828 + int stripe_len = rbio->stripe_len; 829 + int nr_data = rbio->nr_data; 830 + int stripe; 831 + int pagenr; 832 + int p_stripe = -1; 833 + int q_stripe = -1; 834 + struct bio_list bio_list; 835 + struct bio *bio; 836 + int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; 837 + int ret; 838 + 839 + bio_list_init(&bio_list); 840 + 841 + if (bbio->num_stripes - rbio->nr_data == 1) { 842 + p_stripe = bbio->num_stripes - 1; 843 + } else if (bbio->num_stripes - rbio->nr_data == 2) { 844 + p_stripe = bbio->num_stripes - 2; 845 + q_stripe = bbio->num_stripes - 1; 846 + } else { 847 + BUG(); 848 + } 849 + 850 + /* at this point we either have a full stripe, 851 + * or we've read the full stripe from the drive. 852 + * recalculate the parity and write the new results. 853 + * 854 + * We're not allowed to add any new bios to the 855 + * bio list here, anyone else that wants to 856 + * change this stripe needs to do their own rmw. 
857 + */ 858 + spin_lock_irq(&rbio->bio_list_lock); 859 + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 860 + spin_unlock_irq(&rbio->bio_list_lock); 861 + 862 + atomic_set(&rbio->bbio->error, 0); 863 + 864 + /* 865 + * now that we've set rmw_locked, run through the 866 + * bio list one last time and map the page pointers 867 + */ 868 + index_rbio_pages(rbio); 869 + 870 + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 871 + struct page *p; 872 + /* first collect one page from each data stripe */ 873 + for (stripe = 0; stripe < nr_data; stripe++) { 874 + p = page_in_rbio(rbio, stripe, pagenr, 0); 875 + pointers[stripe] = kmap(p); 876 + } 877 + 878 + /* then add the parity stripe */ 879 + p = rbio_pstripe_page(rbio, pagenr); 880 + SetPageUptodate(p); 881 + pointers[stripe++] = kmap(p); 882 + 883 + if (q_stripe != -1) { 884 + 885 + /* 886 + * raid6, add the qstripe and call the 887 + * library function to fill in our p/q 888 + */ 889 + p = rbio_qstripe_page(rbio, pagenr); 890 + SetPageUptodate(p); 891 + pointers[stripe++] = kmap(p); 892 + 893 + raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, 894 + pointers); 895 + } else { 896 + /* raid5 */ 897 + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 898 + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); 899 + } 900 + 901 + 902 + for (stripe = 0; stripe < bbio->num_stripes; stripe++) 903 + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 904 + } 905 + 906 + /* 907 + * time to start writing. Make bios for everything from the 908 + * higher layers (the bio_list in our rbio) and our p/q. Ignore 909 + * everything else. 
910 + */ 911 + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 912 + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 913 + struct page *page; 914 + if (stripe < rbio->nr_data) { 915 + page = page_in_rbio(rbio, stripe, pagenr, 1); 916 + if (!page) 917 + continue; 918 + } else { 919 + page = rbio_stripe_page(rbio, stripe, pagenr); 920 + } 921 + 922 + ret = rbio_add_io_page(rbio, &bio_list, 923 + page, stripe, pagenr, rbio->stripe_len); 924 + if (ret) 925 + goto cleanup; 926 + } 927 + } 928 + 929 + atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); 930 + BUG_ON(atomic_read(&bbio->stripes_pending) == 0); 931 + 932 + while (1) { 933 + bio = bio_list_pop(&bio_list); 934 + if (!bio) 935 + break; 936 + 937 + bio->bi_private = rbio; 938 + bio->bi_end_io = raid_write_end_io; 939 + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 940 + submit_bio(WRITE, bio); 941 + } 942 + return; 943 + 944 + cleanup: 945 + rbio_orig_end_io(rbio, -EIO, 0); 946 + } 947 + 948 + /* 949 + * helper to find the stripe number for a given bio. Used to figure out which 950 + * stripe has failed. This expects the bio to correspond to a physical disk, 951 + * so it looks up based on physical sector numbers. 952 + */ 953 + static int find_bio_stripe(struct btrfs_raid_bio *rbio, 954 + struct bio *bio) 955 + { 956 + u64 physical = bio->bi_sector; 957 + u64 stripe_start; 958 + int i; 959 + struct btrfs_bio_stripe *stripe; 960 + 961 + physical <<= 9; 962 + 963 + for (i = 0; i < rbio->bbio->num_stripes; i++) { 964 + stripe = &rbio->bbio->stripes[i]; 965 + stripe_start = stripe->physical; 966 + if (physical >= stripe_start && 967 + physical < stripe_start + rbio->stripe_len) { 968 + return i; 969 + } 970 + } 971 + return -1; 972 + } 973 + 974 + /* 975 + * helper to find the stripe number for a given 976 + * bio (before mapping). Used to figure out which stripe has 977 + * failed. This looks up based on logical block numbers. 
978 + */ 979 + static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 980 + struct bio *bio) 981 + { 982 + u64 logical = bio->bi_sector; 983 + u64 stripe_start; 984 + int i; 985 + 986 + logical <<= 9; 987 + 988 + for (i = 0; i < rbio->nr_data; i++) { 989 + stripe_start = rbio->raid_map[i]; 990 + if (logical >= stripe_start && 991 + logical < stripe_start + rbio->stripe_len) { 992 + return i; 993 + } 994 + } 995 + return -1; 996 + } 997 + 998 + /* 999 + * returns -EIO if we had too many failures 1000 + */ 1001 + static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1002 + { 1003 + unsigned long flags; 1004 + int ret = 0; 1005 + 1006 + spin_lock_irqsave(&rbio->bio_list_lock, flags); 1007 + 1008 + /* we already know this stripe is bad, move on */ 1009 + if (rbio->faila == failed || rbio->failb == failed) 1010 + goto out; 1011 + 1012 + if (rbio->faila == -1) { 1013 + /* first failure on this rbio */ 1014 + rbio->faila = failed; 1015 + atomic_inc(&rbio->bbio->error); 1016 + } else if (rbio->failb == -1) { 1017 + /* second failure on this rbio */ 1018 + rbio->failb = failed; 1019 + atomic_inc(&rbio->bbio->error); 1020 + } else { 1021 + ret = -EIO; 1022 + } 1023 + out: 1024 + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1025 + 1026 + return ret; 1027 + } 1028 + 1029 + /* 1030 + * helper to fail a stripe based on a physical disk 1031 + * bio. 1032 + */ 1033 + static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1034 + struct bio *bio) 1035 + { 1036 + int failed = find_bio_stripe(rbio, bio); 1037 + 1038 + if (failed < 0) 1039 + return -EIO; 1040 + 1041 + return fail_rbio_index(rbio, failed); 1042 + } 1043 + 1044 + /* 1045 + * this sets each page in the bio uptodate. 
It should only be used on private 1046 + * rbio pages, nothing that comes in from the higher layers 1047 + */ 1048 + static void set_bio_pages_uptodate(struct bio *bio) 1049 + { 1050 + int i; 1051 + struct page *p; 1052 + 1053 + for (i = 0; i < bio->bi_vcnt; i++) { 1054 + p = bio->bi_io_vec[i].bv_page; 1055 + SetPageUptodate(p); 1056 + } 1057 + } 1058 + 1059 + /* 1060 + * end io for the read phase of the rmw cycle. All the bios here are physical 1061 + * stripe bios we've read from the disk so we can recalculate the parity of the 1062 + * stripe. 1063 + * 1064 + * This will usually kick off finish_rmw once all the bios are read in, but it 1065 + * may trigger parity reconstruction if we had any errors along the way 1066 + */ 1067 + static void raid_rmw_end_io(struct bio *bio, int err) 1068 + { 1069 + struct btrfs_raid_bio *rbio = bio->bi_private; 1070 + 1071 + if (err) 1072 + fail_bio_stripe(rbio, bio); 1073 + else 1074 + set_bio_pages_uptodate(bio); 1075 + 1076 + bio_put(bio); 1077 + 1078 + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1079 + return; 1080 + 1081 + err = 0; 1082 + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1083 + goto cleanup; 1084 + 1085 + /* 1086 + * this will normally call finish_rmw to start our write 1087 + * but if there are any failed stripes we'll reconstruct 1088 + * from parity first 1089 + */ 1090 + validate_rbio_for_rmw(rbio); 1091 + return; 1092 + 1093 + cleanup: 1094 + 1095 + rbio_orig_end_io(rbio, -EIO, 0); 1096 + } 1097 + 1098 + static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1099 + { 1100 + rbio->work.flags = 0; 1101 + rbio->work.func = rmw_work; 1102 + 1103 + btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1104 + &rbio->work); 1105 + } 1106 + 1107 + static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1108 + { 1109 + rbio->work.flags = 0; 1110 + rbio->work.func = read_rebuild_work; 1111 + 1112 + btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1113 + &rbio->work); 1114 + } 1115 + 1116 + 
/* 1117 + * the stripe must be locked by the caller. It will 1118 + * unlock after all the writes are done 1119 + */ 1120 + static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1121 + { 1122 + int bios_to_read = 0; 1123 + struct btrfs_bio *bbio = rbio->bbio; 1124 + struct bio_list bio_list; 1125 + int ret; 1126 + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1127 + int pagenr; 1128 + int stripe; 1129 + struct bio *bio; 1130 + 1131 + bio_list_init(&bio_list); 1132 + 1133 + ret = alloc_rbio_pages(rbio); 1134 + if (ret) 1135 + goto cleanup; 1136 + 1137 + index_rbio_pages(rbio); 1138 + 1139 + atomic_set(&rbio->bbio->error, 0); 1140 + /* 1141 + * build a list of bios to read all the missing parts of this 1142 + * stripe 1143 + */ 1144 + for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1145 + for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1146 + struct page *page; 1147 + /* 1148 + * we want to find all the pages missing from 1149 + * the rbio and read them from the disk. If 1150 + * page_in_rbio finds a page in the bio list 1151 + * we don't need to read it off the stripe. 1152 + */ 1153 + page = page_in_rbio(rbio, stripe, pagenr, 1); 1154 + if (page) 1155 + continue; 1156 + 1157 + page = rbio_stripe_page(rbio, stripe, pagenr); 1158 + ret = rbio_add_io_page(rbio, &bio_list, page, 1159 + stripe, pagenr, rbio->stripe_len); 1160 + if (ret) 1161 + goto cleanup; 1162 + } 1163 + } 1164 + 1165 + bios_to_read = bio_list_size(&bio_list); 1166 + if (!bios_to_read) { 1167 + /* 1168 + * this can happen if others have merged with 1169 + * us, it means there is nothing left to read. 1170 + * But if there are missing devices it may not be 1171 + * safe to do the full stripe write yet. 1172 + */ 1173 + goto finish; 1174 + } 1175 + 1176 + /* 1177 + * the bbio may be freed once we submit the last bio. 
Make sure 1178 + * not to touch it after that 1179 + */ 1180 + atomic_set(&bbio->stripes_pending, bios_to_read); 1181 + while (1) { 1182 + bio = bio_list_pop(&bio_list); 1183 + if (!bio) 1184 + break; 1185 + 1186 + bio->bi_private = rbio; 1187 + bio->bi_end_io = raid_rmw_end_io; 1188 + 1189 + btrfs_bio_wq_end_io(rbio->fs_info, bio, 1190 + BTRFS_WQ_ENDIO_RAID56); 1191 + 1192 + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 1193 + submit_bio(READ, bio); 1194 + } 1195 + /* the actual write will happen once the reads are done */ 1196 + return 0; 1197 + 1198 + cleanup: 1199 + rbio_orig_end_io(rbio, -EIO, 0); 1200 + return -EIO; 1201 + 1202 + finish: 1203 + validate_rbio_for_rmw(rbio); 1204 + return 0; 1205 + } 1206 + 1207 + /* 1208 + * if the upper layers pass in a full stripe, we thank them by only allocating 1209 + * enough pages to hold the parity, and sending it all down quickly. 1210 + */ 1211 + static int full_stripe_write(struct btrfs_raid_bio *rbio) 1212 + { 1213 + int ret; 1214 + 1215 + ret = alloc_rbio_parity_pages(rbio); 1216 + if (ret) 1217 + return ret; 1218 + 1219 + ret = lock_stripe_add(rbio); 1220 + if (ret == 0) 1221 + finish_rmw(rbio); 1222 + return 0; 1223 + } 1224 + 1225 + /* 1226 + * partial stripe writes get handed over to async helpers. 1227 + * We're really hoping to merge a few more writes into this 1228 + * rbio before calculating new parity 1229 + */ 1230 + static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1231 + { 1232 + int ret; 1233 + 1234 + ret = lock_stripe_add(rbio); 1235 + if (ret == 0) 1236 + async_rmw_stripe(rbio); 1237 + return 0; 1238 + } 1239 + 1240 + /* 1241 + * sometimes while we were reading from the drive to 1242 + * recalculate parity, enough new bios come into create 1243 + * a full stripe. 
So we do a check here to see if we can 1244 + * go directly to finish_rmw 1245 + */ 1246 + static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1247 + { 1248 + /* head off into rmw land if we don't have a full stripe */ 1249 + if (!rbio_is_full(rbio)) 1250 + return partial_stripe_write(rbio); 1251 + return full_stripe_write(rbio); 1252 + } 1253 + 1254 + /* 1255 + * our main entry point for writes from the rest of the FS. 1256 + */ 1257 + int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 1258 + struct btrfs_bio *bbio, u64 *raid_map, 1259 + u64 stripe_len) 1260 + { 1261 + struct btrfs_raid_bio *rbio; 1262 + 1263 + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1264 + if (IS_ERR(rbio)) { 1265 + kfree(raid_map); 1266 + kfree(bbio); 1267 + return PTR_ERR(rbio); 1268 + } 1269 + bio_list_add(&rbio->bio_list, bio); 1270 + rbio->bio_list_bytes = bio->bi_size; 1271 + return __raid56_parity_write(rbio); 1272 + } 1273 + 1274 + /* 1275 + * all parity reconstruction happens here. We've read in everything 1276 + * we can find from the drives and this does the heavy lifting of 1277 + * sorting the good from the bad. 
 */
/*
 * Rebuild the missing stripe(s) for this rbio in memory.
 *
 * rbio->faila / rbio->failb identify the failed stripe indexes (-1 when
 * unused).  Every surviving stripe page has already been read; this routine
 * kmaps one page from each stripe per page offset and runs either the RAID5
 * xor path or the lib/raid6 recovery routines over them.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	int pagenr, stripe;
	void **pointers;
	int faila = -1, failb = -1;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	struct page *page;
	int err;
	int i;

	/* one kmapped page pointer per stripe, reused for each page offset */
	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
			   GFP_NOFS);
	if (!pointers) {
		err = -ENOMEM;
		goto cleanup_io;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->read_rebuild) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
		/*
		 * setup our array of pointers with pages
		 * from each stripe
		 */
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			pointers[stripe] = kmap(page);
		}

		/* all raid6 handling here */
		if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
		    RAID6_Q_STRIPE) {

			/*
			 * single failure, rebuild from parity raid5
			 * style
			 */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = -EIO;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb) {
				int tmp = failb;
				failb = faila;
				faila = tmp;
			}

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
					err = -EIO;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			/* two failures: either data+P (datap) or data+data */
			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
				raid6_2data_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila],
			       pointers[rbio->nr_data],
			       PAGE_CACHE_SIZE);

			/*
			 * rearrange the pointer array so the failed slot sits
			 * last; run_xor() then xors the first nr_data-1 pages
			 * into it, recreating the missing data
			 */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
		}
		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 *
		 * NOTE(review): this inner loop walks every page of the failed
		 * stripes on each pass of the outer pagenr loop, so the same
		 * pages are marked uptodate nr_pages times; harmless but it
		 * looks like it could be hoisted out of the loop -- confirm.
		 */
		if (!rbio->read_rebuild) {
			for (i = 0; i < nr_pages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
					SetPageUptodate(page);
				}
				if (failb != -1) {
					page = rbio_stripe_page(rbio, failb, i);
					SetPageUptodate(page);
				}
			}
		}
		/* undo the kmaps from the top of this pagenr iteration */
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			kunmap(page);
		}
	}

	err = 0;
cleanup:
	kfree(pointers);

cleanup_io:

	/*
	 * hand the result back: reads complete the original bio here; a
	 * successful rebuild inside an rmw cycle clears the failure marks
	 * and continues into finish_rmw()
	 */
	if (rbio->read_rebuild) {
		rbio_orig_end_io(rbio, err, err == 0);
	} else if (err == 0) {
		rbio->faila = -1;
		rbio->failb = -1;
		finish_rmw(rbio);
	} else {
		rbio_orig_end_io(rbio, err, 0);
	}
}

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);
	bio_put(bio);

	/* only the last completing bio triggers the reconstruction */
	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		rbio_orig_end_io(rbio, -EIO, 0);
	else
		__raid_recover_end_io(rbio);
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 *
 * Returns 0 on success; on failure the original bio is ended with
 * -EIO (read-rebuild case) and -EIO is returned.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct btrfs_bio *bbio = rbio->bbio;
	struct bio_list bio_list;
	int ret;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->bbio->error, 0);

	/*
	 * read everything that hasn't failed.
	 */
	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
		if (rbio->faila == stripe ||
		    rbio->failb == stripe)
			continue;

		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *p;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			p = rbio_stripe_page(rbio, stripe, pagenr);
			if (PageUptodate(p))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list,
				       rbio_stripe_page(rbio, stripe, pagenr),
				       stripe, pagenr, rbio->stripe_len);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
			__raid_recover_end_io(rbio);
			goto out;
		} else {
			goto cleanup;
		}
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&bbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_recover_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
out:
	return 0;

cleanup:
	if (rbio->read_rebuild)
		rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;
}

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
			  struct btrfs_bio *bbio, u64 *raid_map,
			  u64 stripe_len, int mirror_num)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio)) {
		return PTR_ERR(rbio);
	}

	rbio->read_rebuild = 1;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		/*
		 * NOTE(review): BUG() does not return, so the kfree()
		 * and return below are effectively unreachable
		 */
		BUG();
		kfree(rbio);
		return -EIO;
	}

	/*
	 * reconstruct from the q stripe if they are
	 * asking for mirror 3
	 */
	if (mirror_num == 3)
		rbio->failb = bbio->num_stripes - 2;

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with
	 * any errors it hits.  We don't want to return
	 * its error value up the stack because our caller
	 * will end up calling bio_endio with any nonzero
	 * return
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);
	/*
	 * our rbio has been added to the list of
	 * rbios that will be handled after the
	 * currently lock owner is done
	 */
	return 0;

}

/* async worker: run the read/modify/write cycle for a deferred rbio */
static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

/* async worker: run a deferred read-rebuild for a queued rbio */
static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}
+51
fs/btrfs/raid56.h
··· 1 + /* 2 + * Copyright (C) 2012 Fusion-io All rights reserved. 3 + * Copyright (C) 2012 Intel Corp. All rights reserved. 4 + * 5 + * This program is free software; you can redistribute it and/or 6 + * modify it under the terms of the GNU General Public 7 + * License v2 as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope that it will be useful, 10 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 + * General Public License for more details. 13 + * 14 + * You should have received a copy of the GNU General Public 15 + * License along with this program; if not, write to the 16 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 17 + * Boston, MA 021110-1307, USA. 18 + */ 19 + 20 + #ifndef __BTRFS_RAID56__ 21 + #define __BTRFS_RAID56__ 22 + static inline int nr_parity_stripes(struct map_lookup *map) 23 + { 24 + if (map->type & BTRFS_BLOCK_GROUP_RAID5) 25 + return 1; 26 + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 27 + return 2; 28 + else 29 + return 0; 30 + } 31 + 32 + static inline int nr_data_stripes(struct map_lookup *map) 33 + { 34 + return map->num_stripes - nr_parity_stripes(map); 35 + } 36 + #define RAID5_P_STRIPE ((u64)-2) 37 + #define RAID6_Q_STRIPE ((u64)-1) 38 + 39 + #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ 40 + ((x) == RAID6_Q_STRIPE)) 41 + 42 + int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 43 + struct btrfs_bio *bbio, u64 *raid_map, 44 + u64 stripe_len, int mirror_num); 45 + int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 46 + struct btrfs_bio *bbio, u64 *raid_map, 47 + u64 stripe_len); 48 + 49 + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); 50 + void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); 51 + #endif
+8
fs/btrfs/scrub.c
··· 28 28 #include "dev-replace.h" 29 29 #include "check-integrity.h" 30 30 #include "rcu-string.h" 31 + #include "raid56.h" 31 32 32 33 /* 33 34 * This is only the first step towards a full-features scrub. It reads all ··· 2246 2245 u64 extent_len; 2247 2246 struct btrfs_device *extent_dev; 2248 2247 int extent_mirror_num; 2248 + 2249 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 2250 + BTRFS_BLOCK_GROUP_RAID6)) { 2251 + if (num >= nr_data_stripes(map)) { 2252 + return 0; 2253 + } 2254 + } 2249 2255 2250 2256 nstripes = length; 2251 2257 offset = 0;
+3
fs/btrfs/transaction.c
··· 686 686 struct extent_state *cached_state = NULL; 687 687 u64 start = 0; 688 688 u64 end; 689 + struct blk_plug plug; 689 690 691 + blk_start_plug(&plug); 690 692 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 691 693 mark, &cached_state)) { 692 694 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, ··· 702 700 } 703 701 if (err) 704 702 werr = err; 703 + blk_finish_plug(&plug); 705 704 return werr; 706 705 } 707 706
+344 -41
fs/btrfs/volumes.c
··· 25 25 #include <linux/capability.h> 26 26 #include <linux/ratelimit.h> 27 27 #include <linux/kthread.h> 28 + #include <linux/raid/pq.h> 29 + #include <asm/div64.h> 28 30 #include "compat.h" 29 31 #include "ctree.h" 30 32 #include "extent_map.h" ··· 34 32 #include "transaction.h" 35 33 #include "print-tree.h" 36 34 #include "volumes.h" 35 + #include "raid56.h" 37 36 #include "async-thread.h" 38 37 #include "check-integrity.h" 39 38 #include "rcu-string.h" ··· 1392 1389 } 1393 1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1394 1391 1392 + if ((all_avail & (BTRFS_BLOCK_GROUP_RAID5 | 1393 + BTRFS_BLOCK_GROUP_RAID6) && num_devices <= 3)) { 1394 + printk(KERN_ERR "btrfs: unable to go below three devices " 1395 + "on raid5 or raid6\n"); 1396 + ret = -EINVAL; 1397 + goto out; 1398 + } 1399 + 1395 1400 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1396 1401 printk(KERN_ERR "btrfs: unable to go below four devices " 1397 1402 "on raid10\n"); ··· 1410 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1411 1400 printk(KERN_ERR "btrfs: unable to go below two " 1412 1401 "devices on raid1\n"); 1402 + ret = -EINVAL; 1403 + goto out; 1404 + } 1405 + 1406 + if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1407 + root->fs_info->fs_devices->rw_devices <= 2) { 1408 + printk(KERN_ERR "btrfs: unable to go below two " 1409 + "devices on raid5\n"); 1410 + ret = -EINVAL; 1411 + goto out; 1412 + } 1413 + if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1414 + root->fs_info->fs_devices->rw_devices <= 3) { 1415 + printk(KERN_ERR "btrfs: unable to go below three " 1416 + "devices on raid6\n"); 1413 1417 ret = -EINVAL; 1414 1418 goto out; 1415 1419 } ··· 2683 2657 return 0; 2684 2658 2685 2659 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2686 - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2687 - factor = 2; 2688 - else 2689 - factor = 1; 2690 - factor = num_stripes / factor; 2660 + BTRFS_BLOCK_GROUP_RAID1 | 
BTRFS_BLOCK_GROUP_RAID10)) { 2661 + factor = num_stripes / 2; 2662 + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 2663 + factor = num_stripes - 1; 2664 + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 2665 + factor = num_stripes - 2; 2666 + } else { 2667 + factor = num_stripes; 2668 + } 2691 2669 2692 2670 for (i = 0; i < num_stripes; i++) { 2693 2671 stripe = btrfs_stripe_nr(chunk, i); ··· 3006 2976 int mixed = 0; 3007 2977 int ret; 3008 2978 u64 num_devices; 2979 + int cancel = 0; 3009 2980 3010 2981 if (btrfs_fs_closing(fs_info) || 3011 2982 atomic_read(&fs_info->balance_pause_req) || ··· 3049 3018 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3050 3019 else 3051 3020 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3052 - BTRFS_BLOCK_GROUP_RAID10); 3021 + BTRFS_BLOCK_GROUP_RAID10 | 3022 + BTRFS_BLOCK_GROUP_RAID5 | 3023 + BTRFS_BLOCK_GROUP_RAID6); 3053 3024 3054 3025 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3055 3026 (!alloc_profile_is_valid(bctl->data.target, 1) || ··· 3091 3058 3092 3059 /* allow to reduce meta or sys integrity only if force set */ 3093 3060 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3094 - BTRFS_BLOCK_GROUP_RAID10; 3061 + BTRFS_BLOCK_GROUP_RAID10 | 3062 + BTRFS_BLOCK_GROUP_RAID5 | 3063 + BTRFS_BLOCK_GROUP_RAID6; 3064 + 3095 3065 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3096 3066 (fs_info->avail_system_alloc_bits & allowed) && 3097 3067 !(bctl->sys.target & allowed)) || ··· 3160 3124 } 3161 3125 3162 3126 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3163 - balance_need_close(fs_info)) { 3164 - __cancel_balance(fs_info); 3165 - } 3127 + balance_need_close(fs_info)) 3128 + cancel = 1; 3166 3129 3167 3130 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3168 3131 fs_info->num_tolerated_disk_barrier_failures = 3169 3132 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3170 3133 } 3134 + 3135 + if (cancel) 
3136 + __cancel_balance(fs_info); 3171 3137 3172 3138 wake_up(&fs_info->balance_wait_q); 3173 3139 ··· 3531 3493 } 3532 3494 3533 3495 struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3496 + /* 3497 + * sub_stripes info for map, 3498 + * dev_stripes -- stripes per dev, 2 for DUP, 1 other wise 3499 + * devs_max -- max devices per stripe, 0 for unlimited 3500 + * devs_min -- min devices per stripe 3501 + * devs_increment -- ndevs must be a multiple of this 3502 + * ncopies -- how many copies of the data we have 3503 + */ 3534 3504 { 2, 1, 0, 4, 2, 2 /* raid10 */ }, 3535 3505 { 1, 1, 2, 2, 2, 2 /* raid1 */ }, 3536 3506 { 1, 2, 1, 1, 1, 2 /* dup */ }, 3537 3507 { 1, 1, 0, 2, 1, 1 /* raid0 */ }, 3538 3508 { 1, 1, 0, 1, 1, 1 /* single */ }, 3509 + { 1, 1, 0, 2, 1, 2 /* raid5 */ }, 3510 + { 1, 1, 0, 3, 1, 3 /* raid6 */ }, 3539 3511 }; 3512 + 3513 + static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) 3514 + { 3515 + /* TODO allow them to set a preferred stripe size */ 3516 + return 64 * 1024; 3517 + } 3518 + 3519 + static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 3520 + { 3521 + u64 features; 3522 + 3523 + if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) 3524 + return; 3525 + 3526 + features = btrfs_super_incompat_flags(info->super_copy); 3527 + if (features & BTRFS_FEATURE_INCOMPAT_RAID56) 3528 + return; 3529 + 3530 + features |= BTRFS_FEATURE_INCOMPAT_RAID56; 3531 + btrfs_set_super_incompat_flags(info->super_copy, features); 3532 + printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); 3533 + } 3540 3534 3541 3535 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3542 3536 struct btrfs_root *extent_root, ··· 3585 3515 struct btrfs_device_info *devices_info = NULL; 3586 3516 u64 total_avail; 3587 3517 int num_stripes; /* total number of stripes to allocate */ 3518 + int data_stripes; /* number of stripes that count for 3519 + block group size */ 3588 3520 int 
sub_stripes; /* sub_stripes info for map */ 3589 3521 int dev_stripes; /* stripes per dev */ 3590 3522 int devs_max; /* max devs to use */ ··· 3598 3526 u64 max_chunk_size; 3599 3527 u64 stripe_size; 3600 3528 u64 num_bytes; 3529 + u64 raid_stripe_len = BTRFS_STRIPE_LEN; 3601 3530 int ndevs; 3602 3531 int i; 3603 3532 int j; ··· 3724 3651 stripe_size = devices_info[ndevs-1].max_avail; 3725 3652 num_stripes = ndevs * dev_stripes; 3726 3653 3654 + /* 3655 + * this will have to be fixed for RAID1 and RAID10 over 3656 + * more drives 3657 + */ 3658 + data_stripes = num_stripes / ncopies; 3659 + 3727 3660 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3728 3661 stripe_size = max_chunk_size * ncopies; 3729 3662 do_div(stripe_size, ndevs); 3730 3663 } 3731 - 3664 + if (type & BTRFS_BLOCK_GROUP_RAID5) { 3665 + raid_stripe_len = find_raid56_stripe_len(ndevs - 1, 3666 + btrfs_super_stripesize(info->super_copy)); 3667 + data_stripes = num_stripes - 1; 3668 + } 3669 + if (type & BTRFS_BLOCK_GROUP_RAID6) { 3670 + raid_stripe_len = find_raid56_stripe_len(ndevs - 2, 3671 + btrfs_super_stripesize(info->super_copy)); 3672 + data_stripes = num_stripes - 2; 3673 + } 3732 3674 do_div(stripe_size, dev_stripes); 3733 3675 3734 3676 /* align to BTRFS_STRIPE_LEN */ 3735 - do_div(stripe_size, BTRFS_STRIPE_LEN); 3736 - stripe_size *= BTRFS_STRIPE_LEN; 3677 + do_div(stripe_size, raid_stripe_len); 3678 + stripe_size *= raid_stripe_len; 3737 3679 3738 3680 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3739 3681 if (!map) { ··· 3766 3678 } 3767 3679 } 3768 3680 map->sector_size = extent_root->sectorsize; 3769 - map->stripe_len = BTRFS_STRIPE_LEN; 3770 - map->io_align = BTRFS_STRIPE_LEN; 3771 - map->io_width = BTRFS_STRIPE_LEN; 3681 + map->stripe_len = raid_stripe_len; 3682 + map->io_align = raid_stripe_len; 3683 + map->io_width = raid_stripe_len; 3772 3684 map->type = type; 3773 3685 map->sub_stripes = sub_stripes; 3774 3686 3775 3687 *map_ret = map; 3776 - num_bytes = 
stripe_size * (num_stripes / ncopies); 3688 + num_bytes = stripe_size * data_stripes; 3777 3689 3778 3690 *stripe_size_out = stripe_size; 3779 3691 *num_bytes_out = num_bytes; ··· 3821 3733 goto error; 3822 3734 } 3823 3735 } 3736 + 3737 + check_raid56_incompat_flag(extent_root->fs_info, type); 3824 3738 3825 3739 kfree(devices_info); 3826 3740 return 0; ··· 4093 4003 ret = map->num_stripes; 4094 4004 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4095 4005 ret = map->sub_stripes; 4006 + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 4007 + ret = 2; 4008 + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4009 + ret = 3; 4096 4010 else 4097 4011 ret = 1; 4098 4012 free_extent_map(em); ··· 4106 4012 ret++; 4107 4013 btrfs_dev_replace_unlock(&fs_info->dev_replace); 4108 4014 4015 + return ret; 4016 + } 4017 + 4018 + unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 4019 + struct btrfs_mapping_tree *map_tree, 4020 + u64 logical) 4021 + { 4022 + struct extent_map *em; 4023 + struct map_lookup *map; 4024 + struct extent_map_tree *em_tree = &map_tree->map_tree; 4025 + unsigned long len = root->sectorsize; 4026 + 4027 + read_lock(&em_tree->lock); 4028 + em = lookup_extent_mapping(em_tree, logical, len); 4029 + read_unlock(&em_tree->lock); 4030 + BUG_ON(!em); 4031 + 4032 + BUG_ON(em->start > logical || em->start + em->len < logical); 4033 + map = (struct map_lookup *)em->bdev; 4034 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4035 + BTRFS_BLOCK_GROUP_RAID6)) { 4036 + len = map->stripe_len * nr_data_stripes(map); 4037 + } 4038 + free_extent_map(em); 4039 + return len; 4040 + } 4041 + 4042 + int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 4043 + u64 logical, u64 len, int mirror_num) 4044 + { 4045 + struct extent_map *em; 4046 + struct map_lookup *map; 4047 + struct extent_map_tree *em_tree = &map_tree->map_tree; 4048 + int ret = 0; 4049 + 4050 + read_lock(&em_tree->lock); 4051 + em = lookup_extent_mapping(em_tree, logical, len); 4052 + 
read_unlock(&em_tree->lock); 4053 + BUG_ON(!em); 4054 + 4055 + BUG_ON(em->start > logical || em->start + em->len < logical); 4056 + map = (struct map_lookup *)em->bdev; 4057 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4058 + BTRFS_BLOCK_GROUP_RAID6)) 4059 + ret = 1; 4060 + free_extent_map(em); 4109 4061 return ret; 4110 4062 } 4111 4063 ··· 4192 4052 return optimal; 4193 4053 } 4194 4054 4055 + static inline int parity_smaller(u64 a, u64 b) 4056 + { 4057 + return a > b; 4058 + } 4059 + 4060 + /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 4061 + static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4062 + { 4063 + struct btrfs_bio_stripe s; 4064 + int i; 4065 + u64 l; 4066 + int again = 1; 4067 + 4068 + while (again) { 4069 + again = 0; 4070 + for (i = 0; i < bbio->num_stripes - 1; i++) { 4071 + if (parity_smaller(raid_map[i], raid_map[i+1])) { 4072 + s = bbio->stripes[i]; 4073 + l = raid_map[i]; 4074 + bbio->stripes[i] = bbio->stripes[i+1]; 4075 + raid_map[i] = raid_map[i+1]; 4076 + bbio->stripes[i+1] = s; 4077 + raid_map[i+1] = l; 4078 + again = 1; 4079 + } 4080 + } 4081 + } 4082 + } 4083 + 4195 4084 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4196 4085 u64 logical, u64 *length, 4197 4086 struct btrfs_bio **bbio_ret, 4198 - int mirror_num) 4087 + int mirror_num, u64 **raid_map_ret) 4199 4088 { 4200 4089 struct extent_map *em; 4201 4090 struct map_lookup *map; ··· 4236 4067 u64 stripe_nr; 4237 4068 u64 stripe_nr_orig; 4238 4069 u64 stripe_nr_end; 4070 + u64 stripe_len; 4071 + u64 *raid_map = NULL; 4239 4072 int stripe_index; 4240 4073 int i; 4241 4074 int ret = 0; ··· 4249 4078 int num_alloc_stripes; 4250 4079 int patch_the_first_stripe_for_dev_replace = 0; 4251 4080 u64 physical_to_patch_in_first_stripe = 0; 4081 + u64 raid56_full_stripe_start = (u64)-1; 4252 4082 4253 4083 read_lock(&em_tree->lock); 4254 4084 em = lookup_extent_mapping(em_tree, logical, *length); ··· 4266 4094 map = (struct 
map_lookup *)em->bdev; 4267 4095 offset = logical - em->start; 4268 4096 4097 + if (mirror_num > map->num_stripes) 4098 + mirror_num = 0; 4099 + 4100 + stripe_len = map->stripe_len; 4269 4101 stripe_nr = offset; 4270 4102 /* 4271 4103 * stripe_nr counts the total number of stripes we have to stride 4272 4104 * to get to this block 4273 4105 */ 4274 - do_div(stripe_nr, map->stripe_len); 4106 + do_div(stripe_nr, stripe_len); 4275 4107 4276 - stripe_offset = stripe_nr * map->stripe_len; 4108 + stripe_offset = stripe_nr * stripe_len; 4277 4109 BUG_ON(offset < stripe_offset); 4278 4110 4279 4111 /* stripe_offset is the offset of this block in its stripe*/ 4280 4112 stripe_offset = offset - stripe_offset; 4281 4113 4282 - if (rw & REQ_DISCARD) 4114 + /* if we're here for raid56, we need to know the stripe aligned start */ 4115 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4116 + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 4117 + raid56_full_stripe_start = offset; 4118 + 4119 + /* allow a write of a full stripe, but make sure we don't 4120 + * allow straddling of stripes 4121 + */ 4122 + do_div(raid56_full_stripe_start, full_stripe_len); 4123 + raid56_full_stripe_start *= full_stripe_len; 4124 + } 4125 + 4126 + if (rw & REQ_DISCARD) { 4127 + /* we don't discard raid56 yet */ 4128 + if (map->type & 4129 + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4130 + ret = -EOPNOTSUPP; 4131 + goto out; 4132 + } 4283 4133 *length = min_t(u64, em->len - offset, *length); 4284 - else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4285 - /* we limit the length of each bio to what fits in a stripe */ 4286 - *length = min_t(u64, em->len - offset, 4287 - map->stripe_len - stripe_offset); 4134 + } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4135 + u64 max_len; 4136 + /* For writes to RAID[56], allow a full stripeset across all disks. 
4137 + For other RAID types and for RAID[56] reads, just allow a single 4138 + stripe (on a single disk). */ 4139 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && 4140 + (rw & REQ_WRITE)) { 4141 + max_len = stripe_len * nr_data_stripes(map) - 4142 + (offset - raid56_full_stripe_start); 4143 + } else { 4144 + /* we limit the length of each bio to what fits in a stripe */ 4145 + max_len = stripe_len - stripe_offset; 4146 + } 4147 + *length = min_t(u64, em->len - offset, max_len); 4288 4148 } else { 4289 4149 *length = em->len - offset; 4290 4150 } 4291 4151 4152 + /* This is for when we're called from btrfs_merge_bio_hook() and all 4153 + it cares about is the length */ 4292 4154 if (!bbio_ret) 4293 4155 goto out; 4294 4156 ··· 4355 4149 u64 physical_of_found = 0; 4356 4150 4357 4151 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4358 - logical, &tmp_length, &tmp_bbio, 0); 4152 + logical, &tmp_length, &tmp_bbio, 0, NULL); 4359 4153 if (ret) { 4360 4154 WARN_ON(tmp_bbio != NULL); 4361 4155 goto out; ··· 4421 4215 do_div(stripe_nr_end, map->stripe_len); 4422 4216 stripe_end_offset = stripe_nr_end * map->stripe_len - 4423 4217 (offset + *length); 4218 + 4424 4219 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4425 4220 if (rw & REQ_DISCARD) 4426 4221 num_stripes = min_t(u64, map->num_stripes, ··· 4471 4264 current->pid % map->sub_stripes, 4472 4265 dev_replace_is_ongoing); 4473 4266 mirror_num = stripe_index - old_stripe_index + 1; 4267 + } 4268 + 4269 + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4270 + BTRFS_BLOCK_GROUP_RAID6)) { 4271 + u64 tmp; 4272 + 4273 + if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) 4274 + && raid_map_ret) { 4275 + int i, rot; 4276 + 4277 + /* push stripe_nr back to the start of the full stripe */ 4278 + stripe_nr = raid56_full_stripe_start; 4279 + do_div(stripe_nr, stripe_len); 4280 + 4281 + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 4282 + 4283 + /* RAID[56] write or recovery. 
Return all stripes */ 4284 + num_stripes = map->num_stripes; 4285 + max_errors = nr_parity_stripes(map); 4286 + 4287 + raid_map = kmalloc(sizeof(u64) * num_stripes, 4288 + GFP_NOFS); 4289 + if (!raid_map) { 4290 + ret = -ENOMEM; 4291 + goto out; 4292 + } 4293 + 4294 + /* Work out the disk rotation on this stripe-set */ 4295 + tmp = stripe_nr; 4296 + rot = do_div(tmp, num_stripes); 4297 + 4298 + /* Fill in the logical address of each stripe */ 4299 + tmp = stripe_nr * nr_data_stripes(map); 4300 + for (i = 0; i < nr_data_stripes(map); i++) 4301 + raid_map[(i+rot) % num_stripes] = 4302 + em->start + (tmp + i) * map->stripe_len; 4303 + 4304 + raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 4305 + if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4306 + raid_map[(i+rot+1) % num_stripes] = 4307 + RAID6_Q_STRIPE; 4308 + 4309 + *length = map->stripe_len; 4310 + stripe_index = 0; 4311 + stripe_offset = 0; 4312 + } else { 4313 + /* 4314 + * Mirror #0 or #1 means the original data block. 4315 + * Mirror #2 is RAID5 parity block. 4316 + * Mirror #3 is RAID6 Q block. 
4317 + */ 4318 + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 4319 + if (mirror_num > 1) 4320 + stripe_index = nr_data_stripes(map) + 4321 + mirror_num - 2; 4322 + 4323 + /* We distribute the parity blocks across stripes */ 4324 + tmp = stripe_nr + stripe_index; 4325 + stripe_index = do_div(tmp, map->num_stripes); 4474 4326 } 4475 4327 } else { 4476 4328 /* ··· 4639 4373 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4640 4374 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4641 4375 BTRFS_BLOCK_GROUP_RAID10 | 4376 + BTRFS_BLOCK_GROUP_RAID5 | 4642 4377 BTRFS_BLOCK_GROUP_DUP)) { 4643 4378 max_errors = 1; 4379 + } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 4380 + max_errors = 2; 4644 4381 } 4645 4382 } 4646 4383 ··· 4744 4475 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4745 4476 bbio->mirror_num = map->num_stripes + 1; 4746 4477 } 4478 + if (raid_map) { 4479 + sort_parity_stripes(bbio, raid_map); 4480 + *raid_map_ret = raid_map; 4481 + } 4747 4482 out: 4748 4483 if (dev_replace_is_ongoing) 4749 4484 btrfs_dev_replace_unlock(dev_replace); ··· 4760 4487 struct btrfs_bio **bbio_ret, int mirror_num) 4761 4488 { 4762 4489 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4763 - mirror_num); 4490 + mirror_num, NULL); 4764 4491 } 4765 4492 4766 4493 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, ··· 4774 4501 u64 bytenr; 4775 4502 u64 length; 4776 4503 u64 stripe_nr; 4504 + u64 rmap_len; 4777 4505 int i, j, nr = 0; 4778 4506 4779 4507 read_lock(&em_tree->lock); ··· 4785 4511 map = (struct map_lookup *)em->bdev; 4786 4512 4787 4513 length = em->len; 4514 + rmap_len = map->stripe_len; 4515 + 4788 4516 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4789 4517 do_div(length, map->num_stripes / map->sub_stripes); 4790 4518 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4791 4519 do_div(length, map->num_stripes); 4520 + else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4521 + BTRFS_BLOCK_GROUP_RAID6)) { 4522 + do_div(length, 
nr_data_stripes(map)); 4523 + rmap_len = map->stripe_len * nr_data_stripes(map); 4524 + } 4792 4525 4793 4526 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4794 4527 BUG_ON(!buf); /* -ENOMEM */ ··· 4815 4534 do_div(stripe_nr, map->sub_stripes); 4816 4535 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4817 4536 stripe_nr = stripe_nr * map->num_stripes + i; 4818 - } 4819 - bytenr = chunk_start + stripe_nr * map->stripe_len; 4537 + } /* else if RAID[56], multiply by nr_data_stripes(). 4538 + * Alternatively, just use rmap_len below instead of 4539 + * map->stripe_len */ 4540 + 4541 + bytenr = chunk_start + stripe_nr * rmap_len; 4820 4542 WARN_ON(nr >= map->num_stripes); 4821 4543 for (j = 0; j < nr; j++) { 4822 4544 if (buf[j] == bytenr) ··· 4833 4549 4834 4550 *logical = buf; 4835 4551 *naddrs = nr; 4836 - *stripe_len = map->stripe_len; 4552 + *stripe_len = rmap_len; 4837 4553 4838 4554 free_extent_map(em); 4839 4555 return 0; ··· 4907 4623 bio->bi_bdev = (struct block_device *) 4908 4624 (unsigned long)bbio->mirror_num; 4909 4625 /* only send an error to the higher layers if it is 4910 - * beyond the tolerance of the multi-bio 4626 + * beyond the tolerance of the btrfs bio 4911 4627 */ 4912 4628 if (atomic_read(&bbio->error) > bbio->max_errors) { 4913 4629 err = -EIO; ··· 4941 4657 * This will add one bio to the pending list for a device and make sure 4942 4658 * the work struct is scheduled. 
4943 4659 */ 4944 - static noinline void schedule_bio(struct btrfs_root *root, 4660 + noinline void btrfs_schedule_bio(struct btrfs_root *root, 4945 4661 struct btrfs_device *device, 4946 4662 int rw, struct bio *bio) 4947 4663 { 4948 4664 int should_queue = 1; 4949 4665 struct btrfs_pending_bios *pending_bios; 4666 + 4667 + if (device->missing || !device->bdev) { 4668 + bio_endio(bio, -EIO); 4669 + return; 4670 + } 4950 4671 4951 4672 /* don't bother with additional async steps for reads, right now */ 4952 4673 if (!(rw & REQ_WRITE)) { ··· 5050 4761 #endif 5051 4762 bio->bi_bdev = dev->bdev; 5052 4763 if (async) 5053 - schedule_bio(root, dev, rw, bio); 4764 + btrfs_schedule_bio(root, dev, rw, bio); 5054 4765 else 5055 4766 btrfsic_submit_bio(rw, bio); 5056 4767 } ··· 5109 4820 u64 logical = (u64)bio->bi_sector << 9; 5110 4821 u64 length = 0; 5111 4822 u64 map_length; 4823 + u64 *raid_map = NULL; 5112 4824 int ret; 5113 4825 int dev_nr = 0; 5114 4826 int total_devs = 1; ··· 5118 4828 length = bio->bi_size; 5119 4829 map_length = length; 5120 4830 5121 - ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5122 - mirror_num); 5123 - if (ret) 4831 + ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 4832 + mirror_num, &raid_map); 4833 + if (ret) /* -ENOMEM */ 5124 4834 return ret; 5125 4835 5126 4836 total_devs = bbio->num_stripes; 4837 + bbio->orig_bio = first_bio; 4838 + bbio->private = first_bio->bi_private; 4839 + bbio->end_io = first_bio->bi_end_io; 4840 + atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4841 + 4842 + if (raid_map) { 4843 + /* In this case, map_length has been set to the length of 4844 + a single stripe; not the whole write */ 4845 + if (rw & WRITE) { 4846 + return raid56_parity_write(root, bio, bbio, 4847 + raid_map, map_length); 4848 + } else { 4849 + return raid56_parity_recover(root, bio, bbio, 4850 + raid_map, map_length, 4851 + mirror_num); 4852 + } 4853 + } 4854 + 5127 4855 if (map_length < 
length) { 5128 4856 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " 5129 4857 "len %llu\n", (unsigned long long)logical, ··· 5149 4841 (unsigned long long)map_length); 5150 4842 BUG(); 5151 4843 } 5152 - 5153 - bbio->orig_bio = first_bio; 5154 - bbio->private = first_bio->bi_private; 5155 - bbio->end_io = first_bio->bi_end_io; 5156 - atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5157 4844 5158 4845 while (dev_nr < total_devs) { 5159 4846 dev = bbio->stripes[dev_nr].dev;
+8 -1
fs/btrfs/volumes.h
··· 321 321 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 322 322 struct btrfs_device *tgtdev); 323 323 int btrfs_scratch_superblock(struct btrfs_device *device); 324 - 324 + void btrfs_schedule_bio(struct btrfs_root *root, 325 + struct btrfs_device *device, 326 + int rw, struct bio *bio); 327 + int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 328 + u64 logical, u64 len, int mirror_num); 329 + unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 330 + struct btrfs_mapping_tree *map_tree, 331 + u64 logical); 325 332 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 326 333 int index) 327 334 {