Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Btrfs: RAID5 and RAID6

This builds on David Woodhouse's original Btrfs raid5/6 implementation.
The code has changed quite a bit, blame Chris Mason for any bugs.

Read/modify/write is done after the higher levels of the filesystem have
prepared a given bio. This means the higher layers are not responsible
for building full stripes, and they don't need to query for the topology
of the extents that may get allocated during delayed allocation runs.
It also means different files can easily share the same stripe.

But, it does expose us to incorrect parity if we crash or lose power
while doing a read/modify/write cycle. This will be addressed in a
later commit.

Scrub is unable to repair crc errors on raid5/6 chunks.

Discard does not work on raid5/6 (yet).

The stripe size is fixed at 64KiB per disk. This will be tunable
in a later commit.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>

Authored by David Woodhouse and committed by Chris Mason.
53b381b3 64a16701

+2283 -102
+2
fs/btrfs/Kconfig
··· 6 6 select ZLIB_DEFLATE 7 7 select LZO_COMPRESS 8 8 select LZO_DECOMPRESS 9 + select RAID6_PQ 10 + 9 11 help 10 12 Btrfs is a new filesystem with extents, writable snapshotting, 11 13 support for multiple devices and many more features.
+1 -1
fs/btrfs/Makefile
··· 8 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 9 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 10 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 11 - reada.o backref.o ulist.o qgroup.o send.o dev-replace.o 11 + reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o 12 12 13 13 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 14 14 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+34 -1
fs/btrfs/ctree.h
··· 502 502 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 503 503 504 504 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) 505 + #define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) 505 506 506 507 #define BTRFS_FEATURE_COMPAT_SUPP 0ULL 507 508 #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL ··· 512 511 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 513 512 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 514 513 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ 514 + BTRFS_FEATURE_INCOMPAT_RAID56 | \ 515 515 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) 516 516 517 517 /* ··· 954 952 #define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) 955 953 #define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) 956 954 #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 955 + #define BTRFS_BLOCK_GROUP_RAID5 (1 << 7) 956 + #define BTRFS_BLOCK_GROUP_RAID6 (1 << 8) 957 957 #define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 958 - #define BTRFS_NR_RAID_TYPES 5 958 + #define BTRFS_NR_RAID_TYPES 7 959 959 960 960 #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ 961 961 BTRFS_BLOCK_GROUP_SYSTEM | \ ··· 965 961 966 962 #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 967 963 BTRFS_BLOCK_GROUP_RAID1 | \ 964 + BTRFS_BLOCK_GROUP_RAID5 | \ 965 + BTRFS_BLOCK_GROUP_RAID6 | \ 968 966 BTRFS_BLOCK_GROUP_DUP | \ 969 967 BTRFS_BLOCK_GROUP_RAID10) 970 968 /* ··· 1191 1185 u64 flags; 1192 1186 u64 sectorsize; 1193 1187 u64 cache_generation; 1188 + 1189 + /* for raid56, this is a full stripe, without parity */ 1190 + unsigned long full_stripe_len; 1191 + 1194 1192 unsigned int ro:1; 1195 1193 unsigned int dirty:1; 1196 1194 unsigned int iref:1; ··· 1234 1224 struct list_head list; 1235 1225 u64 seq; 1236 1226 }; 1227 + 1228 + /* used by the raid56 code to lock stripes for read/modify/write */ 1229 + struct btrfs_stripe_hash { 1230 + struct list_head hash_list; 1231 + wait_queue_head_t wait; 1232 + spinlock_t lock; 1233 + }; 1234 + 1235 + /* used by the raid56 code to lock stripes for read/modify/write */ 
1236 + struct btrfs_stripe_hash_table { 1237 + struct btrfs_stripe_hash *table; 1238 + }; 1239 + 1240 + #define BTRFS_STRIPE_HASH_TABLE_BITS 11 1237 1241 1238 1242 /* fs_info */ 1239 1243 struct reloc_control; ··· 1331 1307 struct mutex cleaner_mutex; 1332 1308 struct mutex chunk_mutex; 1333 1309 struct mutex volume_mutex; 1310 + 1311 + /* this is used during read/modify/write to make sure 1312 + * no two ios are trying to mod the same stripe at the same 1313 + * time 1314 + */ 1315 + struct btrfs_stripe_hash_table *stripe_hash_table; 1316 + 1334 1317 /* 1335 1318 * this protects the ordered operations list only while we are 1336 1319 * processing all of the entries on it. This way we make ··· 1426 1395 struct btrfs_workers flush_workers; 1427 1396 struct btrfs_workers endio_workers; 1428 1397 struct btrfs_workers endio_meta_workers; 1398 + struct btrfs_workers endio_raid56_workers; 1399 + struct btrfs_workers rmw_workers; 1429 1400 struct btrfs_workers endio_meta_write_workers; 1430 1401 struct btrfs_workers endio_write_workers; 1431 1402 struct btrfs_workers endio_freespace_worker;
+53 -9
fs/btrfs/disk-io.c
··· 46 46 #include "check-integrity.h" 47 47 #include "rcu-string.h" 48 48 #include "dev-replace.h" 49 + #include "raid56.h" 49 50 50 51 #ifdef CONFIG_X86 51 52 #include <asm/cpufeature.h> ··· 640 639 btree_readahead_hook(root, eb, eb->start, ret); 641 640 } 642 641 643 - if (ret) 642 + if (ret) { 643 + /* 644 + * our io error hook is going to dec the io pages 645 + * again, we have to make sure it has something 646 + * to decrement 647 + */ 648 + atomic_inc(&eb->io_pages); 644 649 clear_extent_buffer_uptodate(eb); 650 + } 645 651 free_extent_buffer(eb); 646 652 out: 647 653 return ret; ··· 662 654 eb = (struct extent_buffer *)page->private; 663 655 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 664 656 eb->read_mirror = failed_mirror; 657 + atomic_dec(&eb->io_pages); 665 658 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 666 659 btree_readahead_hook(root, eb, eb->start, -EIO); 667 660 return -EIO; /* we fixed nothing */ ··· 679 670 end_io_wq->work.flags = 0; 680 671 681 672 if (bio->bi_rw & REQ_WRITE) { 682 - if (end_io_wq->metadata == 1) 673 + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 683 674 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 684 675 &end_io_wq->work); 685 - else if (end_io_wq->metadata == 2) 676 + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 686 677 btrfs_queue_worker(&fs_info->endio_freespace_worker, 678 + &end_io_wq->work); 679 + else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 680 + btrfs_queue_worker(&fs_info->endio_raid56_workers, 687 681 &end_io_wq->work); 688 682 else 689 683 btrfs_queue_worker(&fs_info->endio_write_workers, 690 684 &end_io_wq->work); 691 685 } else { 692 - if (end_io_wq->metadata) 686 + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 687 + btrfs_queue_worker(&fs_info->endio_raid56_workers, 688 + &end_io_wq->work); 689 + else if (end_io_wq->metadata) 693 690 btrfs_queue_worker(&fs_info->endio_meta_workers, 694 691 &end_io_wq->work); 695 692 else ··· 710 695 * 0 - if data 711 
696 * 1 - if normal metadta 712 697 * 2 - if writing to the free space cache area 698 + * 3 - raid parity work 713 699 */ 714 700 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 715 701 int metadata) ··· 2181 2165 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2182 2166 init_waitqueue_head(&fs_info->async_submit_wait); 2183 2167 2168 + ret = btrfs_alloc_stripe_hash_table(fs_info); 2169 + if (ret) { 2170 + err = -ENOMEM; 2171 + goto fail_alloc; 2172 + } 2173 + 2184 2174 __setup_root(4096, 4096, 4096, 4096, tree_root, 2185 2175 fs_info, BTRFS_ROOT_TREE_OBJECTID); 2186 2176 ··· 2354 2332 btrfs_init_workers(&fs_info->endio_meta_write_workers, 2355 2333 "endio-meta-write", fs_info->thread_pool_size, 2356 2334 &fs_info->generic_worker); 2335 + btrfs_init_workers(&fs_info->endio_raid56_workers, 2336 + "endio-raid56", fs_info->thread_pool_size, 2337 + &fs_info->generic_worker); 2338 + btrfs_init_workers(&fs_info->rmw_workers, 2339 + "rmw", fs_info->thread_pool_size, 2340 + &fs_info->generic_worker); 2357 2341 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 2358 2342 fs_info->thread_pool_size, 2359 2343 &fs_info->generic_worker); ··· 2378 2350 */ 2379 2351 fs_info->endio_workers.idle_thresh = 4; 2380 2352 fs_info->endio_meta_workers.idle_thresh = 4; 2353 + fs_info->endio_raid56_workers.idle_thresh = 4; 2354 + fs_info->rmw_workers.idle_thresh = 2; 2381 2355 2382 2356 fs_info->endio_write_workers.idle_thresh = 2; 2383 2357 fs_info->endio_meta_write_workers.idle_thresh = 2; ··· 2396 2366 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2397 2367 ret |= btrfs_start_workers(&fs_info->endio_workers); 2398 2368 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2369 + ret |= btrfs_start_workers(&fs_info->rmw_workers); 2370 + ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); 2399 2371 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2400 2372 ret |= 
btrfs_start_workers(&fs_info->endio_write_workers); 2401 2373 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); ··· 2742 2710 btrfs_stop_workers(&fs_info->workers); 2743 2711 btrfs_stop_workers(&fs_info->endio_workers); 2744 2712 btrfs_stop_workers(&fs_info->endio_meta_workers); 2713 + btrfs_stop_workers(&fs_info->endio_raid56_workers); 2714 + btrfs_stop_workers(&fs_info->rmw_workers); 2745 2715 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2746 2716 btrfs_stop_workers(&fs_info->endio_write_workers); 2747 2717 btrfs_stop_workers(&fs_info->endio_freespace_worker); ··· 2762 2728 fail_srcu: 2763 2729 cleanup_srcu_struct(&fs_info->subvol_srcu); 2764 2730 fail: 2731 + btrfs_free_stripe_hash_table(fs_info); 2765 2732 btrfs_close_devices(fs_info->fs_devices); 2766 2733 return err; 2767 2734 ··· 3111 3076 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) 3112 3077 == 0))) 3113 3078 num_tolerated_disk_barrier_failures = 0; 3114 - else if (num_tolerated_disk_barrier_failures > 1 3115 - && 3116 - (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3117 - BTRFS_BLOCK_GROUP_RAID10))) 3118 - num_tolerated_disk_barrier_failures = 1; 3079 + else if (num_tolerated_disk_barrier_failures > 1) { 3080 + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3081 + BTRFS_BLOCK_GROUP_RAID5 | 3082 + BTRFS_BLOCK_GROUP_RAID10)) { 3083 + num_tolerated_disk_barrier_failures = 1; 3084 + } else if (flags & 3085 + BTRFS_BLOCK_GROUP_RAID5) { 3086 + num_tolerated_disk_barrier_failures = 2; 3087 + } 3088 + } 3119 3089 } 3120 3090 } 3121 3091 up_read(&sinfo->groups_sem); ··· 3424 3384 btrfs_stop_workers(&fs_info->workers); 3425 3385 btrfs_stop_workers(&fs_info->endio_workers); 3426 3386 btrfs_stop_workers(&fs_info->endio_meta_workers); 3387 + btrfs_stop_workers(&fs_info->endio_raid56_workers); 3388 + btrfs_stop_workers(&fs_info->rmw_workers); 3427 3389 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 3428 3390 btrfs_stop_workers(&fs_info->endio_write_workers); 3429 3391 
btrfs_stop_workers(&fs_info->endio_freespace_worker); ··· 3445 3403 3446 3404 bdi_destroy(&fs_info->bdi); 3447 3405 cleanup_srcu_struct(&fs_info->subvol_srcu); 3406 + 3407 + btrfs_free_stripe_hash_table(fs_info); 3448 3408 3449 3409 return 0; 3450 3410 }
+7
fs/btrfs/disk-io.h
··· 25 25 #define BTRFS_SUPER_MIRROR_MAX 3 26 26 #define BTRFS_SUPER_MIRROR_SHIFT 12 27 27 28 + enum { 29 + BTRFS_WQ_ENDIO_DATA = 0, 30 + BTRFS_WQ_ENDIO_METADATA = 1, 31 + BTRFS_WQ_ENDIO_FREE_SPACE = 2, 32 + BTRFS_WQ_ENDIO_RAID56 = 3, 33 + }; 34 + 28 35 static inline u64 btrfs_sb_offset(int mirror) 29 36 { 30 37 u64 start = 16 * 1024;
+59 -29
fs/btrfs/extent-tree.c
··· 31 31 #include "print-tree.h" 32 32 #include "transaction.h" 33 33 #include "volumes.h" 34 + #include "raid56.h" 34 35 #include "locking.h" 35 36 #include "free-space-cache.h" 36 37 #include "math.h" ··· 1853 1852 *actual_bytes = discarded_bytes; 1854 1853 1855 1854 1855 + if (ret == -EOPNOTSUPP) 1856 + ret = 0; 1856 1857 return ret; 1857 1858 } 1858 1859 ··· 3279 3276 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3280 3277 root->fs_info->fs_devices->missing_devices; 3281 3278 u64 target; 3279 + u64 tmp; 3282 3280 3283 3281 /* 3284 3282 * see if restripe for this chunk_type is in progress, if so ··· 3296 3292 } 3297 3293 spin_unlock(&root->fs_info->balance_lock); 3298 3294 3295 + /* First, mask out the RAID levels which aren't possible */ 3299 3296 if (num_devices == 1) 3300 - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3297 + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3298 + BTRFS_BLOCK_GROUP_RAID5); 3299 + if (num_devices < 3) 3300 + flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3301 3301 if (num_devices < 4) 3302 3302 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3303 3303 3304 - if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3305 - (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3306 - BTRFS_BLOCK_GROUP_RAID10))) { 3307 - flags &= ~BTRFS_BLOCK_GROUP_DUP; 3308 - } 3304 + tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3305 + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3306 + BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3307 + flags &= ~tmp; 3309 3308 3310 - if ((flags & BTRFS_BLOCK_GROUP_RAID1) && 3311 - (flags & BTRFS_BLOCK_GROUP_RAID10)) { 3312 - flags &= ~BTRFS_BLOCK_GROUP_RAID1; 3313 - } 3309 + if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3310 + tmp = BTRFS_BLOCK_GROUP_RAID6; 3311 + else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3312 + tmp = BTRFS_BLOCK_GROUP_RAID5; 3313 + else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3314 + tmp = BTRFS_BLOCK_GROUP_RAID10; 3315 + else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3316 + tmp = 
BTRFS_BLOCK_GROUP_RAID1; 3317 + else if (tmp & BTRFS_BLOCK_GROUP_RAID0) 3318 + tmp = BTRFS_BLOCK_GROUP_RAID0; 3314 3319 3315 - if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3316 - ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3317 - (flags & BTRFS_BLOCK_GROUP_RAID10) | 3318 - (flags & BTRFS_BLOCK_GROUP_DUP))) { 3319 - flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3320 - } 3321 - 3322 - return extended_to_chunk(flags); 3320 + return extended_to_chunk(flags | tmp); 3323 3321 } 3324 3322 3325 3323 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) ··· 3339 3333 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3340 3334 { 3341 3335 u64 flags; 3336 + u64 ret; 3342 3337 3343 3338 if (data) 3344 3339 flags = BTRFS_BLOCK_GROUP_DATA; ··· 3348 3341 else 3349 3342 flags = BTRFS_BLOCK_GROUP_METADATA; 3350 3343 3351 - return get_alloc_profile(root, flags); 3344 + ret = get_alloc_profile(root, flags); 3345 + return ret; 3352 3346 } 3353 3347 3354 3348 /* ··· 3524 3516 { 3525 3517 u64 num_dev; 3526 3518 3527 - if (type & BTRFS_BLOCK_GROUP_RAID10 || 3528 - type & BTRFS_BLOCK_GROUP_RAID0) 3519 + if (type & (BTRFS_BLOCK_GROUP_RAID10 | 3520 + BTRFS_BLOCK_GROUP_RAID0 | 3521 + BTRFS_BLOCK_GROUP_RAID5 | 3522 + BTRFS_BLOCK_GROUP_RAID6)) 3529 3523 num_dev = root->fs_info->fs_devices->rw_devices; 3530 3524 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3531 3525 num_dev = 2; ··· 3677 3667 3678 3668 /* 3679 3669 * If we have dup, raid1 or raid10 then only half of the free 3680 - * space is actually useable. 3670 + * space is actually useable. 
For raid56, the space info used 3671 + * doesn't include the parity drive, so we don't have to 3672 + * change the math 3681 3673 */ 3682 3674 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3683 3675 BTRFS_BLOCK_GROUP_RAID1 | ··· 5467 5455 return ret; 5468 5456 } 5469 5457 5470 - static u64 stripe_align(struct btrfs_root *root, u64 val) 5458 + static u64 stripe_align(struct btrfs_root *root, 5459 + struct btrfs_block_group_cache *cache, 5460 + u64 val, u64 num_bytes) 5471 5461 { 5472 - u64 mask = ((u64)root->stripesize - 1); 5473 - u64 ret = (val + mask) & ~mask; 5462 + u64 mask; 5463 + u64 ret; 5464 + mask = ((u64)root->stripesize - 1); 5465 + ret = (val + mask) & ~mask; 5474 5466 return ret; 5475 5467 } 5476 5468 ··· 5535 5519 index = 2; 5536 5520 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5537 5521 index = 3; 5522 + else if (flags & BTRFS_BLOCK_GROUP_RAID5) 5523 + index = 5; 5524 + else if (flags & BTRFS_BLOCK_GROUP_RAID6) 5525 + index = 6; 5538 5526 else 5539 - index = 4; 5540 - 5527 + index = 4; /* BTRFS_BLOCK_GROUP_SINGLE */ 5541 5528 return index; 5542 5529 } 5543 5530 ··· 5684 5665 if (!block_group_bits(block_group, data)) { 5685 5666 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5686 5667 BTRFS_BLOCK_GROUP_RAID1 | 5668 + BTRFS_BLOCK_GROUP_RAID5 | 5669 + BTRFS_BLOCK_GROUP_RAID6 | 5687 5670 BTRFS_BLOCK_GROUP_RAID10; 5688 5671 5689 5672 /* ··· 5856 5835 goto loop; 5857 5836 } 5858 5837 checks: 5859 - search_start = stripe_align(root, offset); 5838 + search_start = stripe_align(root, used_block_group, 5839 + offset, num_bytes); 5860 5840 5861 5841 /* move on to the next group */ 5862 5842 if (search_start + num_bytes > ··· 7225 7203 root->fs_info->fs_devices->missing_devices; 7226 7204 7227 7205 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7206 + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | 7228 7207 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7229 7208 7230 7209 if (num_devices == 1) { ··· 7777 7754 btrfs_release_path(path); 7778 7755 cache->flags = 
btrfs_block_group_flags(&cache->item); 7779 7756 cache->sectorsize = root->sectorsize; 7780 - 7757 + cache->full_stripe_len = btrfs_full_stripe_len(root, 7758 + &root->fs_info->mapping_tree, 7759 + found_key.objectid); 7781 7760 btrfs_init_free_space_ctl(cache); 7782 7761 7783 7762 /* ··· 7833 7808 if (!(get_alloc_profile(root, space_info->flags) & 7834 7809 (BTRFS_BLOCK_GROUP_RAID10 | 7835 7810 BTRFS_BLOCK_GROUP_RAID1 | 7811 + BTRFS_BLOCK_GROUP_RAID5 | 7812 + BTRFS_BLOCK_GROUP_RAID6 | 7836 7813 BTRFS_BLOCK_GROUP_DUP))) 7837 7814 continue; 7838 7815 /* ··· 7910 7883 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7911 7884 cache->sectorsize = root->sectorsize; 7912 7885 cache->fs_info = root->fs_info; 7886 + cache->full_stripe_len = btrfs_full_stripe_len(root, 7887 + &root->fs_info->mapping_tree, 7888 + chunk_offset); 7913 7889 7914 7890 atomic_set(&cache->count, 1); 7915 7891 spin_lock_init(&cache->lock);
+11 -7
fs/btrfs/extent_io.c
··· 1895 1895 if (ret) 1896 1896 err = ret; 1897 1897 1898 - if (did_repair) { 1899 - ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1900 - rec->start + rec->len - 1, 1901 - EXTENT_DAMAGED, GFP_NOFS); 1902 - if (ret && !err) 1903 - err = ret; 1904 - } 1898 + ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1899 + rec->start + rec->len - 1, 1900 + EXTENT_DAMAGED, GFP_NOFS); 1901 + if (ret && !err) 1902 + err = ret; 1905 1903 1906 1904 kfree(rec); 1907 1905 return err; ··· 1930 1932 u64 map_length = 0; 1931 1933 u64 sector; 1932 1934 struct btrfs_bio *bbio = NULL; 1935 + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 1933 1936 int ret; 1934 1937 1935 1938 BUG_ON(!mirror_num); 1939 + 1940 + /* we can't repair anything in raid56 yet */ 1941 + if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) 1942 + return 0; 1936 1943 1937 1944 bio = bio_alloc(GFP_NOFS, 1); 1938 1945 if (!bio) ··· 2055 2052 failrec->failed_mirror); 2056 2053 did_repair = !ret; 2057 2054 } 2055 + ret = 0; 2058 2056 } 2059 2057 2060 2058 out:
+42 -8
fs/btrfs/free-space-cache.c
··· 1463 1463 } 1464 1464 1465 1465 static struct btrfs_free_space * 1466 - find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) 1466 + find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, 1467 + unsigned long align) 1467 1468 { 1468 1469 struct btrfs_free_space *entry; 1469 1470 struct rb_node *node; 1471 + u64 ctl_off; 1472 + u64 tmp; 1473 + u64 align_off; 1470 1474 int ret; 1471 1475 1472 1476 if (!ctl->free_space_offset.rb_node) ··· 1485 1481 if (entry->bytes < *bytes) 1486 1482 continue; 1487 1483 1484 + /* make sure the space returned is big enough 1485 + * to match our requested alignment 1486 + */ 1487 + if (*bytes >= align) { 1488 + ctl_off = entry->offset - ctl->start; 1489 + tmp = ctl_off + align - 1;; 1490 + do_div(tmp, align); 1491 + tmp = tmp * align + ctl->start; 1492 + align_off = tmp - entry->offset; 1493 + } else { 1494 + align_off = 0; 1495 + tmp = entry->offset; 1496 + } 1497 + 1498 + if (entry->bytes < *bytes + align_off) 1499 + continue; 1500 + 1488 1501 if (entry->bitmap) { 1489 - ret = search_bitmap(ctl, entry, offset, bytes); 1490 - if (!ret) 1502 + ret = search_bitmap(ctl, entry, &tmp, bytes); 1503 + if (!ret) { 1504 + *offset = tmp; 1491 1505 return entry; 1506 + } 1492 1507 continue; 1493 1508 } 1494 1509 1495 - *offset = entry->offset; 1496 - *bytes = entry->bytes; 1510 + *offset = tmp; 1511 + *bytes = entry->bytes - align_off; 1497 1512 return entry; 1498 1513 } 1499 1514 ··· 2114 2091 struct btrfs_free_space *entry = NULL; 2115 2092 u64 bytes_search = bytes + empty_size; 2116 2093 u64 ret = 0; 2094 + u64 align_gap = 0; 2095 + u64 align_gap_len = 0; 2117 2096 2118 2097 spin_lock(&ctl->tree_lock); 2119 - entry = find_free_space(ctl, &offset, &bytes_search); 2098 + entry = find_free_space(ctl, &offset, &bytes_search, 2099 + block_group->full_stripe_len); 2120 2100 if (!entry) 2121 2101 goto out; 2122 2102 ··· 2129 2103 if (!entry->bytes) 2130 2104 free_bitmap(ctl, entry); 2131 2105 } else { 
2106 + 2132 2107 unlink_free_space(ctl, entry); 2133 - entry->offset += bytes; 2134 - entry->bytes -= bytes; 2108 + align_gap_len = offset - entry->offset; 2109 + align_gap = entry->offset; 2110 + 2111 + entry->offset = offset + bytes; 2112 + WARN_ON(entry->bytes < bytes + align_gap_len); 2113 + 2114 + entry->bytes -= bytes + align_gap_len; 2135 2115 if (!entry->bytes) 2136 2116 kmem_cache_free(btrfs_free_space_cachep, entry); 2137 2117 else ··· 2147 2115 out: 2148 2116 spin_unlock(&ctl->tree_lock); 2149 2117 2118 + if (align_gap_len) 2119 + __btrfs_add_free_space(ctl, align_gap, align_gap_len); 2150 2120 return ret; 2151 2121 } 2152 2122
+13 -5
fs/btrfs/inode.c
··· 39 39 #include <linux/slab.h> 40 40 #include <linux/ratelimit.h> 41 41 #include <linux/mount.h> 42 + #include <linux/blkdev.h> 42 43 #include "compat.h" 43 44 #include "ctree.h" 44 45 #include "disk-io.h" ··· 6387 6386 int async_submit = 0; 6388 6387 6389 6388 map_length = orig_bio->bi_size; 6390 - ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, 6389 + ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 6391 6390 &map_length, NULL, 0); 6392 6391 if (ret) { 6393 6392 bio_put(orig_bio); 6394 6393 return -EIO; 6395 6394 } 6396 - 6397 6395 if (map_length >= orig_bio->bi_size) { 6398 6396 bio = orig_bio; 6399 6397 goto submit; 6400 6398 } 6401 6399 6402 - async_submit = 1; 6400 + /* async crcs make it difficult to collect full stripe writes. */ 6401 + if (btrfs_get_alloc_profile(root, 1) & 6402 + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) 6403 + async_submit = 0; 6404 + else 6405 + async_submit = 1; 6406 + 6403 6407 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6404 6408 if (!bio) 6405 6409 return -ENOMEM; ··· 6446 6440 bio->bi_end_io = btrfs_end_dio_bio; 6447 6441 6448 6442 map_length = orig_bio->bi_size; 6449 - ret = btrfs_map_block(root->fs_info, READ, 6443 + ret = btrfs_map_block(root->fs_info, rw, 6450 6444 start_sector << 9, 6451 6445 &map_length, NULL, 0); 6452 6446 if (ret) { ··· 6589 6583 { 6590 6584 struct file *file = iocb->ki_filp; 6591 6585 struct inode *inode = file->f_mapping->host; 6586 + ssize_t ret; 6592 6587 6593 6588 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 6594 6589 offset, nr_segs)) 6595 6590 return 0; 6596 6591 6597 - return __blockdev_direct_IO(rw, iocb, inode, 6592 + ret = __blockdev_direct_IO(rw, iocb, inode, 6598 6593 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 6599 6594 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 6600 6595 btrfs_submit_direct, 0); 6596 + return ret; 6601 6597 } 6602 6598 6603 6599 #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
+1647
fs/btrfs/raid56.c
··· 1 + /* 2 + * Copyright (C) 2012 Fusion-io All rights reserved. 3 + * Copyright (C) 2012 Intel Corp. All rights reserved. 4 + * 5 + * This program is free software; you can redistribute it and/or 6 + * modify it under the terms of the GNU General Public 7 + * License v2 as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope that it will be useful, 10 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 + * General Public License for more details. 13 + * 14 + * You should have received a copy of the GNU General Public 15 + * License along with this program; if not, write to the 16 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 17 + * Boston, MA 021110-1307, USA. 18 + */ 19 + #include <linux/sched.h> 20 + #include <linux/wait.h> 21 + #include <linux/bio.h> 22 + #include <linux/slab.h> 23 + #include <linux/buffer_head.h> 24 + #include <linux/blkdev.h> 25 + #include <linux/random.h> 26 + #include <linux/iocontext.h> 27 + #include <linux/capability.h> 28 + #include <linux/ratelimit.h> 29 + #include <linux/kthread.h> 30 + #include <linux/raid/pq.h> 31 + #include <linux/hash.h> 32 + #include <linux/list_sort.h> 33 + #include <linux/raid/xor.h> 34 + #include <asm/div64.h> 35 + #include "compat.h" 36 + #include "ctree.h" 37 + #include "extent_map.h" 38 + #include "disk-io.h" 39 + #include "transaction.h" 40 + #include "print-tree.h" 41 + #include "volumes.h" 42 + #include "raid56.h" 43 + #include "async-thread.h" 44 + #include "check-integrity.h" 45 + #include "rcu-string.h" 46 + 47 + /* set when additional merges to this rbio are not allowed */ 48 + #define RBIO_RMW_LOCKED_BIT 1 49 + 50 + struct btrfs_raid_bio { 51 + struct btrfs_fs_info *fs_info; 52 + struct btrfs_bio *bbio; 53 + 54 + /* 55 + * logical block numbers for the start of each stripe 56 + * The last one or two are p/q. 
These are sorted, 57 + * so raid_map[0] is the start of our full stripe 58 + */ 59 + u64 *raid_map; 60 + 61 + /* while we're doing rmw on a stripe 62 + * we put it into a hash table so we can 63 + * lock the stripe and merge more rbios 64 + * into it. 65 + */ 66 + struct list_head hash_list; 67 + 68 + /* 69 + * for scheduling work in the helper threads 70 + */ 71 + struct btrfs_work work; 72 + 73 + /* 74 + * bio list and bio_list_lock are used 75 + * to add more bios into the stripe 76 + * in hopes of avoiding the full rmw 77 + */ 78 + struct bio_list bio_list; 79 + spinlock_t bio_list_lock; 80 + 81 + /* 82 + * also protected by the bio_list_lock, the 83 + * stripe locking code uses plug_list to hand off 84 + * the stripe lock to the next pending IO 85 + */ 86 + struct list_head plug_list; 87 + 88 + /* 89 + * flags that tell us if it is safe to 90 + * merge with this bio 91 + */ 92 + unsigned long flags; 93 + 94 + /* size of each individual stripe on disk */ 95 + int stripe_len; 96 + 97 + /* number of data stripes (no p/q) */ 98 + int nr_data; 99 + 100 + /* 101 + * set if we're doing a parity rebuild 102 + * for a read from higher up, which is handled 103 + * differently from a parity rebuild as part of 104 + * rmw 105 + */ 106 + int read_rebuild; 107 + 108 + /* first bad stripe */ 109 + int faila; 110 + 111 + /* second bad stripe (for raid6 use) */ 112 + int failb; 113 + 114 + /* 115 + * number of pages needed to represent the full 116 + * stripe 117 + */ 118 + int nr_pages; 119 + 120 + /* 121 + * size of all the bios in the bio_list. This 122 + * helps us decide if the rbio maps to a full 123 + * stripe or not 124 + */ 125 + int bio_list_bytes; 126 + 127 + atomic_t refs; 128 + 129 + /* 130 + * these are two arrays of pointers. 
We allocate the 131 + * rbio big enough to hold them both and setup their 132 + * locations when the rbio is allocated 133 + */ 134 + 135 + /* pointers to pages that we allocated for 136 + * reading/writing stripes directly from the disk (including P/Q) 137 + */ 138 + struct page **stripe_pages; 139 + 140 + /* 141 + * pointers to the pages in the bio_list. Stored 142 + * here for faster lookup 143 + */ 144 + struct page **bio_pages; 145 + }; 146 + 147 + static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 148 + static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 149 + static void rmw_work(struct btrfs_work *work); 150 + static void read_rebuild_work(struct btrfs_work *work); 151 + static void async_rmw_stripe(struct btrfs_raid_bio *rbio); 152 + static void async_read_rebuild(struct btrfs_raid_bio *rbio); 153 + static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 154 + static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 155 + static void __free_raid_bio(struct btrfs_raid_bio *rbio); 156 + static void index_rbio_pages(struct btrfs_raid_bio *rbio); 157 + static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 158 + 159 + /* 160 + * the stripe hash table is used for locking, and to collect 161 + * bios in hopes of making a full stripe 162 + */ 163 + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 164 + { 165 + struct btrfs_stripe_hash_table *table; 166 + struct btrfs_stripe_hash_table *x; 167 + struct btrfs_stripe_hash *cur; 168 + struct btrfs_stripe_hash *h; 169 + int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 170 + int i; 171 + 172 + if (info->stripe_hash_table) 173 + return 0; 174 + 175 + table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS); 176 + if (!table) 177 + return -ENOMEM; 178 + 179 + table->table = (void *)(table + 1); 180 + h = table->table; 181 + 182 + for (i = 0; i < num_entries; i++) { 183 + cur = h + i; 184 + INIT_LIST_HEAD(&cur->hash_list); 185 + 
spin_lock_init(&cur->lock); 186 + init_waitqueue_head(&cur->wait); 187 + } 188 + 189 + x = cmpxchg(&info->stripe_hash_table, NULL, table); 190 + if (x) 191 + kfree(x); 192 + return 0; 193 + } 194 + 195 + /* 196 + * we hash on the first logical address of the stripe 197 + */ 198 + static int rbio_bucket(struct btrfs_raid_bio *rbio) 199 + { 200 + u64 num = rbio->raid_map[0]; 201 + 202 + /* 203 + * we shift down quite a bit. We're using byte 204 + * addressing, and most of the lower bits are zeros. 205 + * This tends to upset hash_64, and it consistently 206 + * returns just one or two different values. 207 + * 208 + * shifting off the lower bits fixes things. 209 + */ 210 + return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 211 + } 212 + 213 + /* 214 + * merging means we take the bio_list from the victim and 215 + * splice it into the destination. The victim should 216 + * be discarded afterwards. 217 + * 218 + * must be called with dest->rbio_list_lock held 219 + */ 220 + static void merge_rbio(struct btrfs_raid_bio *dest, 221 + struct btrfs_raid_bio *victim) 222 + { 223 + bio_list_merge(&dest->bio_list, &victim->bio_list); 224 + dest->bio_list_bytes += victim->bio_list_bytes; 225 + bio_list_init(&victim->bio_list); 226 + } 227 + 228 + /* 229 + * free the hash table used by unmount 230 + */ 231 + void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 232 + { 233 + if (!info->stripe_hash_table) 234 + return; 235 + kfree(info->stripe_hash_table); 236 + info->stripe_hash_table = NULL; 237 + } 238 + 239 + /* 240 + * helper function to run the xor_blocks api. It is only 241 + * able to do MAX_XOR_BLOCKS at a time, so we need to 242 + * loop through. 
243 + */ 244 + static void run_xor(void **pages, int src_cnt, ssize_t len) 245 + { 246 + int src_off = 0; 247 + int xor_src_cnt = 0; 248 + void *dest = pages[src_cnt]; 249 + 250 + while(src_cnt > 0) { 251 + xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 252 + xor_blocks(xor_src_cnt, len, dest, pages + src_off); 253 + 254 + src_cnt -= xor_src_cnt; 255 + src_off += xor_src_cnt; 256 + } 257 + } 258 + 259 + /* 260 + * returns true if the bio list inside this rbio 261 + * covers an entire stripe (no rmw required). 262 + * Must be called with the bio list lock held, or 263 + * at a time when you know it is impossible to add 264 + * new bios into the list 265 + */ 266 + static int __rbio_is_full(struct btrfs_raid_bio *rbio) 267 + { 268 + unsigned long size = rbio->bio_list_bytes; 269 + int ret = 1; 270 + 271 + if (size != rbio->nr_data * rbio->stripe_len) 272 + ret = 0; 273 + 274 + BUG_ON(size > rbio->nr_data * rbio->stripe_len); 275 + return ret; 276 + } 277 + 278 + static int rbio_is_full(struct btrfs_raid_bio *rbio) 279 + { 280 + unsigned long flags; 281 + int ret; 282 + 283 + spin_lock_irqsave(&rbio->bio_list_lock, flags); 284 + ret = __rbio_is_full(rbio); 285 + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 286 + return ret; 287 + } 288 + 289 + /* 290 + * returns 1 if it is safe to merge two rbios together. 
291 + * The merging is safe if the two rbios correspond to 292 + * the same stripe and if they are both going in the same 293 + * direction (read vs write), and if neither one is 294 + * locked for final IO 295 + * 296 + * The caller is responsible for locking such that 297 + * rmw_locked is safe to test 298 + */ 299 + static int rbio_can_merge(struct btrfs_raid_bio *last, 300 + struct btrfs_raid_bio *cur) 301 + { 302 + if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 303 + test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 304 + return 0; 305 + 306 + if (last->raid_map[0] != 307 + cur->raid_map[0]) 308 + return 0; 309 + 310 + /* reads can't merge with writes */ 311 + if (last->read_rebuild != 312 + cur->read_rebuild) { 313 + return 0; 314 + } 315 + 316 + return 1; 317 + } 318 + 319 + /* 320 + * helper to index into the pstripe 321 + */ 322 + static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 323 + { 324 + index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; 325 + return rbio->stripe_pages[index]; 326 + } 327 + 328 + /* 329 + * helper to index into the qstripe, returns null 330 + * if there is no qstripe 331 + */ 332 + static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 333 + { 334 + if (rbio->nr_data + 1 == rbio->bbio->num_stripes) 335 + return NULL; 336 + 337 + index += ((rbio->nr_data + 1) * rbio->stripe_len) >> 338 + PAGE_CACHE_SHIFT; 339 + return rbio->stripe_pages[index]; 340 + } 341 + 342 + /* 343 + * The first stripe in the table for a logical address 344 + * has the lock. rbios are added in one of three ways: 345 + * 346 + * 1) Nobody has the stripe locked yet. The rbio is given 347 + * the lock and 0 is returned. The caller must start the IO 348 + * themselves. 349 + * 350 + * 2) Someone has the stripe locked, but we're able to merge 351 + * with the lock owner. The rbio is freed and the IO will 352 + * start automatically along with the existing rbio. 1 is returned. 
353 + * 354 + * 3) Someone has the stripe locked, but we're not able to merge. 355 + * The rbio is added to the lock owner's plug list, or merged into 356 + * an rbio already on the plug list. When the lock owner unlocks, 357 + * the next rbio on the list is run and the IO is started automatically. 358 + * 1 is returned 359 + * 360 + * If we return 0, the caller still owns the rbio and must continue with 361 + * IO submission. If we return 1, the caller must assume the rbio has 362 + * already been freed. 363 + */ 364 + static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 365 + { 366 + int bucket = rbio_bucket(rbio); 367 + struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; 368 + struct btrfs_raid_bio *cur; 369 + struct btrfs_raid_bio *pending; 370 + unsigned long flags; 371 + DEFINE_WAIT(wait); 372 + struct btrfs_raid_bio *freeit = NULL; 373 + int ret = 0; 374 + int walk = 0; 375 + 376 + spin_lock_irqsave(&h->lock, flags); 377 + list_for_each_entry(cur, &h->hash_list, hash_list) { 378 + walk++; 379 + if (cur->raid_map[0] == rbio->raid_map[0]) { 380 + spin_lock(&cur->bio_list_lock); 381 + 382 + /* can we merge into the lock owner? */ 383 + if (rbio_can_merge(cur, rbio)) { 384 + merge_rbio(cur, rbio); 385 + spin_unlock(&cur->bio_list_lock); 386 + freeit = rbio; 387 + ret = 1; 388 + goto out; 389 + } 390 + 391 + /* 392 + * we couldn't merge with the running 393 + * rbio, see if we can merge with the 394 + * pending ones. 
We don't have to 395 + * check for rmw_locked because there 396 + * is no way they are inside finish_rmw 397 + * right now 398 + */ 399 + list_for_each_entry(pending, &cur->plug_list, 400 + plug_list) { 401 + if (rbio_can_merge(pending, rbio)) { 402 + merge_rbio(pending, rbio); 403 + spin_unlock(&cur->bio_list_lock); 404 + freeit = rbio; 405 + ret = 1; 406 + goto out; 407 + } 408 + } 409 + 410 + /* no merging, put us on the tail of the plug list, 411 + * our rbio will be started with the currently 412 + * running rbio unlocks 413 + */ 414 + list_add_tail(&rbio->plug_list, &cur->plug_list); 415 + spin_unlock(&cur->bio_list_lock); 416 + ret = 1; 417 + goto out; 418 + } 419 + } 420 + 421 + atomic_inc(&rbio->refs); 422 + list_add(&rbio->hash_list, &h->hash_list); 423 + out: 424 + spin_unlock_irqrestore(&h->lock, flags); 425 + if (freeit) 426 + __free_raid_bio(freeit); 427 + return ret; 428 + } 429 + 430 + /* 431 + * called as rmw or parity rebuild is completed. If the plug list has more 432 + * rbios waiting for this stripe, the next one on the list will be started 433 + */ 434 + static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 435 + { 436 + int bucket; 437 + struct btrfs_stripe_hash *h; 438 + unsigned long flags; 439 + 440 + bucket = rbio_bucket(rbio); 441 + h = rbio->fs_info->stripe_hash_table->table + bucket; 442 + 443 + spin_lock_irqsave(&h->lock, flags); 444 + spin_lock(&rbio->bio_list_lock); 445 + 446 + if (!list_empty(&rbio->hash_list)) { 447 + 448 + list_del_init(&rbio->hash_list); 449 + atomic_dec(&rbio->refs); 450 + 451 + /* 452 + * we use the plug list to hold all the rbios 453 + * waiting for the chance to lock this stripe. 454 + * hand the lock over to one of them. 
455 + */ 456 + if (!list_empty(&rbio->plug_list)) { 457 + struct btrfs_raid_bio *next; 458 + struct list_head *head = rbio->plug_list.next; 459 + 460 + next = list_entry(head, struct btrfs_raid_bio, 461 + plug_list); 462 + 463 + list_del_init(&rbio->plug_list); 464 + 465 + list_add(&next->hash_list, &h->hash_list); 466 + atomic_inc(&next->refs); 467 + spin_unlock(&rbio->bio_list_lock); 468 + spin_unlock_irqrestore(&h->lock, flags); 469 + 470 + if (next->read_rebuild) 471 + async_read_rebuild(next); 472 + else 473 + async_rmw_stripe(next); 474 + 475 + goto done_nolock; 476 + 477 + } else if (waitqueue_active(&h->wait)) { 478 + spin_unlock(&rbio->bio_list_lock); 479 + spin_unlock_irqrestore(&h->lock, flags); 480 + wake_up(&h->wait); 481 + goto done_nolock; 482 + } 483 + } 484 + spin_unlock(&rbio->bio_list_lock); 485 + spin_unlock_irqrestore(&h->lock, flags); 486 + 487 + done_nolock: 488 + return; 489 + } 490 + 491 + static void __free_raid_bio(struct btrfs_raid_bio *rbio) 492 + { 493 + int i; 494 + 495 + WARN_ON(atomic_read(&rbio->refs) < 0); 496 + if (!atomic_dec_and_test(&rbio->refs)) 497 + return; 498 + 499 + WARN_ON(!list_empty(&rbio->hash_list)); 500 + WARN_ON(!bio_list_empty(&rbio->bio_list)); 501 + 502 + for (i = 0; i < rbio->nr_pages; i++) { 503 + if (rbio->stripe_pages[i]) { 504 + __free_page(rbio->stripe_pages[i]); 505 + rbio->stripe_pages[i] = NULL; 506 + } 507 + } 508 + kfree(rbio->raid_map); 509 + kfree(rbio->bbio); 510 + kfree(rbio); 511 + } 512 + 513 + static void free_raid_bio(struct btrfs_raid_bio *rbio) 514 + { 515 + unlock_stripe(rbio); 516 + __free_raid_bio(rbio); 517 + } 518 + 519 + /* 520 + * this frees the rbio and runs through all the bios in the 521 + * bio_list and calls end_io on them 522 + */ 523 + static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) 524 + { 525 + struct bio *cur = bio_list_get(&rbio->bio_list); 526 + struct bio *next; 527 + free_raid_bio(rbio); 528 + 529 + while (cur) { 530 + next = 
cur->bi_next; 531 + cur->bi_next = NULL; 532 + if (uptodate) 533 + set_bit(BIO_UPTODATE, &cur->bi_flags); 534 + bio_endio(cur, err); 535 + cur = next; 536 + } 537 + } 538 + 539 + /* 540 + * end io function used by finish_rmw. When we finally 541 + * get here, we've written a full stripe 542 + */ 543 + static void raid_write_end_io(struct bio *bio, int err) 544 + { 545 + struct btrfs_raid_bio *rbio = bio->bi_private; 546 + 547 + if (err) 548 + fail_bio_stripe(rbio, bio); 549 + 550 + bio_put(bio); 551 + 552 + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 553 + return; 554 + 555 + err = 0; 556 + 557 + /* OK, we have read all the stripes we need to. */ 558 + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 559 + err = -EIO; 560 + 561 + rbio_orig_end_io(rbio, err, 0); 562 + return; 563 + } 564 + 565 + /* 566 + * the read/modify/write code wants to use the original bio for 567 + * any pages it included, and then use the rbio for everything 568 + * else. This function decides if a given index (stripe number) 569 + * and page number in that stripe fall inside the original bio 570 + * or the rbio. 571 + * 572 + * if you set bio_list_only, you'll get a NULL back for any ranges 573 + * that are outside the bio_list 574 + * 575 + * This doesn't take any refs on anything, you get a bare page pointer 576 + * and the caller must bump refs as required. 577 + * 578 + * You must call index_rbio_pages once before you can trust 579 + * the answers from this function. 
580 + */ 581 + static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, 582 + int index, int pagenr, int bio_list_only) 583 + { 584 + int chunk_page; 585 + struct page *p = NULL; 586 + 587 + chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; 588 + 589 + spin_lock_irq(&rbio->bio_list_lock); 590 + p = rbio->bio_pages[chunk_page]; 591 + spin_unlock_irq(&rbio->bio_list_lock); 592 + 593 + if (p || bio_list_only) 594 + return p; 595 + 596 + return rbio->stripe_pages[chunk_page]; 597 + } 598 + 599 + /* 600 + * number of pages we need for the entire stripe across all the 601 + * drives 602 + */ 603 + static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 604 + { 605 + unsigned long nr = stripe_len * nr_stripes; 606 + return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 607 + } 608 + 609 + /* 610 + * allocation and initial setup for the btrfs_raid_bio. Not 611 + * this does not allocate any pages for rbio->pages. 612 + */ 613 + static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, 614 + struct btrfs_bio *bbio, u64 *raid_map, 615 + u64 stripe_len) 616 + { 617 + struct btrfs_raid_bio *rbio; 618 + int nr_data = 0; 619 + int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); 620 + void *p; 621 + 622 + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, 623 + GFP_NOFS); 624 + if (!rbio) { 625 + kfree(raid_map); 626 + kfree(bbio); 627 + return ERR_PTR(-ENOMEM); 628 + } 629 + 630 + bio_list_init(&rbio->bio_list); 631 + INIT_LIST_HEAD(&rbio->plug_list); 632 + spin_lock_init(&rbio->bio_list_lock); 633 + INIT_LIST_HEAD(&rbio->hash_list); 634 + rbio->bbio = bbio; 635 + rbio->raid_map = raid_map; 636 + rbio->fs_info = root->fs_info; 637 + rbio->stripe_len = stripe_len; 638 + rbio->nr_pages = num_pages; 639 + rbio->faila = -1; 640 + rbio->failb = -1; 641 + atomic_set(&rbio->refs, 1); 642 + 643 + /* 644 + * the stripe_pages and bio_pages array point to the extra 645 + * memory we allocated past the end of the 
rbio 646 + */ 647 + p = rbio + 1; 648 + rbio->stripe_pages = p; 649 + rbio->bio_pages = p + sizeof(struct page *) * num_pages; 650 + 651 + if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 652 + nr_data = bbio->num_stripes - 2; 653 + else 654 + nr_data = bbio->num_stripes - 1; 655 + 656 + rbio->nr_data = nr_data; 657 + return rbio; 658 + } 659 + 660 + /* allocate pages for all the stripes in the bio, including parity */ 661 + static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 662 + { 663 + int i; 664 + struct page *page; 665 + 666 + for (i = 0; i < rbio->nr_pages; i++) { 667 + if (rbio->stripe_pages[i]) 668 + continue; 669 + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 670 + if (!page) 671 + return -ENOMEM; 672 + rbio->stripe_pages[i] = page; 673 + ClearPageUptodate(page); 674 + } 675 + return 0; 676 + } 677 + 678 + /* allocate pages for just the p/q stripes */ 679 + static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 680 + { 681 + int i; 682 + struct page *page; 683 + 684 + i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; 685 + 686 + for (; i < rbio->nr_pages; i++) { 687 + if (rbio->stripe_pages[i]) 688 + continue; 689 + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 690 + if (!page) 691 + return -ENOMEM; 692 + rbio->stripe_pages[i] = page; 693 + } 694 + return 0; 695 + } 696 + 697 + /* 698 + * add a single page from a specific stripe into our list of bios for IO 699 + * this will try to merge into existing bios if possible, and returns 700 + * zero if all went well. 
701 + */ 702 + int rbio_add_io_page(struct btrfs_raid_bio *rbio, 703 + struct bio_list *bio_list, 704 + struct page *page, 705 + int stripe_nr, 706 + unsigned long page_index, 707 + unsigned long bio_max_len) 708 + { 709 + struct bio *last = bio_list->tail; 710 + u64 last_end = 0; 711 + int ret; 712 + struct bio *bio; 713 + struct btrfs_bio_stripe *stripe; 714 + u64 disk_start; 715 + 716 + stripe = &rbio->bbio->stripes[stripe_nr]; 717 + disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); 718 + 719 + /* if the device is missing, just fail this stripe */ 720 + if (!stripe->dev->bdev) 721 + return fail_rbio_index(rbio, stripe_nr); 722 + 723 + /* see if we can add this page onto our existing bio */ 724 + if (last) { 725 + last_end = (u64)last->bi_sector << 9; 726 + last_end += last->bi_size; 727 + 728 + /* 729 + * we can't merge these if they are from different 730 + * devices or if they are not contiguous 731 + */ 732 + if (last_end == disk_start && stripe->dev->bdev && 733 + test_bit(BIO_UPTODATE, &last->bi_flags) && 734 + last->bi_bdev == stripe->dev->bdev) { 735 + ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); 736 + if (ret == PAGE_CACHE_SIZE) 737 + return 0; 738 + } 739 + } 740 + 741 + /* put a new bio on the list */ 742 + bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1); 743 + if (!bio) 744 + return -ENOMEM; 745 + 746 + bio->bi_size = 0; 747 + bio->bi_bdev = stripe->dev->bdev; 748 + bio->bi_sector = disk_start >> 9; 749 + set_bit(BIO_UPTODATE, &bio->bi_flags); 750 + 751 + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 752 + bio_list_add(bio_list, bio); 753 + return 0; 754 + } 755 + 756 + /* 757 + * while we're doing the read/modify/write cycle, we could 758 + * have errors in reading pages off the disk. This checks 759 + * for errors and if we're not able to read the page it'll 760 + * trigger parity reconstruction. 
The rmw will be finished 761 + * after we've reconstructed the failed stripes 762 + */ 763 + static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 764 + { 765 + if (rbio->faila >= 0 || rbio->failb >= 0) { 766 + BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); 767 + __raid56_parity_recover(rbio); 768 + } else { 769 + finish_rmw(rbio); 770 + } 771 + } 772 + 773 + /* 774 + * these are just the pages from the rbio array, not from anything 775 + * the FS sent down to us 776 + */ 777 + static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) 778 + { 779 + int index; 780 + index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); 781 + index += page; 782 + return rbio->stripe_pages[index]; 783 + } 784 + 785 + /* 786 + * helper function to walk our bio list and populate the bio_pages array with 787 + * the result. This seems expensive, but it is faster than constantly 788 + * searching through the bio list as we setup the IO in finish_rmw or stripe 789 + * reconstruction. 790 + * 791 + * This must be called before you trust the answers from page_in_rbio 792 + */ 793 + static void index_rbio_pages(struct btrfs_raid_bio *rbio) 794 + { 795 + struct bio *bio; 796 + u64 start; 797 + unsigned long stripe_offset; 798 + unsigned long page_index; 799 + struct page *p; 800 + int i; 801 + 802 + spin_lock_irq(&rbio->bio_list_lock); 803 + bio_list_for_each(bio, &rbio->bio_list) { 804 + start = (u64)bio->bi_sector << 9; 805 + stripe_offset = start - rbio->raid_map[0]; 806 + page_index = stripe_offset >> PAGE_CACHE_SHIFT; 807 + 808 + for (i = 0; i < bio->bi_vcnt; i++) { 809 + p = bio->bi_io_vec[i].bv_page; 810 + rbio->bio_pages[page_index + i] = p; 811 + } 812 + } 813 + spin_unlock_irq(&rbio->bio_list_lock); 814 + } 815 + 816 + /* 817 + * this is called from one of two situations. We either 818 + * have a full stripe from the higher layers, or we've read all 819 + * the missing bits off disk. 
820 + * 821 + * This will calculate the parity and then send down any 822 + * changed blocks. 823 + */ 824 + static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 825 + { 826 + struct btrfs_bio *bbio = rbio->bbio; 827 + void *pointers[bbio->num_stripes]; 828 + int stripe_len = rbio->stripe_len; 829 + int nr_data = rbio->nr_data; 830 + int stripe; 831 + int pagenr; 832 + int p_stripe = -1; 833 + int q_stripe = -1; 834 + struct bio_list bio_list; 835 + struct bio *bio; 836 + int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; 837 + int ret; 838 + 839 + bio_list_init(&bio_list); 840 + 841 + if (bbio->num_stripes - rbio->nr_data == 1) { 842 + p_stripe = bbio->num_stripes - 1; 843 + } else if (bbio->num_stripes - rbio->nr_data == 2) { 844 + p_stripe = bbio->num_stripes - 2; 845 + q_stripe = bbio->num_stripes - 1; 846 + } else { 847 + BUG(); 848 + } 849 + 850 + /* at this point we either have a full stripe, 851 + * or we've read the full stripe from the drive. 852 + * recalculate the parity and write the new results. 853 + * 854 + * We're not allowed to add any new bios to the 855 + * bio list here, anyone else that wants to 856 + * change this stripe needs to do their own rmw. 
857 + */ 858 + spin_lock_irq(&rbio->bio_list_lock); 859 + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 860 + spin_unlock_irq(&rbio->bio_list_lock); 861 + 862 + atomic_set(&rbio->bbio->error, 0); 863 + 864 + /* 865 + * now that we've set rmw_locked, run through the 866 + * bio list one last time and map the page pointers 867 + */ 868 + index_rbio_pages(rbio); 869 + 870 + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 871 + struct page *p; 872 + /* first collect one page from each data stripe */ 873 + for (stripe = 0; stripe < nr_data; stripe++) { 874 + p = page_in_rbio(rbio, stripe, pagenr, 0); 875 + pointers[stripe] = kmap(p); 876 + } 877 + 878 + /* then add the parity stripe */ 879 + p = rbio_pstripe_page(rbio, pagenr); 880 + SetPageUptodate(p); 881 + pointers[stripe++] = kmap(p); 882 + 883 + if (q_stripe != -1) { 884 + 885 + /* 886 + * raid6, add the qstripe and call the 887 + * library function to fill in our p/q 888 + */ 889 + p = rbio_qstripe_page(rbio, pagenr); 890 + SetPageUptodate(p); 891 + pointers[stripe++] = kmap(p); 892 + 893 + raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, 894 + pointers); 895 + } else { 896 + /* raid5 */ 897 + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 898 + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); 899 + } 900 + 901 + 902 + for (stripe = 0; stripe < bbio->num_stripes; stripe++) 903 + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 904 + } 905 + 906 + /* 907 + * time to start writing. Make bios for everything from the 908 + * higher layers (the bio_list in our rbio) and our p/q. Ignore 909 + * everything else. 
910 + */ 911 + for (stripe = 0; stripe < bbio->num_stripes; stripe++) { 912 + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { 913 + struct page *page; 914 + if (stripe < rbio->nr_data) { 915 + page = page_in_rbio(rbio, stripe, pagenr, 1); 916 + if (!page) 917 + continue; 918 + } else { 919 + page = rbio_stripe_page(rbio, stripe, pagenr); 920 + } 921 + 922 + ret = rbio_add_io_page(rbio, &bio_list, 923 + page, stripe, pagenr, rbio->stripe_len); 924 + if (ret) 925 + goto cleanup; 926 + } 927 + } 928 + 929 + atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); 930 + BUG_ON(atomic_read(&bbio->stripes_pending) == 0); 931 + 932 + while (1) { 933 + bio = bio_list_pop(&bio_list); 934 + if (!bio) 935 + break; 936 + 937 + bio->bi_private = rbio; 938 + bio->bi_end_io = raid_write_end_io; 939 + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 940 + submit_bio(WRITE, bio); 941 + } 942 + return; 943 + 944 + cleanup: 945 + rbio_orig_end_io(rbio, -EIO, 0); 946 + } 947 + 948 + /* 949 + * helper to find the stripe number for a given bio. Used to figure out which 950 + * stripe has failed. This expects the bio to correspond to a physical disk, 951 + * so it looks up based on physical sector numbers. 952 + */ 953 + static int find_bio_stripe(struct btrfs_raid_bio *rbio, 954 + struct bio *bio) 955 + { 956 + u64 physical = bio->bi_sector; 957 + u64 stripe_start; 958 + int i; 959 + struct btrfs_bio_stripe *stripe; 960 + 961 + physical <<= 9; 962 + 963 + for (i = 0; i < rbio->bbio->num_stripes; i++) { 964 + stripe = &rbio->bbio->stripes[i]; 965 + stripe_start = stripe->physical; 966 + if (physical >= stripe_start && 967 + physical < stripe_start + rbio->stripe_len) { 968 + return i; 969 + } 970 + } 971 + return -1; 972 + } 973 + 974 + /* 975 + * helper to find the stripe number for a given 976 + * bio (before mapping). Used to figure out which stripe has 977 + * failed. This looks up based on logical block numbers. 
978 + */ 979 + static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 980 + struct bio *bio) 981 + { 982 + u64 logical = bio->bi_sector; 983 + u64 stripe_start; 984 + int i; 985 + 986 + logical <<= 9; 987 + 988 + for (i = 0; i < rbio->nr_data; i++) { 989 + stripe_start = rbio->raid_map[i]; 990 + if (logical >= stripe_start && 991 + logical < stripe_start + rbio->stripe_len) { 992 + return i; 993 + } 994 + } 995 + return -1; 996 + } 997 + 998 + /* 999 + * returns -EIO if we had too many failures 1000 + */ 1001 + static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1002 + { 1003 + unsigned long flags; 1004 + int ret = 0; 1005 + 1006 + spin_lock_irqsave(&rbio->bio_list_lock, flags); 1007 + 1008 + /* we already know this stripe is bad, move on */ 1009 + if (rbio->faila == failed || rbio->failb == failed) 1010 + goto out; 1011 + 1012 + if (rbio->faila == -1) { 1013 + /* first failure on this rbio */ 1014 + rbio->faila = failed; 1015 + atomic_inc(&rbio->bbio->error); 1016 + } else if (rbio->failb == -1) { 1017 + /* second failure on this rbio */ 1018 + rbio->failb = failed; 1019 + atomic_inc(&rbio->bbio->error); 1020 + } else { 1021 + ret = -EIO; 1022 + } 1023 + out: 1024 + spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1025 + 1026 + return ret; 1027 + } 1028 + 1029 + /* 1030 + * helper to fail a stripe based on a physical disk 1031 + * bio. 1032 + */ 1033 + static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1034 + struct bio *bio) 1035 + { 1036 + int failed = find_bio_stripe(rbio, bio); 1037 + 1038 + if (failed < 0) 1039 + return -EIO; 1040 + 1041 + return fail_rbio_index(rbio, failed); 1042 + } 1043 + 1044 + /* 1045 + * this sets each page in the bio uptodate. 
It should only be used on private 1046 + * rbio pages, nothing that comes in from the higher layers 1047 + */ 1048 + static void set_bio_pages_uptodate(struct bio *bio) 1049 + { 1050 + int i; 1051 + struct page *p; 1052 + 1053 + for (i = 0; i < bio->bi_vcnt; i++) { 1054 + p = bio->bi_io_vec[i].bv_page; 1055 + SetPageUptodate(p); 1056 + } 1057 + } 1058 + 1059 + /* 1060 + * end io for the read phase of the rmw cycle. All the bios here are physical 1061 + * stripe bios we've read from the disk so we can recalculate the parity of the 1062 + * stripe. 1063 + * 1064 + * This will usually kick off finish_rmw once all the bios are read in, but it 1065 + * may trigger parity reconstruction if we had any errors along the way 1066 + */ 1067 + static void raid_rmw_end_io(struct bio *bio, int err) 1068 + { 1069 + struct btrfs_raid_bio *rbio = bio->bi_private; 1070 + 1071 + if (err) 1072 + fail_bio_stripe(rbio, bio); 1073 + else 1074 + set_bio_pages_uptodate(bio); 1075 + 1076 + bio_put(bio); 1077 + 1078 + if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1079 + return; 1080 + 1081 + err = 0; 1082 + if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1083 + goto cleanup; 1084 + 1085 + /* 1086 + * this will normally call finish_rmw to start our write 1087 + * but if there are any failed stripes we'll reconstruct 1088 + * from parity first 1089 + */ 1090 + validate_rbio_for_rmw(rbio); 1091 + return; 1092 + 1093 + cleanup: 1094 + 1095 + rbio_orig_end_io(rbio, -EIO, 0); 1096 + } 1097 + 1098 + static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1099 + { 1100 + rbio->work.flags = 0; 1101 + rbio->work.func = rmw_work; 1102 + 1103 + btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1104 + &rbio->work); 1105 + } 1106 + 1107 + static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1108 + { 1109 + rbio->work.flags = 0; 1110 + rbio->work.func = read_rebuild_work; 1111 + 1112 + btrfs_queue_worker(&rbio->fs_info->rmw_workers, 1113 + &rbio->work); 1114 + } 1115 + 1116 + 
/* 1117 + * the stripe must be locked by the caller. It will 1118 + * unlock after all the writes are done 1119 + */ 1120 + static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1121 + { 1122 + int bios_to_read = 0; 1123 + struct btrfs_bio *bbio = rbio->bbio; 1124 + struct bio_list bio_list; 1125 + int ret; 1126 + int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1127 + int pagenr; 1128 + int stripe; 1129 + struct bio *bio; 1130 + 1131 + bio_list_init(&bio_list); 1132 + 1133 + ret = alloc_rbio_pages(rbio); 1134 + if (ret) 1135 + goto cleanup; 1136 + 1137 + index_rbio_pages(rbio); 1138 + 1139 + atomic_set(&rbio->bbio->error, 0); 1140 + /* 1141 + * build a list of bios to read all the missing parts of this 1142 + * stripe 1143 + */ 1144 + for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1145 + for (pagenr = 0; pagenr < nr_pages; pagenr++) { 1146 + struct page *page; 1147 + /* 1148 + * we want to find all the pages missing from 1149 + * the rbio and read them from the disk. If 1150 + * page_in_rbio finds a page in the bio list 1151 + * we don't need to read it off the stripe. 1152 + */ 1153 + page = page_in_rbio(rbio, stripe, pagenr, 1); 1154 + if (page) 1155 + continue; 1156 + 1157 + page = rbio_stripe_page(rbio, stripe, pagenr); 1158 + ret = rbio_add_io_page(rbio, &bio_list, page, 1159 + stripe, pagenr, rbio->stripe_len); 1160 + if (ret) 1161 + goto cleanup; 1162 + } 1163 + } 1164 + 1165 + bios_to_read = bio_list_size(&bio_list); 1166 + if (!bios_to_read) { 1167 + /* 1168 + * this can happen if others have merged with 1169 + * us, it means there is nothing left to read. 1170 + * But if there are missing devices it may not be 1171 + * safe to do the full stripe write yet. 1172 + */ 1173 + goto finish; 1174 + } 1175 + 1176 + /* 1177 + * the bbio may be freed once we submit the last bio. 
Make sure 1178 + * not to touch it after that 1179 + */ 1180 + atomic_set(&bbio->stripes_pending, bios_to_read); 1181 + while (1) { 1182 + bio = bio_list_pop(&bio_list); 1183 + if (!bio) 1184 + break; 1185 + 1186 + bio->bi_private = rbio; 1187 + bio->bi_end_io = raid_rmw_end_io; 1188 + 1189 + btrfs_bio_wq_end_io(rbio->fs_info, bio, 1190 + BTRFS_WQ_ENDIO_RAID56); 1191 + 1192 + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); 1193 + submit_bio(READ, bio); 1194 + } 1195 + /* the actual write will happen once the reads are done */ 1196 + return 0; 1197 + 1198 + cleanup: 1199 + rbio_orig_end_io(rbio, -EIO, 0); 1200 + return -EIO; 1201 + 1202 + finish: 1203 + validate_rbio_for_rmw(rbio); 1204 + return 0; 1205 + } 1206 + 1207 + /* 1208 + * if the upper layers pass in a full stripe, we thank them by only allocating 1209 + * enough pages to hold the parity, and sending it all down quickly. 1210 + */ 1211 + static int full_stripe_write(struct btrfs_raid_bio *rbio) 1212 + { 1213 + int ret; 1214 + 1215 + ret = alloc_rbio_parity_pages(rbio); 1216 + if (ret) 1217 + return ret; 1218 + 1219 + ret = lock_stripe_add(rbio); 1220 + if (ret == 0) 1221 + finish_rmw(rbio); 1222 + return 0; 1223 + } 1224 + 1225 + /* 1226 + * partial stripe writes get handed over to async helpers. 1227 + * We're really hoping to merge a few more writes into this 1228 + * rbio before calculating new parity 1229 + */ 1230 + static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1231 + { 1232 + int ret; 1233 + 1234 + ret = lock_stripe_add(rbio); 1235 + if (ret == 0) 1236 + async_rmw_stripe(rbio); 1237 + return 0; 1238 + } 1239 + 1240 + /* 1241 + * sometimes while we were reading from the drive to 1242 + * recalculate parity, enough new bios come into create 1243 + * a full stripe. 
So we do a check here to see if we can 1244 + * go directly to finish_rmw 1245 + */ 1246 + static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1247 + { 1248 + /* head off into rmw land if we don't have a full stripe */ 1249 + if (!rbio_is_full(rbio)) 1250 + return partial_stripe_write(rbio); 1251 + return full_stripe_write(rbio); 1252 + } 1253 + 1254 + /* 1255 + * our main entry point for writes from the rest of the FS. 1256 + */ 1257 + int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 1258 + struct btrfs_bio *bbio, u64 *raid_map, 1259 + u64 stripe_len) 1260 + { 1261 + struct btrfs_raid_bio *rbio; 1262 + 1263 + rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1264 + if (IS_ERR(rbio)) { 1265 + kfree(raid_map); 1266 + kfree(bbio); 1267 + return PTR_ERR(rbio); 1268 + } 1269 + bio_list_add(&rbio->bio_list, bio); 1270 + rbio->bio_list_bytes = bio->bi_size; 1271 + return __raid56_parity_write(rbio); 1272 + } 1273 + 1274 + /* 1275 + * all parity reconstruction happens here. We've read in everything 1276 + * we can find from the drives and this does the heavy lifting of 1277 + * sorting the good from the bad. 
 */
/*
 * Rebuild the missing stripe(s) for this rbio in memory.
 *
 * rbio->faila / rbio->failb identify the failed stripe indexes (-1 when
 * unused).  Every surviving stripe page has already been read; this routine
 * kmaps one page from each stripe per page offset and runs either the RAID5
 * xor path or the lib/raid6 recovery routines over them.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	int pagenr, stripe;
	void **pointers;
	int faila = -1, failb = -1;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	struct page *page;
	int err;
	int i;

	/* one kmapped page pointer per stripe, reused for each page offset */
	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
			   GFP_NOFS);
	if (!pointers) {
		err = -ENOMEM;
		goto cleanup_io;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->read_rebuild) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
		/*
		 * setup our array of pointers with pages
		 * from each stripe
		 */
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			pointers[stripe] = kmap(page);
		}

		/* all raid6 handling here */
		if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
		    RAID6_Q_STRIPE) {

			/*
			 * single failure, rebuild from parity raid5
			 * style
			 */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = -EIO;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb) {
				int tmp = failb;
				failb = faila;
				faila = tmp;
			}

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
					err = -EIO;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			/* two failures: either data+P (datap) or data+data */
			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
				raid6_2data_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila],
			       pointers[rbio->nr_data],
			       PAGE_CACHE_SIZE);

			/*
			 * rearrange the pointer array so the failed slot sits
			 * last; run_xor() then xors the first nr_data-1 pages
			 * into it, recreating the missing data
			 */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
		}
		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 *
		 * NOTE(review): this inner loop walks every page of the failed
		 * stripes on each pass of the outer pagenr loop, so the same
		 * pages are marked uptodate nr_pages times; harmless but it
		 * looks like it could be hoisted out of the loop -- confirm.
		 */
		if (!rbio->read_rebuild) {
			for (i = 0; i < nr_pages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
					SetPageUptodate(page);
				}
				if (failb != -1) {
					page = rbio_stripe_page(rbio, failb, i);
					SetPageUptodate(page);
				}
			}
		}
		/* undo the kmaps from the top of this pagenr iteration */
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			kunmap(page);
		}
	}

	err = 0;
cleanup:
	kfree(pointers);

cleanup_io:

	/*
	 * hand the result back: reads complete the original bio here; a
	 * successful rebuild inside an rmw cycle clears the failure marks
	 * and continues into finish_rmw()
	 */
	if (rbio->read_rebuild) {
		rbio_orig_end_io(rbio, err, err == 0);
	} else if (err == 0) {
		rbio->faila = -1;
		rbio->failb = -1;
		finish_rmw(rbio);
	} else {
		rbio_orig_end_io(rbio, err, 0);
	}
}

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);
	bio_put(bio);

	/* only the last completing bio triggers the reconstruction */
	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		rbio_orig_end_io(rbio, -EIO, 0);
	else
		__raid_recover_end_io(rbio);
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 *
 * Returns 0 on success; on failure the original bio is ended with
 * -EIO (read-rebuild case) and -EIO is returned.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct btrfs_bio *bbio = rbio->bbio;
	struct bio_list bio_list;
	int ret;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->bbio->error, 0);

	/*
	 * read everything that hasn't failed.
	 */
	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
		if (rbio->faila == stripe ||
		    rbio->failb == stripe)
			continue;

		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *p;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			p = rbio_stripe_page(rbio, stripe, pagenr);
			if (PageUptodate(p))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list,
				       rbio_stripe_page(rbio, stripe, pagenr),
				       stripe, pagenr, rbio->stripe_len);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
			__raid_recover_end_io(rbio);
			goto out;
		} else {
			goto cleanup;
		}
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&bbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_recover_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
out:
	return 0;

cleanup:
	if (rbio->read_rebuild)
		rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;
}

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
			  struct btrfs_bio *bbio, u64 *raid_map,
			  u64 stripe_len, int mirror_num)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio)) {
		return PTR_ERR(rbio);
	}

	rbio->read_rebuild = 1;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		/*
		 * NOTE(review): BUG() does not return, so the kfree()
		 * and return below are effectively unreachable
		 */
		BUG();
		kfree(rbio);
		return -EIO;
	}

	/*
	 * reconstruct from the q stripe if they are
	 * asking for mirror 3
	 */
	if (mirror_num == 3)
		rbio->failb = bbio->num_stripes - 2;

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with
	 * any errors it hits.  We don't want to return
	 * its error value up the stack because our caller
	 * will end up calling bio_endio with any nonzero
	 * return
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);
	/*
	 * our rbio has been added to the list of
	 * rbios that will be handled after the
	 * currently lock owner is done
	 */
	return 0;

}

/* async worker: run the read/modify/write cycle for a deferred rbio */
static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

/* async worker: run a deferred read-rebuild for a queued rbio */
static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}
+51
fs/btrfs/raid56.h
··· 1 + /* 2 + * Copyright (C) 2012 Fusion-io All rights reserved. 3 + * Copyright (C) 2012 Intel Corp. All rights reserved. 4 + * 5 + * This program is free software; you can redistribute it and/or 6 + * modify it under the terms of the GNU General Public 7 + * License v2 as published by the Free Software Foundation. 8 + * 9 + * This program is distributed in the hope that it will be useful, 10 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 + * General Public License for more details. 13 + * 14 + * You should have received a copy of the GNU General Public 15 + * License along with this program; if not, write to the 16 + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 17 + * Boston, MA 021110-1307, USA. 18 + */ 19 + 20 + #ifndef __BTRFS_RAID56__ 21 + #define __BTRFS_RAID56__ 22 + static inline int nr_parity_stripes(struct map_lookup *map) 23 + { 24 + if (map->type & BTRFS_BLOCK_GROUP_RAID5) 25 + return 1; 26 + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 27 + return 2; 28 + else 29 + return 0; 30 + } 31 + 32 + static inline int nr_data_stripes(struct map_lookup *map) 33 + { 34 + return map->num_stripes - nr_parity_stripes(map); 35 + } 36 + #define RAID5_P_STRIPE ((u64)-2) 37 + #define RAID6_Q_STRIPE ((u64)-1) 38 + 39 + #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ 40 + ((x) == RAID6_Q_STRIPE)) 41 + 42 + int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 43 + struct btrfs_bio *bbio, u64 *raid_map, 44 + u64 stripe_len, int mirror_num); 45 + int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 46 + struct btrfs_bio *bbio, u64 *raid_map, 47 + u64 stripe_len); 48 + 49 + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); 50 + void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); 51 + #endif
+8
fs/btrfs/scrub.c
··· 28 28 #include "dev-replace.h" 29 29 #include "check-integrity.h" 30 30 #include "rcu-string.h" 31 + #include "raid56.h" 31 32 32 33 /* 33 34 * This is only the first step towards a full-features scrub. It reads all ··· 2246 2245 u64 extent_len; 2247 2246 struct btrfs_device *extent_dev; 2248 2247 int extent_mirror_num; 2248 + 2249 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 2250 + BTRFS_BLOCK_GROUP_RAID6)) { 2251 + if (num >= nr_data_stripes(map)) { 2252 + return 0; 2253 + } 2254 + } 2249 2255 2250 2256 nstripes = length; 2251 2257 offset = 0;
+3
fs/btrfs/transaction.c
··· 686 686 struct extent_state *cached_state = NULL; 687 687 u64 start = 0; 688 688 u64 end; 689 + struct blk_plug plug; 689 690 691 + blk_start_plug(&plug); 690 692 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 691 693 mark, &cached_state)) { 692 694 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, ··· 702 700 } 703 701 if (err) 704 702 werr = err; 703 + blk_finish_plug(&plug); 705 704 return werr; 706 705 } 707 706
+344 -41
fs/btrfs/volumes.c
··· 25 25 #include <linux/capability.h> 26 26 #include <linux/ratelimit.h> 27 27 #include <linux/kthread.h> 28 + #include <linux/raid/pq.h> 29 + #include <asm/div64.h> 28 30 #include "compat.h" 29 31 #include "ctree.h" 30 32 #include "extent_map.h" ··· 34 32 #include "transaction.h" 35 33 #include "print-tree.h" 36 34 #include "volumes.h" 35 + #include "raid56.h" 37 36 #include "async-thread.h" 38 37 #include "check-integrity.h" 39 38 #include "rcu-string.h" ··· 1392 1389 } 1393 1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1394 1391 1392 + if ((all_avail & (BTRFS_BLOCK_GROUP_RAID5 | 1393 + BTRFS_BLOCK_GROUP_RAID6) && num_devices <= 3)) { 1394 + printk(KERN_ERR "btrfs: unable to go below three devices " 1395 + "on raid5 or raid6\n"); 1396 + ret = -EINVAL; 1397 + goto out; 1398 + } 1399 + 1395 1400 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1396 1401 printk(KERN_ERR "btrfs: unable to go below four devices " 1397 1402 "on raid10\n"); ··· 1410 1399 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { 1411 1400 printk(KERN_ERR "btrfs: unable to go below two " 1412 1401 "devices on raid1\n"); 1402 + ret = -EINVAL; 1403 + goto out; 1404 + } 1405 + 1406 + if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1407 + root->fs_info->fs_devices->rw_devices <= 2) { 1408 + printk(KERN_ERR "btrfs: unable to go below two " 1409 + "devices on raid5\n"); 1410 + ret = -EINVAL; 1411 + goto out; 1412 + } 1413 + if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && 1414 + root->fs_info->fs_devices->rw_devices <= 3) { 1415 + printk(KERN_ERR "btrfs: unable to go below three " 1416 + "devices on raid6\n"); 1413 1417 ret = -EINVAL; 1414 1418 goto out; 1415 1419 } ··· 2683 2657 return 0; 2684 2658 2685 2659 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2686 - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2687 - factor = 2; 2688 - else 2689 - factor = 1; 2690 - factor = num_stripes / factor; 2660 + BTRFS_BLOCK_GROUP_RAID1 | 
BTRFS_BLOCK_GROUP_RAID10)) { 2661 + factor = num_stripes / 2; 2662 + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { 2663 + factor = num_stripes - 1; 2664 + } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { 2665 + factor = num_stripes - 2; 2666 + } else { 2667 + factor = num_stripes; 2668 + } 2691 2669 2692 2670 for (i = 0; i < num_stripes; i++) { 2693 2671 stripe = btrfs_stripe_nr(chunk, i); ··· 3006 2976 int mixed = 0; 3007 2977 int ret; 3008 2978 u64 num_devices; 2979 + int cancel = 0; 3009 2980 3010 2981 if (btrfs_fs_closing(fs_info) || 3011 2982 atomic_read(&fs_info->balance_pause_req) || ··· 3049 3018 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3050 3019 else 3051 3020 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3052 - BTRFS_BLOCK_GROUP_RAID10); 3021 + BTRFS_BLOCK_GROUP_RAID10 | 3022 + BTRFS_BLOCK_GROUP_RAID5 | 3023 + BTRFS_BLOCK_GROUP_RAID6); 3053 3024 3054 3025 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3055 3026 (!alloc_profile_is_valid(bctl->data.target, 1) || ··· 3091 3058 3092 3059 /* allow to reduce meta or sys integrity only if force set */ 3093 3060 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3094 - BTRFS_BLOCK_GROUP_RAID10; 3061 + BTRFS_BLOCK_GROUP_RAID10 | 3062 + BTRFS_BLOCK_GROUP_RAID5 | 3063 + BTRFS_BLOCK_GROUP_RAID6; 3064 + 3095 3065 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3096 3066 (fs_info->avail_system_alloc_bits & allowed) && 3097 3067 !(bctl->sys.target & allowed)) || ··· 3160 3124 } 3161 3125 3162 3126 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3163 - balance_need_close(fs_info)) { 3164 - __cancel_balance(fs_info); 3165 - } 3127 + balance_need_close(fs_info)) 3128 + cancel = 1; 3166 3129 3167 3130 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3168 3131 fs_info->num_tolerated_disk_barrier_failures = 3169 3132 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3170 3133 } 3134 + 3135 + if (cancel) 
3136 + __cancel_balance(fs_info); 3171 3137 3172 3138 wake_up(&fs_info->balance_wait_q); 3173 3139 ··· 3531 3493 } 3532 3494 3533 3495 struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3496 + /* 3497 + * sub_stripes info for map, 3498 + * dev_stripes -- stripes per dev, 2 for DUP, 1 other wise 3499 + * devs_max -- max devices per stripe, 0 for unlimited 3500 + * devs_min -- min devices per stripe 3501 + * devs_increment -- ndevs must be a multiple of this 3502 + * ncopies -- how many copies of the data we have 3503 + */ 3534 3504 { 2, 1, 0, 4, 2, 2 /* raid10 */ }, 3535 3505 { 1, 1, 2, 2, 2, 2 /* raid1 */ }, 3536 3506 { 1, 2, 1, 1, 1, 2 /* dup */ }, 3537 3507 { 1, 1, 0, 2, 1, 1 /* raid0 */ }, 3538 3508 { 1, 1, 0, 1, 1, 1 /* single */ }, 3509 + { 1, 1, 0, 2, 1, 2 /* raid5 */ }, 3510 + { 1, 1, 0, 3, 1, 3 /* raid6 */ }, 3539 3511 }; 3512 + 3513 + static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) 3514 + { 3515 + /* TODO allow them to set a preferred stripe size */ 3516 + return 64 * 1024; 3517 + } 3518 + 3519 + static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 3520 + { 3521 + u64 features; 3522 + 3523 + if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) 3524 + return; 3525 + 3526 + features = btrfs_super_incompat_flags(info->super_copy); 3527 + if (features & BTRFS_FEATURE_INCOMPAT_RAID56) 3528 + return; 3529 + 3530 + features |= BTRFS_FEATURE_INCOMPAT_RAID56; 3531 + btrfs_set_super_incompat_flags(info->super_copy, features); 3532 + printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); 3533 + } 3540 3534 3541 3535 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3542 3536 struct btrfs_root *extent_root, ··· 3585 3515 struct btrfs_device_info *devices_info = NULL; 3586 3516 u64 total_avail; 3587 3517 int num_stripes; /* total number of stripes to allocate */ 3518 + int data_stripes; /* number of stripes that count for 3519 + block group size */ 3588 3520 int 
sub_stripes; /* sub_stripes info for map */ 3589 3521 int dev_stripes; /* stripes per dev */ 3590 3522 int devs_max; /* max devs to use */ ··· 3598 3526 u64 max_chunk_size; 3599 3527 u64 stripe_size; 3600 3528 u64 num_bytes; 3529 + u64 raid_stripe_len = BTRFS_STRIPE_LEN; 3601 3530 int ndevs; 3602 3531 int i; 3603 3532 int j; ··· 3724 3651 stripe_size = devices_info[ndevs-1].max_avail; 3725 3652 num_stripes = ndevs * dev_stripes; 3726 3653 3654 + /* 3655 + * this will have to be fixed for RAID1 and RAID10 over 3656 + * more drives 3657 + */ 3658 + data_stripes = num_stripes / ncopies; 3659 + 3727 3660 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3728 3661 stripe_size = max_chunk_size * ncopies; 3729 3662 do_div(stripe_size, ndevs); 3730 3663 } 3731 - 3664 + if (type & BTRFS_BLOCK_GROUP_RAID5) { 3665 + raid_stripe_len = find_raid56_stripe_len(ndevs - 1, 3666 + btrfs_super_stripesize(info->super_copy)); 3667 + data_stripes = num_stripes - 1; 3668 + } 3669 + if (type & BTRFS_BLOCK_GROUP_RAID6) { 3670 + raid_stripe_len = find_raid56_stripe_len(ndevs - 2, 3671 + btrfs_super_stripesize(info->super_copy)); 3672 + data_stripes = num_stripes - 2; 3673 + } 3732 3674 do_div(stripe_size, dev_stripes); 3733 3675 3734 3676 /* align to BTRFS_STRIPE_LEN */ 3735 - do_div(stripe_size, BTRFS_STRIPE_LEN); 3736 - stripe_size *= BTRFS_STRIPE_LEN; 3677 + do_div(stripe_size, raid_stripe_len); 3678 + stripe_size *= raid_stripe_len; 3737 3679 3738 3680 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3739 3681 if (!map) { ··· 3766 3678 } 3767 3679 } 3768 3680 map->sector_size = extent_root->sectorsize; 3769 - map->stripe_len = BTRFS_STRIPE_LEN; 3770 - map->io_align = BTRFS_STRIPE_LEN; 3771 - map->io_width = BTRFS_STRIPE_LEN; 3681 + map->stripe_len = raid_stripe_len; 3682 + map->io_align = raid_stripe_len; 3683 + map->io_width = raid_stripe_len; 3772 3684 map->type = type; 3773 3685 map->sub_stripes = sub_stripes; 3774 3686 3775 3687 *map_ret = map; 3776 - num_bytes = 
stripe_size * (num_stripes / ncopies); 3688 + num_bytes = stripe_size * data_stripes; 3777 3689 3778 3690 *stripe_size_out = stripe_size; 3779 3691 *num_bytes_out = num_bytes; ··· 3821 3733 goto error; 3822 3734 } 3823 3735 } 3736 + 3737 + check_raid56_incompat_flag(extent_root->fs_info, type); 3824 3738 3825 3739 kfree(devices_info); 3826 3740 return 0; ··· 4093 4003 ret = map->num_stripes; 4094 4004 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4095 4005 ret = map->sub_stripes; 4006 + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 4007 + ret = 2; 4008 + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4009 + ret = 3; 4096 4010 else 4097 4011 ret = 1; 4098 4012 free_extent_map(em); ··· 4106 4012 ret++; 4107 4013 btrfs_dev_replace_unlock(&fs_info->dev_replace); 4108 4014 4015 + return ret; 4016 + } 4017 + 4018 + unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 4019 + struct btrfs_mapping_tree *map_tree, 4020 + u64 logical) 4021 + { 4022 + struct extent_map *em; 4023 + struct map_lookup *map; 4024 + struct extent_map_tree *em_tree = &map_tree->map_tree; 4025 + unsigned long len = root->sectorsize; 4026 + 4027 + read_lock(&em_tree->lock); 4028 + em = lookup_extent_mapping(em_tree, logical, len); 4029 + read_unlock(&em_tree->lock); 4030 + BUG_ON(!em); 4031 + 4032 + BUG_ON(em->start > logical || em->start + em->len < logical); 4033 + map = (struct map_lookup *)em->bdev; 4034 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4035 + BTRFS_BLOCK_GROUP_RAID6)) { 4036 + len = map->stripe_len * nr_data_stripes(map); 4037 + } 4038 + free_extent_map(em); 4039 + return len; 4040 + } 4041 + 4042 + int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 4043 + u64 logical, u64 len, int mirror_num) 4044 + { 4045 + struct extent_map *em; 4046 + struct map_lookup *map; 4047 + struct extent_map_tree *em_tree = &map_tree->map_tree; 4048 + int ret = 0; 4049 + 4050 + read_lock(&em_tree->lock); 4051 + em = lookup_extent_mapping(em_tree, logical, len); 4052 + 
read_unlock(&em_tree->lock); 4053 + BUG_ON(!em); 4054 + 4055 + BUG_ON(em->start > logical || em->start + em->len < logical); 4056 + map = (struct map_lookup *)em->bdev; 4057 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4058 + BTRFS_BLOCK_GROUP_RAID6)) 4059 + ret = 1; 4060 + free_extent_map(em); 4109 4061 return ret; 4110 4062 } 4111 4063 ··· 4192 4052 return optimal; 4193 4053 } 4194 4054 4055 + static inline int parity_smaller(u64 a, u64 b) 4056 + { 4057 + return a > b; 4058 + } 4059 + 4060 + /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 4061 + static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4062 + { 4063 + struct btrfs_bio_stripe s; 4064 + int i; 4065 + u64 l; 4066 + int again = 1; 4067 + 4068 + while (again) { 4069 + again = 0; 4070 + for (i = 0; i < bbio->num_stripes - 1; i++) { 4071 + if (parity_smaller(raid_map[i], raid_map[i+1])) { 4072 + s = bbio->stripes[i]; 4073 + l = raid_map[i]; 4074 + bbio->stripes[i] = bbio->stripes[i+1]; 4075 + raid_map[i] = raid_map[i+1]; 4076 + bbio->stripes[i+1] = s; 4077 + raid_map[i+1] = l; 4078 + again = 1; 4079 + } 4080 + } 4081 + } 4082 + } 4083 + 4195 4084 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4196 4085 u64 logical, u64 *length, 4197 4086 struct btrfs_bio **bbio_ret, 4198 - int mirror_num) 4087 + int mirror_num, u64 **raid_map_ret) 4199 4088 { 4200 4089 struct extent_map *em; 4201 4090 struct map_lookup *map; ··· 4236 4067 u64 stripe_nr; 4237 4068 u64 stripe_nr_orig; 4238 4069 u64 stripe_nr_end; 4070 + u64 stripe_len; 4071 + u64 *raid_map = NULL; 4239 4072 int stripe_index; 4240 4073 int i; 4241 4074 int ret = 0; ··· 4249 4078 int num_alloc_stripes; 4250 4079 int patch_the_first_stripe_for_dev_replace = 0; 4251 4080 u64 physical_to_patch_in_first_stripe = 0; 4081 + u64 raid56_full_stripe_start = (u64)-1; 4252 4082 4253 4083 read_lock(&em_tree->lock); 4254 4084 em = lookup_extent_mapping(em_tree, logical, *length); ··· 4266 4094 map = (struct 
map_lookup *)em->bdev; 4267 4095 offset = logical - em->start; 4268 4096 4097 + if (mirror_num > map->num_stripes) 4098 + mirror_num = 0; 4099 + 4100 + stripe_len = map->stripe_len; 4269 4101 stripe_nr = offset; 4270 4102 /* 4271 4103 * stripe_nr counts the total number of stripes we have to stride 4272 4104 * to get to this block 4273 4105 */ 4274 - do_div(stripe_nr, map->stripe_len); 4106 + do_div(stripe_nr, stripe_len); 4275 4107 4276 - stripe_offset = stripe_nr * map->stripe_len; 4108 + stripe_offset = stripe_nr * stripe_len; 4277 4109 BUG_ON(offset < stripe_offset); 4278 4110 4279 4111 /* stripe_offset is the offset of this block in its stripe*/ 4280 4112 stripe_offset = offset - stripe_offset; 4281 4113 4282 - if (rw & REQ_DISCARD) 4114 + /* if we're here for raid56, we need to know the stripe aligned start */ 4115 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4116 + unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 4117 + raid56_full_stripe_start = offset; 4118 + 4119 + /* allow a write of a full stripe, but make sure we don't 4120 + * allow straddling of stripes 4121 + */ 4122 + do_div(raid56_full_stripe_start, full_stripe_len); 4123 + raid56_full_stripe_start *= full_stripe_len; 4124 + } 4125 + 4126 + if (rw & REQ_DISCARD) { 4127 + /* we don't discard raid56 yet */ 4128 + if (map->type & 4129 + (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 4130 + ret = -EOPNOTSUPP; 4131 + goto out; 4132 + } 4283 4133 *length = min_t(u64, em->len - offset, *length); 4284 - else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4285 - /* we limit the length of each bio to what fits in a stripe */ 4286 - *length = min_t(u64, em->len - offset, 4287 - map->stripe_len - stripe_offset); 4134 + } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4135 + u64 max_len; 4136 + /* For writes to RAID[56], allow a full stripeset across all disks. 
4137 + For other RAID types and for RAID[56] reads, just allow a single 4138 + stripe (on a single disk). */ 4139 + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && 4140 + (rw & REQ_WRITE)) { 4141 + max_len = stripe_len * nr_data_stripes(map) - 4142 + (offset - raid56_full_stripe_start); 4143 + } else { 4144 + /* we limit the length of each bio to what fits in a stripe */ 4145 + max_len = stripe_len - stripe_offset; 4146 + } 4147 + *length = min_t(u64, em->len - offset, max_len); 4288 4148 } else { 4289 4149 *length = em->len - offset; 4290 4150 } 4291 4151 4152 + /* This is for when we're called from btrfs_merge_bio_hook() and all 4153 + it cares about is the length */ 4292 4154 if (!bbio_ret) 4293 4155 goto out; 4294 4156 ··· 4355 4149 u64 physical_of_found = 0; 4356 4150 4357 4151 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4358 - logical, &tmp_length, &tmp_bbio, 0); 4152 + logical, &tmp_length, &tmp_bbio, 0, NULL); 4359 4153 if (ret) { 4360 4154 WARN_ON(tmp_bbio != NULL); 4361 4155 goto out; ··· 4421 4215 do_div(stripe_nr_end, map->stripe_len); 4422 4216 stripe_end_offset = stripe_nr_end * map->stripe_len - 4423 4217 (offset + *length); 4218 + 4424 4219 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4425 4220 if (rw & REQ_DISCARD) 4426 4221 num_stripes = min_t(u64, map->num_stripes, ··· 4471 4264 current->pid % map->sub_stripes, 4472 4265 dev_replace_is_ongoing); 4473 4266 mirror_num = stripe_index - old_stripe_index + 1; 4267 + } 4268 + 4269 + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4270 + BTRFS_BLOCK_GROUP_RAID6)) { 4271 + u64 tmp; 4272 + 4273 + if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) 4274 + && raid_map_ret) { 4275 + int i, rot; 4276 + 4277 + /* push stripe_nr back to the start of the full stripe */ 4278 + stripe_nr = raid56_full_stripe_start; 4279 + do_div(stripe_nr, stripe_len); 4280 + 4281 + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 4282 + 4283 + /* RAID[56] write or recovery. 
Return all stripes */ 4284 + num_stripes = map->num_stripes; 4285 + max_errors = nr_parity_stripes(map); 4286 + 4287 + raid_map = kmalloc(sizeof(u64) * num_stripes, 4288 + GFP_NOFS); 4289 + if (!raid_map) { 4290 + ret = -ENOMEM; 4291 + goto out; 4292 + } 4293 + 4294 + /* Work out the disk rotation on this stripe-set */ 4295 + tmp = stripe_nr; 4296 + rot = do_div(tmp, num_stripes); 4297 + 4298 + /* Fill in the logical address of each stripe */ 4299 + tmp = stripe_nr * nr_data_stripes(map); 4300 + for (i = 0; i < nr_data_stripes(map); i++) 4301 + raid_map[(i+rot) % num_stripes] = 4302 + em->start + (tmp + i) * map->stripe_len; 4303 + 4304 + raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; 4305 + if (map->type & BTRFS_BLOCK_GROUP_RAID6) 4306 + raid_map[(i+rot+1) % num_stripes] = 4307 + RAID6_Q_STRIPE; 4308 + 4309 + *length = map->stripe_len; 4310 + stripe_index = 0; 4311 + stripe_offset = 0; 4312 + } else { 4313 + /* 4314 + * Mirror #0 or #1 means the original data block. 4315 + * Mirror #2 is RAID5 parity block. 4316 + * Mirror #3 is RAID6 Q block. 
4317 + */ 4318 + stripe_index = do_div(stripe_nr, nr_data_stripes(map)); 4319 + if (mirror_num > 1) 4320 + stripe_index = nr_data_stripes(map) + 4321 + mirror_num - 2; 4322 + 4323 + /* We distribute the parity blocks across stripes */ 4324 + tmp = stripe_nr + stripe_index; 4325 + stripe_index = do_div(tmp, map->num_stripes); 4474 4326 } 4475 4327 } else { 4476 4328 /* ··· 4639 4373 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4640 4374 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4641 4375 BTRFS_BLOCK_GROUP_RAID10 | 4376 + BTRFS_BLOCK_GROUP_RAID5 | 4642 4377 BTRFS_BLOCK_GROUP_DUP)) { 4643 4378 max_errors = 1; 4379 + } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { 4380 + max_errors = 2; 4644 4381 } 4645 4382 } 4646 4383 ··· 4744 4475 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4745 4476 bbio->mirror_num = map->num_stripes + 1; 4746 4477 } 4478 + if (raid_map) { 4479 + sort_parity_stripes(bbio, raid_map); 4480 + *raid_map_ret = raid_map; 4481 + } 4747 4482 out: 4748 4483 if (dev_replace_is_ongoing) 4749 4484 btrfs_dev_replace_unlock(dev_replace); ··· 4760 4487 struct btrfs_bio **bbio_ret, int mirror_num) 4761 4488 { 4762 4489 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4763 - mirror_num); 4490 + mirror_num, NULL); 4764 4491 } 4765 4492 4766 4493 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, ··· 4774 4501 u64 bytenr; 4775 4502 u64 length; 4776 4503 u64 stripe_nr; 4504 + u64 rmap_len; 4777 4505 int i, j, nr = 0; 4778 4506 4779 4507 read_lock(&em_tree->lock); ··· 4785 4511 map = (struct map_lookup *)em->bdev; 4786 4512 4787 4513 length = em->len; 4514 + rmap_len = map->stripe_len; 4515 + 4788 4516 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4789 4517 do_div(length, map->num_stripes / map->sub_stripes); 4790 4518 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4791 4519 do_div(length, map->num_stripes); 4520 + else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4521 + BTRFS_BLOCK_GROUP_RAID6)) { 4522 + do_div(length, 
nr_data_stripes(map)); 4523 + rmap_len = map->stripe_len * nr_data_stripes(map); 4524 + } 4792 4525 4793 4526 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4794 4527 BUG_ON(!buf); /* -ENOMEM */ ··· 4815 4534 do_div(stripe_nr, map->sub_stripes); 4816 4535 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4817 4536 stripe_nr = stripe_nr * map->num_stripes + i; 4818 - } 4819 - bytenr = chunk_start + stripe_nr * map->stripe_len; 4537 + } /* else if RAID[56], multiply by nr_data_stripes(). 4538 + * Alternatively, just use rmap_len below instead of 4539 + * map->stripe_len */ 4540 + 4541 + bytenr = chunk_start + stripe_nr * rmap_len; 4820 4542 WARN_ON(nr >= map->num_stripes); 4821 4543 for (j = 0; j < nr; j++) { 4822 4544 if (buf[j] == bytenr) ··· 4833 4549 4834 4550 *logical = buf; 4835 4551 *naddrs = nr; 4836 - *stripe_len = map->stripe_len; 4552 + *stripe_len = rmap_len; 4837 4553 4838 4554 free_extent_map(em); 4839 4555 return 0; ··· 4907 4623 bio->bi_bdev = (struct block_device *) 4908 4624 (unsigned long)bbio->mirror_num; 4909 4625 /* only send an error to the higher layers if it is 4910 - * beyond the tolerance of the multi-bio 4626 + * beyond the tolerance of the btrfs bio 4911 4627 */ 4912 4628 if (atomic_read(&bbio->error) > bbio->max_errors) { 4913 4629 err = -EIO; ··· 4941 4657 * This will add one bio to the pending list for a device and make sure 4942 4658 * the work struct is scheduled. 
4943 4659 */ 4944 - static noinline void schedule_bio(struct btrfs_root *root, 4660 + noinline void btrfs_schedule_bio(struct btrfs_root *root, 4945 4661 struct btrfs_device *device, 4946 4662 int rw, struct bio *bio) 4947 4663 { 4948 4664 int should_queue = 1; 4949 4665 struct btrfs_pending_bios *pending_bios; 4666 + 4667 + if (device->missing || !device->bdev) { 4668 + bio_endio(bio, -EIO); 4669 + return; 4670 + } 4950 4671 4951 4672 /* don't bother with additional async steps for reads, right now */ 4952 4673 if (!(rw & REQ_WRITE)) { ··· 5050 4761 #endif 5051 4762 bio->bi_bdev = dev->bdev; 5052 4763 if (async) 5053 - schedule_bio(root, dev, rw, bio); 4764 + btrfs_schedule_bio(root, dev, rw, bio); 5054 4765 else 5055 4766 btrfsic_submit_bio(rw, bio); 5056 4767 } ··· 5109 4820 u64 logical = (u64)bio->bi_sector << 9; 5110 4821 u64 length = 0; 5111 4822 u64 map_length; 4823 + u64 *raid_map = NULL; 5112 4824 int ret; 5113 4825 int dev_nr = 0; 5114 4826 int total_devs = 1; ··· 5118 4828 length = bio->bi_size; 5119 4829 map_length = length; 5120 4830 5121 - ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5122 - mirror_num); 5123 - if (ret) 4831 + ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 4832 + mirror_num, &raid_map); 4833 + if (ret) /* -ENOMEM */ 5124 4834 return ret; 5125 4835 5126 4836 total_devs = bbio->num_stripes; 4837 + bbio->orig_bio = first_bio; 4838 + bbio->private = first_bio->bi_private; 4839 + bbio->end_io = first_bio->bi_end_io; 4840 + atomic_set(&bbio->stripes_pending, bbio->num_stripes); 4841 + 4842 + if (raid_map) { 4843 + /* In this case, map_length has been set to the length of 4844 + a single stripe; not the whole write */ 4845 + if (rw & WRITE) { 4846 + return raid56_parity_write(root, bio, bbio, 4847 + raid_map, map_length); 4848 + } else { 4849 + return raid56_parity_recover(root, bio, bbio, 4850 + raid_map, map_length, 4851 + mirror_num); 4852 + } 4853 + } 4854 + 5127 4855 if (map_length < 
length) { 5128 4856 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " 5129 4857 "len %llu\n", (unsigned long long)logical, ··· 5149 4841 (unsigned long long)map_length); 5150 4842 BUG(); 5151 4843 } 5152 - 5153 - bbio->orig_bio = first_bio; 5154 - bbio->private = first_bio->bi_private; 5155 - bbio->end_io = first_bio->bi_end_io; 5156 - atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5157 4844 5158 4845 while (dev_nr < total_devs) { 5159 4846 dev = bbio->stripes[dev_nr].dev;
+8 -1
fs/btrfs/volumes.h
··· 321 321 void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 322 322 struct btrfs_device *tgtdev); 323 323 int btrfs_scratch_superblock(struct btrfs_device *device); 324 - 324 + void btrfs_schedule_bio(struct btrfs_root *root, 325 + struct btrfs_device *device, 326 + int rw, struct bio *bio); 327 + int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 328 + u64 logical, u64 len, int mirror_num); 329 + unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 330 + struct btrfs_mapping_tree *map_tree, 331 + u64 logical); 325 332 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 326 333 int index) 327 334 {