Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'md/3.16' of git://neil.brown.name/md

Pull md updates from Neil Brown:
"Assorted md fixes for 3.16

Mostly performance improvements with a few corner-case bug fixes"

* tag 'md/3.16' of git://neil.brown.name/md:
raid5: speed up sync_request processing
md/raid5: deadlock between retry_aligned_read and barrier io
raid5: add an option to avoid copying data from bio to stripe cache
md/bitmap: remove confusing code from filemap_get_page.
raid5: avoid release list until last reference of the stripe
md: md_clear_badblocks should return an error code on failure.
md/raid56: Don't perform reads to support writes until stripe is ready.
md: refuse to change shape of array if it is active but read-only

+138 -42
+1 -5
drivers/md/bitmap.c
··· 669 669 /* 670 670 * return a pointer to the page in the filemap that contains the given bit 671 671 * 672 - * this lookup is complicated by the fact that the bitmap sb might be exactly 673 - * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page 674 - * 0 or page 1 675 672 */ 676 673 static inline struct page *filemap_get_page(struct bitmap_storage *store, 677 674 unsigned long chunk) 678 675 { 679 676 if (file_page_index(store, chunk) >= store->file_pages) 680 677 return NULL; 681 - return store->filemap[file_page_index(store, chunk) 682 - - file_page_index(store, 0)]; 678 + return store->filemap[file_page_index(store, chunk)]; 683 679 } 684 680 685 681 static int bitmap_storage_alloc(struct bitmap_storage *store,
+11 -1
drivers/md/md.c
··· 3448 3448 mddev->level = LEVEL_NONE; 3449 3449 return rv; 3450 3450 } 3451 + if (mddev->ro) 3452 + return -EROFS; 3451 3453 3452 3454 /* request to change the personality. Need to ensure: 3453 3455 * - array is not engaged in resync/recovery/reshape ··· 3636 3634 int err; 3637 3635 if (mddev->pers->check_reshape == NULL) 3638 3636 return -EBUSY; 3637 + if (mddev->ro) 3638 + return -EROFS; 3639 3639 mddev->new_layout = n; 3640 3640 err = mddev->pers->check_reshape(mddev); 3641 3641 if (err) { ··· 3727 3723 int err; 3728 3724 if (mddev->pers->check_reshape == NULL) 3729 3725 return -EBUSY; 3726 + if (mddev->ro) 3727 + return -EROFS; 3730 3728 mddev->new_chunk_sectors = n >> 9; 3731 3729 err = mddev->pers->check_reshape(mddev); 3732 3730 if (err) { ··· 6141 6135 */ 6142 6136 if (mddev->sync_thread) 6143 6137 return -EBUSY; 6138 + if (mddev->ro) 6139 + return -EROFS; 6144 6140 6145 6141 rdev_for_each(rdev, mddev) { 6146 6142 sector_t avail = rdev->sectors; ··· 6165 6157 /* change the number of raid disks */ 6166 6158 if (mddev->pers->check_reshape == NULL) 6167 6159 return -EINVAL; 6160 + if (mddev->ro) 6161 + return -EROFS; 6168 6162 if (raid_disks <= 0 || 6169 6163 (mddev->max_disks && raid_disks >= mddev->max_disks)) 6170 6164 return -EINVAL; ··· 8343 8333 if (a < s) { 8344 8334 /* we need to split this range */ 8345 8335 if (bb->count >= MD_MAX_BADBLOCKS) { 8346 - rv = 0; 8336 + rv = -ENOSPC; 8347 8337 goto out; 8348 8338 } 8349 8339 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+123 -35
drivers/md/raid5.c
··· 292 292 BUG_ON(atomic_read(&conf->active_stripes)==0); 293 293 if (test_bit(STRIPE_HANDLE, &sh->state)) { 294 294 if (test_bit(STRIPE_DELAYED, &sh->state) && 295 - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 295 + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 296 296 list_add_tail(&sh->lru, &conf->delayed_list); 297 - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 297 + if (atomic_read(&conf->preread_active_stripes) 298 + < IO_THRESHOLD) 299 + md_wakeup_thread(conf->mddev->thread); 300 + } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 298 301 sh->bm_seq - conf->seq_write > 0) 299 302 list_add_tail(&sh->lru, &conf->bitmap_list); 300 303 else { ··· 416 413 int hash; 417 414 bool wakeup; 418 415 416 + /* Avoid release_list until the last reference. 417 + */ 418 + if (atomic_add_unless(&sh->count, -1, 1)) 419 + return; 420 + 419 421 if (unlikely(!conf->mddev->thread) || 420 422 test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) 421 423 goto slow_path; ··· 487 479 int num = sh->raid_conf->pool_size; 488 480 489 481 for (i = 0; i < num ; i++) { 482 + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 490 483 p = sh->dev[i].page; 491 484 if (!p) 492 485 continue; ··· 508 499 return 1; 509 500 } 510 501 sh->dev[i].page = page; 502 + sh->dev[i].orig_page = page; 511 503 } 512 504 return 0; 513 505 } ··· 865 855 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 866 856 bi->bi_rw |= REQ_NOMERGE; 867 857 858 + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 859 + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 860 + sh->dev[i].vec.bv_page = sh->dev[i].page; 868 861 bi->bi_vcnt = 1; 869 862 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 870 863 bi->bi_io_vec[0].bv_offset = 0; ··· 912 899 else 913 900 rbi->bi_iter.bi_sector = (sh->sector 914 901 + rrdev->data_offset); 902 + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 903 + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 904 + sh->dev[i].rvec.bv_page = sh->dev[i].page; 915 905 rbi->bi_vcnt = 1; 916 906 
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; 917 907 rbi->bi_io_vec[0].bv_offset = 0; ··· 943 927 } 944 928 945 929 static struct dma_async_tx_descriptor * 946 - async_copy_data(int frombio, struct bio *bio, struct page *page, 947 - sector_t sector, struct dma_async_tx_descriptor *tx) 930 + async_copy_data(int frombio, struct bio *bio, struct page **page, 931 + sector_t sector, struct dma_async_tx_descriptor *tx, 932 + struct stripe_head *sh) 948 933 { 949 934 struct bio_vec bvl; 950 935 struct bvec_iter iter; ··· 982 965 if (clen > 0) { 983 966 b_offset += bvl.bv_offset; 984 967 bio_page = bvl.bv_page; 985 - if (frombio) 986 - tx = async_memcpy(page, bio_page, page_offset, 968 + if (frombio) { 969 + if (sh->raid_conf->skip_copy && 970 + b_offset == 0 && page_offset == 0 && 971 + clen == STRIPE_SIZE) 972 + *page = bio_page; 973 + else 974 + tx = async_memcpy(*page, bio_page, page_offset, 987 975 b_offset, clen, &submit); 988 - else 989 - tx = async_memcpy(bio_page, page, b_offset, 976 + } else 977 + tx = async_memcpy(bio_page, *page, b_offset, 990 978 page_offset, clen, &submit); 991 979 } 992 980 /* chain the operations */ ··· 1067 1045 spin_unlock_irq(&sh->stripe_lock); 1068 1046 while (rbi && rbi->bi_iter.bi_sector < 1069 1047 dev->sector + STRIPE_SECTORS) { 1070 - tx = async_copy_data(0, rbi, dev->page, 1071 - dev->sector, tx); 1048 + tx = async_copy_data(0, rbi, &dev->page, 1049 + dev->sector, tx, sh); 1072 1050 rbi = r5_next_bio(rbi, dev->sector); 1073 1051 } 1074 1052 } ··· 1406 1384 BUG_ON(dev->written); 1407 1385 wbi = dev->written = chosen; 1408 1386 spin_unlock_irq(&sh->stripe_lock); 1387 + WARN_ON(dev->page != dev->orig_page); 1409 1388 1410 1389 while (wbi && wbi->bi_iter.bi_sector < 1411 1390 dev->sector + STRIPE_SECTORS) { ··· 1416 1393 set_bit(R5_SyncIO, &dev->flags); 1417 1394 if (wbi->bi_rw & REQ_DISCARD) 1418 1395 set_bit(R5_Discard, &dev->flags); 1419 - else 1420 - tx = async_copy_data(1, wbi, dev->page, 1421 - dev->sector, tx); 1396 + else { 1397 + 
tx = async_copy_data(1, wbi, &dev->page, 1398 + dev->sector, tx, sh); 1399 + if (dev->page != dev->orig_page) { 1400 + set_bit(R5_SkipCopy, &dev->flags); 1401 + clear_bit(R5_UPTODATE, &dev->flags); 1402 + clear_bit(R5_OVERWRITE, &dev->flags); 1403 + } 1404 + } 1422 1405 wbi = r5_next_bio(wbi, dev->sector); 1423 1406 } 1424 1407 } ··· 1455 1426 struct r5dev *dev = &sh->dev[i]; 1456 1427 1457 1428 if (dev->written || i == pd_idx || i == qd_idx) { 1458 - if (!discard) 1429 + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) 1459 1430 set_bit(R5_UPTODATE, &dev->flags); 1460 1431 if (fua) 1461 1432 set_bit(R5_WantFUA, &dev->flags); ··· 1868 1839 osh = get_free_stripe(conf, hash); 1869 1840 unlock_device_hash_lock(conf, hash); 1870 1841 atomic_set(&nsh->count, 1); 1871 - for(i=0; i<conf->pool_size; i++) 1842 + for(i=0; i<conf->pool_size; i++) { 1872 1843 nsh->dev[i].page = osh->dev[i].page; 1844 + nsh->dev[i].orig_page = osh->dev[i].page; 1845 + } 1873 1846 for( ; i<newsize; i++) 1874 1847 nsh->dev[i].page = NULL; 1875 1848 nsh->hash_lock_index = hash; ··· 1927 1896 if (nsh->dev[i].page == NULL) { 1928 1897 struct page *p = alloc_page(GFP_NOIO); 1929 1898 nsh->dev[i].page = p; 1899 + nsh->dev[i].orig_page = p; 1930 1900 if (!p) 1931 1901 err = -ENOMEM; 1932 1902 } ··· 2165 2133 } 2166 2134 2167 2135 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 2168 - 2136 + 2169 2137 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 2170 2138 { 2171 2139 struct r5dev *dev = &sh->dev[i]; 2172 2140 2173 2141 bio_init(&dev->req); 2174 2142 dev->req.bi_io_vec = &dev->vec; 2175 - dev->req.bi_vcnt++; 2176 - dev->req.bi_max_vecs++; 2143 + dev->req.bi_max_vecs = 1; 2177 2144 dev->req.bi_private = sh; 2178 - dev->vec.bv_page = dev->page; 2179 2145 2180 2146 bio_init(&dev->rreq); 2181 2147 dev->rreq.bi_io_vec = &dev->rvec; 2182 - dev->rreq.bi_vcnt++; 2183 - dev->rreq.bi_max_vecs++; 2148 + dev->rreq.bi_max_vecs = 1; 2184 2149 
dev->rreq.bi_private = sh; 2185 - dev->rvec.bv_page = dev->page; 2186 2150 2187 2151 dev->flags = 0; 2188 2152 dev->sector = compute_blocknr(sh, i, previous); ··· 2778 2750 /* and fail all 'written' */ 2779 2751 bi = sh->dev[i].written; 2780 2752 sh->dev[i].written = NULL; 2753 + if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { 2754 + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 2755 + sh->dev[i].page = sh->dev[i].orig_page; 2756 + } 2757 + 2781 2758 if (bi) bitmap_end = 1; 2782 2759 while (bi && bi->bi_iter.bi_sector < 2783 2760 sh->dev[i].sector + STRIPE_SECTORS) { ··· 2919 2886 (s->failed >= 1 && fdev[0]->toread) || 2920 2887 (s->failed >= 2 && fdev[1]->toread) || 2921 2888 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && 2889 + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && 2922 2890 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || 2923 - (sh->raid_conf->level == 6 && s->failed && s->to_write))) { 2891 + (sh->raid_conf->level == 6 && s->failed && s->to_write && 2892 + s->to_write < sh->raid_conf->raid_disks - 2 && 2893 + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { 2924 2894 /* we would like to get this block, possibly by computing it, 2925 2895 * otherwise read it if the backing disk is insync 2926 2896 */ ··· 3027 2991 dev = &sh->dev[i]; 3028 2992 if (!test_bit(R5_LOCKED, &dev->flags) && 3029 2993 (test_bit(R5_UPTODATE, &dev->flags) || 3030 - test_bit(R5_Discard, &dev->flags))) { 2994 + test_bit(R5_Discard, &dev->flags) || 2995 + test_bit(R5_SkipCopy, &dev->flags))) { 3031 2996 /* We can return any write requests */ 3032 2997 struct bio *wbi, *wbi2; 3033 2998 pr_debug("Return write for disc %d\n", i); 3034 2999 if (test_and_clear_bit(R5_Discard, &dev->flags)) 3035 3000 clear_bit(R5_UPTODATE, &dev->flags); 3001 + if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { 3002 + WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); 3003 + dev->page = dev->orig_page; 3004 
+ } 3036 3005 wbi = dev->written; 3037 3006 dev->written = NULL; 3038 3007 while (wbi && wbi->bi_iter.bi_sector < ··· 3056 3015 0); 3057 3016 } else if (test_bit(R5_Discard, &dev->flags)) 3058 3017 discard_pending = 1; 3018 + WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); 3019 + WARN_ON(dev->page != dev->orig_page); 3059 3020 } 3060 3021 if (!discard_pending && 3061 3022 test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { ··· 3129 3086 !test_bit(R5_LOCKED, &dev->flags) && 3130 3087 !(test_bit(R5_UPTODATE, &dev->flags) || 3131 3088 test_bit(R5_Wantcompute, &dev->flags))) { 3132 - if (test_bit(R5_Insync, &dev->flags)) rcw++; 3089 + if (test_bit(R5_Insync, &dev->flags)) 3090 + rcw++; 3133 3091 else 3134 3092 rcw += 2*disks; 3135 3093 } ··· 3151 3107 !(test_bit(R5_UPTODATE, &dev->flags) || 3152 3108 test_bit(R5_Wantcompute, &dev->flags)) && 3153 3109 test_bit(R5_Insync, &dev->flags)) { 3154 - if ( 3155 - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3156 - pr_debug("Read_old block " 3157 - "%d for r-m-w\n", i); 3110 + if (test_bit(STRIPE_PREREAD_ACTIVE, 3111 + &sh->state)) { 3112 + pr_debug("Read_old block %d for r-m-w\n", 3113 + i); 3158 3114 set_bit(R5_LOCKED, &dev->flags); 3159 3115 set_bit(R5_Wantread, &dev->flags); 3160 3116 s->locked++; ··· 3177 3133 !(test_bit(R5_UPTODATE, &dev->flags) || 3178 3134 test_bit(R5_Wantcompute, &dev->flags))) { 3179 3135 rcw++; 3180 - if (!test_bit(R5_Insync, &dev->flags)) 3181 - continue; /* it's a failed drive */ 3182 - if ( 3183 - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3136 + if (test_bit(R5_Insync, &dev->flags) && 3137 + test_bit(STRIPE_PREREAD_ACTIVE, 3138 + &sh->state)) { 3184 3139 pr_debug("Read_old block " 3185 3140 "%d for Reconstruct\n", i); 3186 3141 set_bit(R5_LOCKED, &dev->flags); ··· 5074 5031 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 5075 5032 5076 5033 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 5034 + set_bit(STRIPE_HANDLE, &sh->state); 5077 5035 5078 - handle_stripe(sh); 5079 
5036 release_stripe(sh); 5080 5037 5081 5038 return STRIPE_SECTORS; ··· 5115 5072 /* already done this stripe */ 5116 5073 continue; 5117 5074 5118 - sh = get_active_stripe(conf, sector, 0, 1, 0); 5075 + sh = get_active_stripe(conf, sector, 0, 1, 1); 5119 5076 5120 5077 if (!sh) { 5121 5078 /* failed to get a stripe - must wait */ ··· 5398 5355 raid5_store_preread_threshold); 5399 5356 5400 5357 static ssize_t 5358 + raid5_show_skip_copy(struct mddev *mddev, char *page) 5359 + { 5360 + struct r5conf *conf = mddev->private; 5361 + if (conf) 5362 + return sprintf(page, "%d\n", conf->skip_copy); 5363 + else 5364 + return 0; 5365 + } 5366 + 5367 + static ssize_t 5368 + raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) 5369 + { 5370 + struct r5conf *conf = mddev->private; 5371 + unsigned long new; 5372 + if (len >= PAGE_SIZE) 5373 + return -EINVAL; 5374 + if (!conf) 5375 + return -ENODEV; 5376 + 5377 + if (kstrtoul(page, 10, &new)) 5378 + return -EINVAL; 5379 + new = !!new; 5380 + if (new == conf->skip_copy) 5381 + return len; 5382 + 5383 + mddev_suspend(mddev); 5384 + conf->skip_copy = new; 5385 + if (new) 5386 + mddev->queue->backing_dev_info.capabilities |= 5387 + BDI_CAP_STABLE_WRITES; 5388 + else 5389 + mddev->queue->backing_dev_info.capabilities &= 5390 + ~BDI_CAP_STABLE_WRITES; 5391 + mddev_resume(mddev); 5392 + return len; 5393 + } 5394 + 5395 + static struct md_sysfs_entry 5396 + raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, 5397 + raid5_show_skip_copy, 5398 + raid5_store_skip_copy); 5399 + 5400 + 5401 + static ssize_t 5401 5402 stripe_cache_active_show(struct mddev *mddev, char *page) 5402 5403 { 5403 5404 struct r5conf *conf = mddev->private; ··· 5526 5439 &raid5_stripecache_active.attr, 5527 5440 &raid5_preread_bypass_threshold.attr, 5528 5441 &raid5_group_thread_cnt.attr, 5442 + &raid5_skip_copy.attr, 5529 5443 NULL, 5530 5444 }; 5531 5445 static struct attribute_group raid5_attrs_group = {
+3 -1
drivers/md/raid5.h
··· 232 232 */ 233 233 struct bio req, rreq; 234 234 struct bio_vec vec, rvec; 235 - struct page *page; 235 + struct page *page, *orig_page; 236 236 struct bio *toread, *read, *towrite, *written; 237 237 sector_t sector; /* sector of this page */ 238 238 unsigned long flags; ··· 299 299 * data in, and now is a good time to write it out. 300 300 */ 301 301 R5_Discard, /* Discard the stripe */ 302 + R5_SkipCopy, /* Don't copy data from bio to stripe cache */ 302 303 }; 303 304 304 305 /* ··· 437 436 atomic_t pending_full_writes; /* full write backlog */ 438 437 int bypass_count; /* bypassed prereads */ 439 438 int bypass_threshold; /* preread nice */ 439 + int skip_copy; /* Don't copy data from bio to stripe cache */ 440 440 struct list_head *last_hold; /* detect hold_list promotions */ 441 441 442 442 atomic_t reshape_stripes; /* stripes with pending writes for reshape */