md/r5cache: read data into orig_page for prexor of cached data

With write back cache, we use orig_page to do prexor. This patch
makes sure we read data into orig_page for it.

Flag R5_OrigPageUPTDODATE is added to show whether orig_page
has the latest data from raid disk.

We introduce a helper function uptodate_for_rmw() to simplify
a couple of conditions in handle_stripe_dirtying().

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>

authored by Song Liu and committed by Shaohua Li 86aa1397 d46d29f0

+42 -9
+2
drivers/md/raid5-cache.c
··· 2349 struct page *p = sh->dev[i].orig_page; 2350 2351 sh->dev[i].orig_page = sh->dev[i].page; 2352 if (!using_disk_info_extra_page) 2353 put_page(p); 2354 }
··· 2349 struct page *p = sh->dev[i].orig_page; 2350 2351 sh->dev[i].orig_page = sh->dev[i].page; 2352 + clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2353 + 2354 if (!using_disk_info_extra_page) 2355 put_page(p); 2356 }
+35 -9
drivers/md/raid5.c
··· 1015 1016 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1017 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1018 - sh->dev[i].vec.bv_page = sh->dev[i].page; 1019 bi->bi_vcnt = 1; 1020 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1021 bi->bi_io_vec[0].bv_offset = 0; ··· 2390 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2391 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2392 2393 if (atomic_read(&rdev->read_errors)) 2394 atomic_set(&rdev->read_errors, 0); 2395 } else { ··· 3611 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3612 } 3613 3614 static int handle_stripe_dirtying(struct r5conf *conf, 3615 struct stripe_head *sh, 3616 struct stripe_head_state *s, ··· 3657 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || 3658 test_bit(R5_InJournal, &dev->flags)) && 3659 !test_bit(R5_LOCKED, &dev->flags) && 3660 - !((test_bit(R5_UPTODATE, &dev->flags) && 3661 - (!test_bit(R5_InJournal, &dev->flags) || 3662 - dev->page != dev->orig_page)) || 3663 test_bit(R5_Wantcompute, &dev->flags))) { 3664 if (test_bit(R5_Insync, &dev->flags)) 3665 rmw++; ··· 3669 i != sh->pd_idx && i != sh->qd_idx && 3670 !test_bit(R5_LOCKED, &dev->flags) && 3671 !(test_bit(R5_UPTODATE, &dev->flags) || 3672 - test_bit(R5_InJournal, &dev->flags) || 3673 test_bit(R5_Wantcompute, &dev->flags))) { 3674 if (test_bit(R5_Insync, &dev->flags)) 3675 rcw++; ··· 3722 i == sh->pd_idx || i == sh->qd_idx || 3723 test_bit(R5_InJournal, &dev->flags)) && 3724 !test_bit(R5_LOCKED, &dev->flags) && 3725 - !((test_bit(R5_UPTODATE, &dev->flags) && 3726 - (!test_bit(R5_InJournal, &dev->flags) || 3727 - dev->page != dev->orig_page)) || 3728 test_bit(R5_Wantcompute, &dev->flags)) && 3729 test_bit(R5_Insync, &dev->flags)) { 3730 if (test_bit(STRIPE_PREREAD_ACTIVE, ··· 3749 i != sh->pd_idx && i != sh->qd_idx && 3750 !test_bit(R5_LOCKED, &dev->flags) && 3751 !(test_bit(R5_UPTODATE, &dev->flags) || 3752 - test_bit(R5_InJournal, &dev->flags) || 3753 test_bit(R5_Wantcompute, &dev->flags))) { 3754 
rcw++; 3755 if (test_bit(R5_Insync, &dev->flags) &&
··· 1015 1016 if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) 1017 WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); 1018 + 1019 + if (!op_is_write(op) && 1020 + test_bit(R5_InJournal, &sh->dev[i].flags)) 1021 + /* 1022 + * issuing read for a page in journal, this 1023 + * must be preparing for prexor in rmw; read 1024 + * the data into orig_page 1025 + */ 1026 + sh->dev[i].vec.bv_page = sh->dev[i].orig_page; 1027 + else 1028 + sh->dev[i].vec.bv_page = sh->dev[i].page; 1029 bi->bi_vcnt = 1; 1030 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 1031 bi->bi_io_vec[0].bv_offset = 0; ··· 2380 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) 2381 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); 2382 2383 + if (test_bit(R5_InJournal, &sh->dev[i].flags)) 2384 + /* 2385 + * end read for a page in journal, this 2386 + * must be preparing for prexor in rmw 2387 + */ 2388 + set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); 2389 + 2390 if (atomic_read(&rdev->read_errors)) 2391 atomic_set(&rdev->read_errors, 0); 2392 } else { ··· 3594 break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); 3595 } 3596 3597 + /* 3598 + * For RMW in write back cache, we need extra page in prexor to store the 3599 + * old data. This page is stored in dev->orig_page. 3600 + * 3601 + * This function checks whether we have data for prexor. 
The exact logic 3602 + * is: 3603 + * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) 3604 + */ 3605 + static inline bool uptodate_for_rmw(struct r5dev *dev) 3606 + { 3607 + return (test_bit(R5_UPTODATE, &dev->flags)) && 3608 + (!test_bit(R5_InJournal, &dev->flags) || 3609 + test_bit(R5_OrigPageUPTDODATE, &dev->flags)); 3610 + } 3611 + 3612 static int handle_stripe_dirtying(struct r5conf *conf, 3613 struct stripe_head *sh, 3614 struct stripe_head_state *s, ··· 3625 if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || 3626 test_bit(R5_InJournal, &dev->flags)) && 3627 !test_bit(R5_LOCKED, &dev->flags) && 3628 + !(uptodate_for_rmw(dev) || 3629 test_bit(R5_Wantcompute, &dev->flags))) { 3630 if (test_bit(R5_Insync, &dev->flags)) 3631 rmw++; ··· 3639 i != sh->pd_idx && i != sh->qd_idx && 3640 !test_bit(R5_LOCKED, &dev->flags) && 3641 !(test_bit(R5_UPTODATE, &dev->flags) || 3642 test_bit(R5_Wantcompute, &dev->flags))) { 3643 if (test_bit(R5_Insync, &dev->flags)) 3644 rcw++; ··· 3693 i == sh->pd_idx || i == sh->qd_idx || 3694 test_bit(R5_InJournal, &dev->flags)) && 3695 !test_bit(R5_LOCKED, &dev->flags) && 3696 + !(uptodate_for_rmw(dev) || 3697 test_bit(R5_Wantcompute, &dev->flags)) && 3698 test_bit(R5_Insync, &dev->flags)) { 3699 if (test_bit(STRIPE_PREREAD_ACTIVE, ··· 3722 i != sh->pd_idx && i != sh->qd_idx && 3723 !test_bit(R5_LOCKED, &dev->flags) && 3724 !(test_bit(R5_UPTODATE, &dev->flags) || 3725 test_bit(R5_Wantcompute, &dev->flags))) { 3726 rcw++; 3727 if (test_bit(R5_Insync, &dev->flags) &&
+5
drivers/md/raid5.h
··· 322 * data and parity being written are in the journal 323 * device 324 */ 325 }; 326 327 /*
··· 322 * data and parity being written are in the journal 323 * device 324 */ 325 + R5_OrigPageUPTDODATE, /* with write back cache, we read old data into 326 + * dev->orig_page for prexor. When this flag is 327 + * set, orig_page contains latest data in the 328 + * raid disk. 329 + */ 330 }; 331 332 /*