Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

md/raid5: let multiple devices of stripe_head share page

In the current implementation, grow_buffers() uses alloc_page() to
allocate the buffers for each stripe_head, i.e. it allocates one page
for each dev[i] in the stripe_head.

Now that stripe_size is a configurable value set by writing a sysfs
entry, this means that on arm64 with a 64KB PAGE_SIZE we always
allocate 64K buffers but use only 4K of each when stripe_size is 4K.

To avoid wasting memory, we let multiple sh->dev entries share
one real page. That is, multiple sh->dev[i].page pointers will point
to the same page at different offsets. An example with 64K PAGE_SIZE
and 4K stripe_size follows:

                      64K PAGE_SIZE
     +---+---+---+---+------------------------------+
     |   |   |   |   |
     |   |   |   |   |
     +-+-+-+-+-+-+-+-+------------------------------+
       ^   ^   ^   ^
       |   |   |   +----------------------------+
       |   |   |                                |
       |   |   +-------------------+            |
       |   |                       |            |
       |   +----------+            |            |
       |              |            |            |
       +-+            |            |            |
         |            |            |            |
   +-----+-----+------+-----+------+-----+------+------+
sh | offset(0) | offset(4K) | offset(8K) | offset(12K) |
 + +-----------+------------+------------+-------------+
 +----> dev[0].page  dev[1].page  dev[2].page  dev[3].page

A new 'pages' array is added to stripe_head to record the shared
pages used by this stripe_head. The pages are allocated in
grow_buffers() and freed in shrink_buffers().

Once pages are shared, the users of sh->dev[i].page need to take
care of the related page offset: the page of an issued bio and the page
passed to the xor computation functions. Thanks to the previously added
support for different page offsets, we just need to set the correct
dev[i].offset here.

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>

authored by

Yufen Yu and committed by
Song Liu
046169f0 4f86ff55

+114 -3
+87 -2
drivers/md/raid5.c
··· 448 448 return sh; 449 449 } 450 450 451 + #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 452 + static void free_stripe_pages(struct stripe_head *sh) 453 + { 454 + int i; 455 + struct page *p; 456 + 457 + /* Have not allocate page pool */ 458 + if (!sh->pages) 459 + return; 460 + 461 + for (i = 0; i < sh->nr_pages; i++) { 462 + p = sh->pages[i]; 463 + if (p) 464 + put_page(p); 465 + sh->pages[i] = NULL; 466 + } 467 + } 468 + 469 + static int alloc_stripe_pages(struct stripe_head *sh, gfp_t gfp) 470 + { 471 + int i; 472 + struct page *p; 473 + 474 + for (i = 0; i < sh->nr_pages; i++) { 475 + /* The page have allocated. */ 476 + if (sh->pages[i]) 477 + continue; 478 + 479 + p = alloc_page(gfp); 480 + if (!p) { 481 + free_stripe_pages(sh); 482 + return -ENOMEM; 483 + } 484 + sh->pages[i] = p; 485 + } 486 + return 0; 487 + } 488 + 489 + static int 490 + init_stripe_shared_pages(struct stripe_head *sh, struct r5conf *conf, int disks) 491 + { 492 + int nr_pages, cnt; 493 + 494 + if (sh->pages) 495 + return 0; 496 + 497 + /* Each of the sh->dev[i] need one conf->stripe_size */ 498 + cnt = PAGE_SIZE / conf->stripe_size; 499 + nr_pages = (disks + cnt - 1) / cnt; 500 + 501 + sh->pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); 502 + if (!sh->pages) 503 + return -ENOMEM; 504 + sh->nr_pages = nr_pages; 505 + sh->stripes_per_page = cnt; 506 + return 0; 507 + } 508 + #endif 509 + 451 510 static void shrink_buffers(struct stripe_head *sh) 452 511 { 453 - struct page *p; 454 512 int i; 455 513 int num = sh->raid_conf->pool_size; 456 514 515 + #if PAGE_SIZE == DEFAULT_STRIPE_SIZE 457 516 for (i = 0; i < num ; i++) { 517 + struct page *p; 518 + 458 519 WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); 459 520 p = sh->dev[i].page; 460 521 if (!p) ··· 523 462 sh->dev[i].page = NULL; 524 463 put_page(p); 525 464 } 465 + #else 466 + for (i = 0; i < num; i++) 467 + sh->dev[i].page = NULL; 468 + free_stripe_pages(sh); /* Free pages */ 469 + #endif 526 470 } 527 471 528 472 static 
int grow_buffers(struct stripe_head *sh, gfp_t gfp) ··· 535 469 int i; 536 470 int num = sh->raid_conf->pool_size; 537 471 472 + #if PAGE_SIZE == DEFAULT_STRIPE_SIZE 538 473 for (i = 0; i < num; i++) { 539 474 struct page *page; 540 475 ··· 546 479 sh->dev[i].orig_page = page; 547 480 sh->dev[i].offset = 0; 548 481 } 482 + #else 483 + if (alloc_stripe_pages(sh, gfp)) 484 + return -ENOMEM; 549 485 486 + for (i = 0; i < num; i++) { 487 + sh->dev[i].page = raid5_get_dev_page(sh, i); 488 + sh->dev[i].orig_page = sh->dev[i].page; 489 + sh->dev[i].offset = raid5_get_page_offset(sh, i); 490 + } 491 + #endif 550 492 return 0; 551 493 } 552 494 ··· 2281 2205 2282 2206 static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) 2283 2207 { 2208 + #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 2209 + kfree(sh->pages); 2210 + #endif 2284 2211 if (sh->ppl_page) 2285 2212 __free_page(sh->ppl_page); 2286 2213 kmem_cache_free(sc, sh); ··· 2317 2238 sh->ppl_page = alloc_page(gfp); 2318 2239 if (!sh->ppl_page) { 2319 2240 free_stripe(sc, sh); 2320 - sh = NULL; 2241 + return NULL; 2321 2242 } 2322 2243 } 2244 + #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 2245 + if (init_stripe_shared_pages(sh, conf, disks)) { 2246 + free_stripe(sc, sh); 2247 + return NULL; 2248 + } 2249 + #endif 2323 2250 } 2324 2251 return sh; 2325 2252 }
+27 -1
drivers/md/raid5.h
··· 195 195 reconstruct_state_result, 196 196 }; 197 197 198 + #define DEFAULT_STRIPE_SIZE 4096 198 199 struct stripe_head { 199 200 struct hlist_node hash; 200 201 struct list_head lru; /* inactive_list or handle_list */ ··· 247 246 int target, target2; 248 247 enum sum_check_flags zero_sum_result; 249 248 } ops; 249 + 250 + #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 251 + /* These pages will be used by bios in dev[i] */ 252 + struct page **pages; 253 + int nr_pages; /* page array size */ 254 + int stripes_per_page; 255 + #endif 250 256 struct r5dev { 251 257 /* rreq and rvec are used for the replacement device when 252 258 * writing data to both devices. ··· 481 473 */ 482 474 483 475 #define NR_STRIPES 256 484 - #define DEFAULT_STRIPE_SIZE 4096 485 476 486 477 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE 487 478 #define STRIPE_SIZE PAGE_SIZE ··· 778 771 { 779 772 return layout >= 8 && layout <= 10; 780 773 } 774 + 775 + #if PAGE_SIZE != DEFAULT_STRIPE_SIZE 776 + /* 777 + * Return offset of the corresponding page for r5dev. 778 + */ 779 + static inline int raid5_get_page_offset(struct stripe_head *sh, int disk_idx) 780 + { 781 + return (disk_idx % sh->stripes_per_page) * RAID5_STRIPE_SIZE(sh->raid_conf); 782 + } 783 + 784 + /* 785 + * Return corresponding page address for r5dev. 786 + */ 787 + static inline struct page * 788 + raid5_get_dev_page(struct stripe_head *sh, int disk_idx) 789 + { 790 + return sh->pages[disk_idx / sh->stripes_per_page]; 791 + } 792 + #endif 781 793 782 794 extern void md_raid5_kick_device(struct r5conf *conf); 783 795 extern int raid5_set_cache_size(struct mddev *mddev, int size);