Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ore: RAID5 read

This patch introduces the first stage of RAID5 support
mainly the skip-over-raid-units behavior when reading. For
writes it inserts BLANK units where the XOR blocks
should be calculated and written to.

It introduces the new "general raid maths", and the main
additional parameters and components needed for raid5.

Since at this stage it could corrupt future versions that
actually do support raid5, the enablement of raid5
mounting and the setting of parity-count > 0 are disabled,
so the raid5 code will never be used. Mounting of raid5 is
only enabled later, once the basic XOR write is also in.
But if the patch "enable RAID5" is applied this code has
been tested to be able to properly read raid5 volumes
and is according to standard.

Also it has been tested that the new maths still properly
supports RAID0 and grouping code just as before.
(BTW: I have found more bugs in the pnfs-obj RAID math,
which are fixed here)

The ore.c file is getting too big, so new ore_raid.[hc]
files are added that will contain the special raid code
that is not used in striping and mirrors. Future write
support will make these bigger.
When adding the ore_raid.c to Kbuild file I was forced to
rename ore.ko to libore.ko. Is it possible to keep source
file, say ore.c and module file ore.ko the same even if there
are multiple files inside ore.ko?

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>

+473 -81
+2 -1
fs/exofs/Kbuild
··· 13 13 # 14 14 15 15 # ore module library 16 - obj-$(CONFIG_ORE) += ore.o 16 + libore-y := ore.o ore_raid.o 17 + obj-$(CONFIG_ORE) += libore.o 17 18 18 19 exofs-y := inode.o file.o symlink.o namei.o dir.o super.o 19 20 obj-$(CONFIG_EXOFS_FS) += exofs.o
+249 -77
fs/exofs/ore.c
··· 24 24 25 25 #include <linux/slab.h> 26 26 #include <asm/div64.h> 27 + #include <linux/lcm.h> 27 28 28 - #include <scsi/osd_ore.h> 29 - 30 - #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) 31 - 32 - #ifdef CONFIG_EXOFS_DEBUG 33 - #define ORE_DBGMSG(fmt, a...) \ 34 - printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) 35 - #else 36 - #define ORE_DBGMSG(fmt, a...) \ 37 - do { if (0) printk(fmt, ##a); } while (0) 38 - #endif 39 - 40 - /* u64 has problems with printk this will cast it to unsigned long long */ 41 - #define _LLU(x) (unsigned long long)(x) 42 - 43 - #define ORE_DBGMSG2(M...) do {} while (0) 44 - /* #define ORE_DBGMSG2 ORE_DBGMSG */ 29 + #include "ore_raid.h" 45 30 46 31 MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); 47 32 MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); ··· 118 133 return ore_comp_dev(ios->oc, index); 119 134 } 120 135 121 - static int _get_io_state(struct ore_layout *layout, 122 - struct ore_components *oc, unsigned numdevs, 123 - struct ore_io_state **pios) 136 + static int _ore_get_io_state(struct ore_layout *layout, 137 + struct ore_components *oc, unsigned numdevs, 138 + unsigned sgs_per_dev, unsigned num_par_pages, 139 + struct ore_io_state **pios) 124 140 { 125 141 struct ore_io_state *ios; 142 + struct page **pages; 143 + struct osd_sg_entry *sgilist; 144 + struct __alloc_all_io_state { 145 + struct ore_io_state ios; 146 + struct ore_per_dev_state per_dev[numdevs]; 147 + union { 148 + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; 149 + struct page *pages[num_par_pages]; 150 + }; 151 + } *_aios; 126 152 127 - /*TODO: Maybe use kmem_cach per sbi of size 128 - * exofs_io_state_size(layout->s_numdevs) 129 - */ 130 - ios = kzalloc(ore_io_state_size(numdevs), GFP_KERNEL); 131 - if (unlikely(!ios)) { 132 - ORE_DBGMSG("Failed kzalloc bytes=%d\n", 133 - ore_io_state_size(numdevs)); 134 - *pios = NULL; 135 - return -ENOMEM; 153 + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { 154 + _aios = 
kzalloc(sizeof(*_aios), GFP_KERNEL); 155 + if (unlikely(!_aios)) { 156 + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", 157 + sizeof(*_aios)); 158 + *pios = NULL; 159 + return -ENOMEM; 160 + } 161 + pages = num_par_pages ? _aios->pages : NULL; 162 + sgilist = sgs_per_dev ? _aios->sglist : NULL; 163 + ios = &_aios->ios; 164 + } else { 165 + struct __alloc_small_io_state { 166 + struct ore_io_state ios; 167 + struct ore_per_dev_state per_dev[numdevs]; 168 + } *_aio_small; 169 + union __extra_part { 170 + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; 171 + struct page *pages[num_par_pages]; 172 + } *extra_part; 173 + 174 + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); 175 + if (unlikely(!_aio_small)) { 176 + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", 177 + sizeof(*_aio_small)); 178 + *pios = NULL; 179 + return -ENOMEM; 180 + } 181 + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); 182 + if (unlikely(!extra_part)) { 183 + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", 184 + sizeof(*extra_part)); 185 + kfree(_aio_small); 186 + *pios = NULL; 187 + return -ENOMEM; 188 + } 189 + 190 + pages = num_par_pages ? extra_part->pages : NULL; 191 + sgilist = sgs_per_dev ? 
extra_part->sglist : NULL; 192 + /* In this case the per_dev[0].sgilist holds the pointer to 193 + * be freed 194 + */ 195 + ios = &_aio_small->ios; 196 + ios->extra_part_alloc = true; 197 + } 198 + 199 + if (pages) { 200 + ios->parity_pages = pages; 201 + ios->max_par_pages = num_par_pages; 202 + } 203 + if (sgilist) { 204 + unsigned d; 205 + 206 + for (d = 0; d < numdevs; ++d) { 207 + ios->per_dev[d].sglist = sgilist; 208 + sgilist += sgs_per_dev; 209 + } 210 + ios->sgs_per_dev = sgs_per_dev; 136 211 } 137 212 138 213 ios->layout = layout; ··· 223 178 { 224 179 struct ore_io_state *ios; 225 180 unsigned numdevs = layout->group_width * layout->mirrors_p1; 181 + unsigned sgs_per_dev = 0, max_par_pages = 0; 226 182 int ret; 227 183 228 - ret = _get_io_state(layout, oc, numdevs, pios); 184 + if (layout->parity && length) { 185 + unsigned data_devs = layout->group_width - layout->parity; 186 + unsigned stripe_size = layout->stripe_unit * data_devs; 187 + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; 188 + u32 remainder; 189 + u64 num_stripes; 190 + u64 num_raid_units; 191 + 192 + num_stripes = div_u64_rem(length, stripe_size, &remainder); 193 + if (remainder) 194 + ++num_stripes; 195 + 196 + num_raid_units = num_stripes * layout->parity; 197 + 198 + if (is_reading) { 199 + /* For reads add per_dev sglist array */ 200 + /* TODO: Raid 6 we need twice more. Actually: 201 + * num_stripes / LCMdP(W,P); 202 + * if (W%P != 0) num_stripes *= parity; 203 + */ 204 + 205 + /* first/last seg is split */ 206 + num_raid_units += layout->group_width; 207 + sgs_per_dev = div_u64(num_raid_units, data_devs); 208 + } else { 209 + /* For Writes add parity pages array. 
*/ 210 + max_par_pages = num_raid_units * pages_in_unit * 211 + sizeof(struct page *); 212 + } 213 + } 214 + 215 + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, 216 + pios); 229 217 if (unlikely(ret)) 230 218 return ret; 231 219 ··· 267 189 ios->offset = offset; 268 190 269 191 if (length) { 270 - ore_calc_stripe_info(layout, offset, &ios->si); 271 - ios->length = (length <= ios->si.group_length) ? length : 272 - ios->si.group_length; 192 + ore_calc_stripe_info(layout, offset, length, &ios->si); 193 + ios->length = ios->si.length; 273 194 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; 195 + if (layout->parity) 196 + _ore_post_alloc_raid_stuff(ios); 274 197 } 275 198 276 199 return 0; ··· 288 209 int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, 289 210 struct ore_io_state **pios) 290 211 { 291 - return _get_io_state(layout, oc, oc->numdevs, pios); 212 + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); 292 213 } 293 214 EXPORT_SYMBOL(ore_get_io_state); 294 215 ··· 306 227 bio_put(per_dev->bio); 307 228 } 308 229 230 + _ore_free_raid_stuff(ios); 309 231 kfree(ios); 310 232 } 311 233 } ··· 447 367 /* 448 368 * L - logical offset into the file 449 369 * 450 - * U - The number of bytes in a stripe within a group 370 + * D - number of Data devices 371 + * D = group_width - parity 451 372 * 452 - * U = stripe_unit * group_width 373 + * U - The number of bytes in a stripe within a group 374 + * U = stripe_unit * D 453 375 * 454 376 * T - The number of bytes striped within a group of component objects 455 377 * (before advancing to the next group) 456 - * 457 - * T = stripe_unit * group_width * group_depth 378 + * T = U * group_depth 458 379 * 459 380 * S - The number of bytes striped across all component objects 460 381 * before the pattern repeats 382 + * S = T * group_count 461 383 * 462 - * S = stripe_unit * group_width * group_depth * group_count 463 - * 464 - * M - The "major" (i.e., across 
all components) stripe number 465 - * 384 + * M - The "major" (i.e., across all components) cycle number 466 385 * M = L / S 467 386 * 468 - * G - Counts the groups from the beginning of the major stripe 469 - * 387 + * G - Counts the groups from the beginning of the major cycle 470 388 * G = (L - (M * S)) / T [or (L % S) / T] 471 389 * 472 390 * H - The byte offset within the group 473 - * 474 391 * H = (L - (M * S)) % T [or (L % S) % T] 475 392 * 476 393 * N - The "minor" (i.e., across the group) stripe number 477 - * 478 394 * N = H / U 479 395 * 480 396 * C - The component index coresponding to L 481 397 * 482 - * C = (H - (N * U)) / stripe_unit + G * group_width 483 - * [or (L % U) / stripe_unit + G * group_width] 398 + * C = (H - (N * U)) / stripe_unit + G * D 399 + * [or (L % U) / stripe_unit + G * D] 484 400 * 485 401 * O - The component offset coresponding to L 486 - * 487 402 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit 403 + * 404 + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity 405 + * divide by parity 406 + * LCMdP = lcm(group_width, parity) / parity 407 + * 408 + * R - The parity Rotation stripe 409 + * (Note parity cycle always starts at a group's boundary) 410 + * R = N % LCMdP 411 + * 412 + * I = the first parity device index 413 + * I = (group_width + group_width - R*parity - parity) % group_width 414 + * 415 + * Craid - The component index Rotated 416 + * Craid = (group_width + C - R*parity) % group_width 417 + * (We add the group_width to avoid negative numbers modulo math) 488 418 */ 489 419 void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, 490 - struct ore_striping_info *si) 420 + u64 length, struct ore_striping_info *si) 491 421 { 492 422 u32 stripe_unit = layout->stripe_unit; 493 423 u32 group_width = layout->group_width; 494 424 u64 group_depth = layout->group_depth; 425 + u32 parity = layout->parity; 495 426 496 - u32 U = stripe_unit * group_width; 427 + u32 D = 
group_width - parity; 428 + u32 U = D * stripe_unit; 497 429 u64 T = U * group_depth; 498 430 u64 S = T * layout->group_count; 499 431 u64 M = div64_u64(file_offset, S); ··· 521 429 u32 N = div_u64(H, U); 522 430 523 431 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 524 - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; 525 - si->dev *= layout->mirrors_p1; 432 + u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; 526 433 527 434 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 528 435 529 436 si->obj_offset = si->unit_off + (N * stripe_unit) + 530 437 (M * group_depth * stripe_unit); 531 438 532 - si->group_length = T - H; 439 + if (parity) { 440 + u32 LCMdP = lcm(group_width, parity) / parity; 441 + /* R = N % LCMdP; */ 442 + u32 RxP = (N % LCMdP) * parity; 443 + u32 first_dev = C - C % group_width; 444 + 445 + si->par_dev = (group_width + group_width - parity - RxP) % 446 + group_width + first_dev; 447 + si->dev = (group_width + C - RxP) % group_width + first_dev; 448 + si->bytes_in_stripe = U; 449 + si->first_stripe_start = M * S + G * T + N * U; 450 + } else { 451 + /* Make the math correct see _prepare_one_group */ 452 + si->par_dev = group_width; 453 + si->dev = C; 454 + } 455 + 456 + si->dev *= layout->mirrors_p1; 457 + si->par_dev *= layout->mirrors_p1; 458 + si->offset = file_offset; 459 + si->length = T - H; 460 + if (si->length > length) 461 + si->length = length; 533 462 si->M = M; 534 463 } 535 464 EXPORT_SYMBOL(ore_calc_stripe_info); 536 465 537 - static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, 538 - unsigned pgbase, struct ore_per_dev_state *per_dev, 539 - int cur_len) 466 + int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, 467 + unsigned pgbase, struct page **pages, 468 + struct ore_per_dev_state *per_dev, int cur_len) 540 469 { 541 470 unsigned pg = *cur_pg; 542 471 struct request_queue *q = ··· 568 455 if (per_dev->bio == NULL) { 569 456 unsigned pages_in_stripe = 
ios->layout->group_width * 570 457 (ios->layout->stripe_unit / PAGE_SIZE); 571 - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / 572 - ios->layout->group_width; 458 + unsigned nr_pages = ios->nr_pages * ios->layout->group_width / 459 + (ios->layout->group_width - 460 + ios->layout->parity); 461 + unsigned bio_size = (nr_pages + pages_in_stripe) / 462 + ios->layout->group_width; 573 463 574 464 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 575 465 if (unlikely(!per_dev->bio)) { ··· 587 471 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); 588 472 unsigned added_len; 589 473 590 - BUG_ON(ios->nr_pages <= pg); 591 474 cur_len -= pglen; 592 475 593 - added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], 476 + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], 594 477 pglen, pgbase); 595 478 if (unlikely(pglen != added_len)) { 479 + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", 480 + per_dev->bio->bi_vcnt); 596 481 ret = -ENOMEM; 597 482 goto out; 598 483 } ··· 618 501 struct ore_striping_info *si = &ios->si; 619 502 unsigned stripe_unit = ios->layout->stripe_unit; 620 503 unsigned mirrors_p1 = ios->layout->mirrors_p1; 621 - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; 504 + unsigned group_width = ios->layout->group_width; 505 + unsigned devs_in_group = group_width * mirrors_p1; 622 506 unsigned dev = si->dev; 623 507 unsigned first_dev = dev - (dev % devs_in_group); 508 + unsigned dev_order; 624 509 unsigned cur_pg = ios->pages_consumed; 625 510 u64 length = ios->length; 626 511 int ret = 0; ··· 632 513 return 0; 633 514 } 634 515 635 - BUG_ON(length > si->group_length); 516 + BUG_ON(length > si->length); 517 + 518 + dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); 519 + si->cur_comp = dev_order; 636 520 637 521 while (length) { 638 522 unsigned comp = dev - first_dev; ··· 644 522 645 523 if (!per_dev->length) { 646 524 per_dev->dev = dev; 647 - if (dev < si->dev) { 648 - per_dev->offset = 
si->obj_offset + stripe_unit - 649 - si->unit_off; 650 - cur_len = stripe_unit; 651 - } else if (dev == si->dev) { 525 + if (dev == si->dev) { 526 + WARN_ON(dev == si->par_dev); 652 527 per_dev->offset = si->obj_offset; 653 528 cur_len = stripe_unit - si->unit_off; 654 529 page_off = si->unit_off & ~PAGE_MASK; 655 530 BUG_ON(page_off && (page_off != ios->pgbase)); 656 - } else { /* dev > si->dev */ 657 - per_dev->offset = si->obj_offset - si->unit_off; 531 + } else { 532 + if (si->cur_comp > dev_order) 533 + per_dev->offset = 534 + si->obj_offset - si->unit_off; 535 + else /* si->cur_comp < dev_order */ 536 + per_dev->offset = 537 + si->obj_offset + stripe_unit - 538 + si->unit_off; 658 539 cur_len = stripe_unit; 659 540 } 660 541 } else { ··· 666 541 if (cur_len >= length) 667 542 cur_len = length; 668 543 669 - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, 670 - cur_len); 544 + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, 545 + per_dev, cur_len); 671 546 if (unlikely(ret)) 672 547 goto out; 673 548 ··· 675 550 dev = (dev % devs_in_group) + first_dev; 676 551 677 552 length -= cur_len; 553 + 554 + si->cur_comp = (si->cur_comp + 1) % group_width; 555 + if (unlikely((dev == si->par_dev) || 556 + (!length && ios->parity_pages))) { 557 + if (!length) 558 + /* If we are writing and this is the very last 559 + * stripe. then operate on parity dev. 560 + */ 561 + dev = si->par_dev; 562 + if (ios->reading) 563 + /* In writes cur_len just means if it's the 564 + * last one. See _ore_add_parity_unit. 565 + */ 566 + cur_len = length; 567 + per_dev = &ios->per_dev[dev - first_dev]; 568 + if (!per_dev->length) { 569 + /* Only/always the parity unit of the first 570 + * stripe will be empty. So this is a chance to 571 + * initialize the per_dev info. 
572 + */ 573 + per_dev->dev = dev; 574 + per_dev->offset = si->obj_offset - si->unit_off; 575 + } 576 + 577 + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); 578 + if (unlikely(ret)) 579 + goto out; 580 + 581 + /* Rotate next par_dev backwards with wraping */ 582 + si->par_dev = (devs_in_group + si->par_dev - 583 + ios->layout->parity * mirrors_p1) % 584 + devs_in_group + first_dev; 585 + /* Next stripe, start fresh */ 586 + si->cur_comp = 0; 587 + } 678 588 } 679 589 out: 680 590 ios->numdevs = devs_in_group; ··· 907 747 per_dev->or = or; 908 748 909 749 if (ios->pages) { 910 - osd_req_read(or, obj, per_dev->offset, 911 - per_dev->bio, per_dev->length); 750 + if (per_dev->cur_sg) { 751 + /* finalize the last sg_entry */ 752 + _ore_add_sg_seg(per_dev, 0, false); 753 + if (unlikely(!per_dev->cur_sg)) 754 + return 0; /* Skip parity only device */ 755 + 756 + osd_req_read_sg(or, obj, per_dev->bio, 757 + per_dev->sglist, per_dev->cur_sg); 758 + } else { 759 + /* The no raid case */ 760 + osd_req_read(or, obj, per_dev->offset, 761 + per_dev->bio, per_dev->length); 762 + } 763 + 912 764 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 913 - " dev=%d\n", _LLU(obj->id), 765 + " dev=%d sg_len=%d\n", _LLU(obj->id), 914 766 _LLU(per_dev->offset), _LLU(per_dev->length), 915 - first_dev); 767 + first_dev, per_dev->cur_sg); 916 768 } else { 917 769 BUG_ON(ios->kern_buff); 918 770 ··· 1021 849 { 1022 850 unsigned stripe_unit = layout->stripe_unit; 1023 851 1024 - ore_calc_stripe_info(layout, file_offset, &ti->si); 852 + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); 1025 853 1026 854 ti->prev_group_obj_off = ti->si.M * stripe_unit; 1027 855 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
+140
fs/exofs/ore_raid.c
··· 1 + /* 2 + * Copyright (C) 2011 3 + * Boaz Harrosh <bharrosh@panasas.com> 4 + * 5 + * This file is part of the objects raid engine (ore). 6 + * 7 + * It is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as published 9 + * by the Free Software Foundation. 10 + * 11 + * You should have received a copy of the GNU General Public License 12 + * along with "ore". If not, write to the Free Software Foundation, Inc: 13 + * "Free Software Foundation <info@fsf.org>" 14 + */ 15 + 16 + #include <linux/gfp.h> 17 + 18 + #include "ore_raid.h" 19 + 20 + struct page *_raid_page_alloc(void) 21 + { 22 + return alloc_page(GFP_KERNEL); 23 + } 24 + 25 + void _raid_page_free(struct page *p) 26 + { 27 + __free_page(p); 28 + } 29 + 30 + void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, 31 + bool not_last) 32 + { 33 + struct osd_sg_entry *sge; 34 + 35 + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " 36 + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", 37 + per_dev->dev, cur_len, not_last, per_dev->cur_sg, 38 + _LLU(per_dev->offset), per_dev->length, 39 + per_dev->last_sgs_total); 40 + 41 + if (!per_dev->cur_sg) { 42 + sge = per_dev->sglist; 43 + 44 + /* First time we prepare two entries */ 45 + if (per_dev->length) { 46 + ++per_dev->cur_sg; 47 + sge->offset = per_dev->offset; 48 + sge->len = per_dev->length; 49 + } else { 50 + /* Here the parity is the first unit of this object. 51 + * This happens every time we reach a parity device on 52 + * the same stripe as the per_dev->offset. We need to 53 + * just skip this unit. 
54 + */ 55 + per_dev->offset += cur_len; 56 + return; 57 + } 58 + } else { 59 + /* finalize the last one */ 60 + sge = &per_dev->sglist[per_dev->cur_sg - 1]; 61 + sge->len = per_dev->length - per_dev->last_sgs_total; 62 + } 63 + 64 + if (not_last) { 65 + /* Partly prepare the next one */ 66 + struct osd_sg_entry *next_sge = sge + 1; 67 + 68 + ++per_dev->cur_sg; 69 + next_sge->offset = sge->offset + sge->len + cur_len; 70 + /* Save cur len so we know how mutch was added next time */ 71 + per_dev->last_sgs_total = per_dev->length; 72 + next_sge->len = 0; 73 + } else if (!sge->len) { 74 + /* Optimize for when the last unit is a parity */ 75 + --per_dev->cur_sg; 76 + } 77 + } 78 + 79 + /* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ 80 + int _ore_add_parity_unit(struct ore_io_state *ios, 81 + struct ore_striping_info *si, 82 + struct ore_per_dev_state *per_dev, 83 + unsigned cur_len) 84 + { 85 + if (ios->reading) { 86 + BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); 87 + _ore_add_sg_seg(per_dev, cur_len, true); 88 + } else { 89 + struct page **pages = ios->parity_pages + ios->cur_par_page; 90 + unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE; 91 + unsigned array_start = 0; 92 + unsigned i; 93 + int ret; 94 + 95 + for (i = 0; i < num_pages; i++) { 96 + pages[i] = _raid_page_alloc(); 97 + if (unlikely(!pages[i])) 98 + return -ENOMEM; 99 + 100 + ++(ios->cur_par_page); 101 + /* TODO: only read support for now */ 102 + clear_highpage(pages[i]); 103 + } 104 + 105 + ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d", 106 + per_dev->dev, num_pages, ios->cur_par_page); 107 + 108 + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, 109 + per_dev, num_pages * PAGE_SIZE); 110 + if (unlikely(ret)) 111 + return ret; 112 + } 113 + return 0; 114 + } 115 + 116 + int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) 117 + { 118 + /*TODO: Only raid writes has stuff to add here */ 119 + return 0; 120 + } 121 + 122 + void 
_ore_free_raid_stuff(struct ore_io_state *ios) 123 + { 124 + if (ios->parity_pages) { /* writing and raid */ 125 + unsigned i; 126 + 127 + for (i = 0; i < ios->cur_par_page; i++) { 128 + struct page *page = ios->parity_pages[i]; 129 + 130 + if (page) 131 + _raid_page_free(page); 132 + } 133 + if (ios->extra_part_alloc) 134 + kfree(ios->parity_pages); 135 + } else { 136 + /* Will only be set if raid reading && sglist is big */ 137 + if (ios->extra_part_alloc) 138 + kfree(ios->per_dev[0].sglist); 139 + } 140 + }
+64
fs/exofs/ore_raid.h
··· 1 + /* 2 + * Copyright (C) from 2011 3 + * Boaz Harrosh <bharrosh@panasas.com> 4 + * 5 + * This file is part of the objects raid engine (ore). 6 + * 7 + * It is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License version 2 as published 9 + * by the Free Software Foundation. 10 + * 11 + * You should have received a copy of the GNU General Public License 12 + * along with "ore". If not, write to the Free Software Foundation, Inc: 13 + * "Free Software Foundation <info@fsf.org>" 14 + */ 15 + 16 + #include <scsi/osd_ore.h> 17 + 18 + #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) 19 + 20 + #ifdef CONFIG_EXOFS_DEBUG 21 + #define ORE_DBGMSG(fmt, a...) \ 22 + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) 23 + #else 24 + #define ORE_DBGMSG(fmt, a...) \ 25 + do { if (0) printk(fmt, ##a); } while (0) 26 + #endif 27 + 28 + /* u64 has problems with printk this will cast it to unsigned long long */ 29 + #define _LLU(x) (unsigned long long)(x) 30 + 31 + #define ORE_DBGMSG2(M...) do {} while (0) 32 + /* #define ORE_DBGMSG2 ORE_DBGMSG */ 33 + 34 + /* Calculate the component order in a stripe. eg the logical data unit 35 + * address within the stripe of @dev given the @par_dev of this stripe. 
36 + */ 37 + static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1, 38 + unsigned par_dev, unsigned dev) 39 + { 40 + unsigned first_dev = dev - dev % devs_in_group; 41 + 42 + dev -= first_dev; 43 + par_dev -= first_dev; 44 + 45 + if (devs_in_group == par_dev) /* The raid 0 case */ 46 + return dev / mirrors_p1; 47 + /* raid4/5/6 case */ 48 + return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) / 49 + mirrors_p1; 50 + } 51 + 52 + /* ios_raid.c stuff needed by ios.c */ 53 + int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); 54 + void _ore_free_raid_stuff(struct ore_io_state *ios); 55 + 56 + void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, 57 + bool not_last); 58 + int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, 59 + struct ore_per_dev_state *per_dev, unsigned cur_len); 60 + 61 + /* ios.c stuff needed by ios_raid.c */ 62 + int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, 63 + unsigned pgbase, struct page **pages, 64 + struct ore_per_dev_state *per_dev, int cur_len);
+18 -3
include/scsi/osd_ore.h
··· 40 40 unsigned mirrors_p1; 41 41 42 42 unsigned group_width; 43 + unsigned parity; 43 44 u64 group_depth; 44 45 unsigned group_count; 45 46 ··· 90 89 } 91 90 92 91 struct ore_striping_info { 92 + u64 offset; 93 93 u64 obj_offset; 94 - u64 group_length; 94 + u64 length; 95 + u64 first_stripe_start; /* only used in raid writes */ 95 96 u64 M; /* for truncate */ 97 + unsigned bytes_in_stripe; 96 98 unsigned dev; 99 + unsigned par_dev; 97 100 unsigned unit_off; 101 + unsigned cur_comp; 98 102 }; 99 103 100 104 struct ore_io_state; ··· 133 127 134 128 bool reading; 135 129 130 + /* House keeping of Parity pages */ 131 + bool extra_part_alloc; 132 + struct page **parity_pages; 133 + unsigned max_par_pages; 134 + unsigned cur_par_page; 135 + unsigned sgs_per_dev; 136 + 136 137 /* Variable array of size numdevs */ 137 138 unsigned numdevs; 138 139 struct ore_per_dev_state { ··· 147 134 struct bio *bio; 148 135 loff_t offset; 149 136 unsigned length; 137 + unsigned last_sgs_total; 150 138 unsigned dev; 139 + struct osd_sg_entry *sglist; 140 + unsigned cur_sg; 151 141 } per_dev[]; 152 142 }; 153 143 ··· 163 147 /* ore.c */ 164 148 int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); 165 149 void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, 166 - struct ore_striping_info *si); 167 - 150 + u64 length, struct ore_striping_info *si); 168 151 int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, 169 152 bool is_reading, u64 offset, u64 length, 170 153 struct ore_io_state **ios);