ore: RAID5 Write · tjh.dev/kernel@769ba8d

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

ore: RAID5 Write

This is finally the RAID5 Write support.

The bigger part of this patch is not the XOR engine itself, but the
read4write logic, which is a complete mini prepare_for_striping
reading engine that can read scattered pages of a stripe into cache
so they can be used for the XOR calculation. This is only needed if the
write was not stripe aligned.

The main algorithm behind the XOR engine is the 2 dimensional array:
struct __stripe_pages_2d.
A drawing might save 1000 words
---

  __stripe_pages_2d
        |
   n = pages_in_stripe_unit;
   w = group_width - parity;
        |                         pages array presented to the XOR lib
        |                                               |
        V                                               |
 __1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---|
        |                                               |
 __1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <---
        |
...     |                                ...
        |
 __1_page_stripe[n].pages --> [c0][c1]..[cw][c_par]
                               ^
                               |
           data added columns first then row

---
The pages are put on this array columns first, i.e.:
p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ...
So we are doing a corner turn of the pages.

Note that pages will zigzag down and left, but are put sequentially
in growing order. So when the time comes to XOR the stripe, only the
beginning and end of the array need be checked. We scan the array
and any NULL spot will be filled by pages-to-be-read.

The FS that wants to support RAID5 needs to supply an
operations-vector that searches for a given page in cache, and specifies
if the page is uptodate or needs reading. All these pages to be read
are put on a slave ore_io_state and synchronously read. All the pages
of a stripe are read in one IO, using the scatter gather mechanism.

In write we constrain our IO to only be incomplete on a single
stripe. Meaning either the complete IO is within a single stripe, so
we might have pages to read at both the beginning and the end of the
stripe, or we have some reading to do at the beginning but end at a stripe
boundary. The leftover pages are pushed to the next IO by the API
already established by previous work, where an IO offset/length
combination presented to the ORE might get the length truncated and
the user must re-submit the leftover pages. (Both exofs and NFS
support this.)

But any ORE user should make its best effort to align its IO
beforehand and avoid complications. A cached ore_layout->stripe_size
member can be used for that calculation. (NOTE: ORE demands
that stripe_size not be bigger than 32 bits.)

What else? Well read it and tell me.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>

Boaz Harrosh 14 years ago 769ba8d9 a1fec1db

+587 -16

5 changed files

expand all

exofs

Kconfig

ore.c

ore_raid.c

ore_raid.h

include

scsi

osd_ore.h

+8 -1

fs/exofs/Kconfig

··· 1 + # Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects 2 + # for every ORE user we do it like this. Any user should add itself here 3 + # at the "depends on EXOFS_FS || ..." with an ||. The dependencies are 4 + # selected here, and we default to "ON". So in effect it is like been 5 + # selected by any of the users. 1 6 config ORE 2 7 tristate 8 + depends on EXOFS_FS 9 + select ASYNC_XOR 10 + default SCSI_OSD_ULD 3 11 4 12 config EXOFS_FS 5 13 tristate "exofs: OSD based file system support" 6 14 depends on SCSI_OSD_ULD 7 - select ORE 8 15 help 9 16 EXOFS is a file system that uses an OSD storage device, 10 17 as its backing storage.

+28 -8

fs/exofs/ore.c

··· 95 95 layout->max_io_length = 96 96 (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * 97 97 layout->group_width; 98 + if (layout->parity) { 99 + unsigned stripe_length = 100 + (layout->group_width - layout->parity) * 101 + layout->stripe_unit; 102 + 103 + layout->max_io_length /= stripe_length; 104 + layout->max_io_length *= stripe_length; 105 + } 98 106 return 0; 99 107 } 100 108 EXPORT_SYMBOL(ore_verify_layout); ··· 126 118 return ore_comp_dev(ios->oc, index); 127 119 } 128 120 129 - static int _ore_get_io_state(struct ore_layout *layout, 121 + int _ore_get_io_state(struct ore_layout *layout, 130 122 struct ore_components *oc, unsigned numdevs, 131 123 unsigned sgs_per_dev, unsigned num_par_pages, 132 124 struct ore_io_state **pios) ··· 342 334 kref_put(&ios->kref, _last_io); 343 335 } 344 336 345 - static int ore_io_execute(struct ore_io_state *ios) 337 + int ore_io_execute(struct ore_io_state *ios) 346 338 { 347 339 DECLARE_COMPLETION_ONSTACK(wait); 348 340 bool sync = (ios->done == NULL); ··· 605 597 ret = -ENOMEM; 606 598 goto out; 607 599 } 600 + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); 601 + 608 602 pgbase = 0; 609 603 ++pg; 610 604 } ··· 646 636 647 637 dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); 648 638 si->cur_comp = dev_order; 639 + si->cur_pg = si->unit_off / PAGE_SIZE; 649 640 650 641 while (length) { 651 642 unsigned comp = dev - first_dev; ··· 688 677 length -= cur_len; 689 678 690 679 si->cur_comp = (si->cur_comp + 1) % group_width; 691 - if (unlikely((dev == si->par_dev) || 692 - (!length && ios->parity_pages))) { 693 - if (!length) 680 + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { 681 + if (!length && ios->sp2d) { 694 682 /* If we are writing and this is the very last 695 683 * stripe. then operate on parity dev. 696 684 */ 697 685 dev = si->par_dev; 698 - if (ios->reading) 686 + } 687 + if (ios->sp2d) 699 688 /* In writes cur_len just means if it's the 700 689 * last one. 
See _ore_add_parity_unit. 701 690 */ ··· 720 709 devs_in_group + first_dev; 721 710 /* Next stripe, start fresh */ 722 711 si->cur_comp = 0; 712 + si->cur_pg = 0; 723 713 } 724 714 } 725 715 out: ··· 885 873 int i; 886 874 int ret; 887 875 876 + if (unlikely(ios->sp2d && !ios->r4w)) { 877 + /* A library is attempting a RAID-write without providing 878 + * a pages lock interface. 879 + */ 880 + WARN_ON_ONCE(1); 881 + return -ENOTSUPP; 882 + } 883 + 888 884 ret = _prepare_for_striping(ios); 889 885 if (unlikely(ret)) 890 886 return ret; ··· 908 888 } 909 889 EXPORT_SYMBOL(ore_write); 910 890 911 - static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) 891 + int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) 912 892 { 913 893 struct osd_request *or; 914 894 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; ··· 972 952 return ret; 973 953 974 954 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 975 - ret = _read_mirror(ios, i); 955 + ret = _ore_read_mirror(ios, i); 976 956 if (unlikely(ret)) 977 957 return ret; 978 958 }

+527 -7

fs/exofs/ore_raid.c

··· 14 14 */ 15 15 16 16 #include <linux/gfp.h> 17 + #include <linux/async_tx.h> 17 18 18 19 #include "ore_raid.h" 20 + 21 + #undef ORE_DBGMSG2 22 + #define ORE_DBGMSG2 ORE_DBGMSG 19 23 20 24 struct page *_raid_page_alloc(void) 21 25 { ··· 29 25 void _raid_page_free(struct page *p) 30 26 { 31 27 __free_page(p); 28 + } 29 + 30 + /* This struct is forward declare in ore_io_state, but is private to here. 31 + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. 32 + * 33 + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. 34 + * Ascending page index access is sp2d(p-minor, c-major). But storage is 35 + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor 36 + * API. 37 + */ 38 + struct __stripe_pages_2d { 39 + /* Cache some hot path repeated calculations */ 40 + unsigned parity; 41 + unsigned data_devs; 42 + unsigned pages_in_unit; 43 + 44 + bool needed ; 45 + 46 + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ 47 + struct __1_page_stripe { 48 + bool alloc; 49 + unsigned write_count; 50 + struct async_submit_ctl submit; 51 + struct dma_async_tx_descriptor *tx; 52 + 53 + /* The size of this array is data_devs + parity */ 54 + struct page **pages; 55 + struct page **scribble; 56 + /* bool array, size of this array is data_devs */ 57 + char *page_is_read; 58 + } _1p_stripes[]; 59 + }; 60 + 61 + /* This can get bigger then a page. So support multiple page allocations 62 + * _sp2d_free should be called even if _sp2d_alloc fails (by returning 63 + * none-zero). 
64 + */ 65 + static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, 66 + unsigned parity, struct __stripe_pages_2d **psp2d) 67 + { 68 + struct __stripe_pages_2d *sp2d; 69 + unsigned data_devs = group_width - parity; 70 + struct _alloc_all_bytes { 71 + struct __alloc_stripe_pages_2d { 72 + struct __stripe_pages_2d sp2d; 73 + struct __1_page_stripe _1p_stripes[pages_in_unit]; 74 + } __asp2d; 75 + struct __alloc_1p_arrays { 76 + struct page *pages[group_width]; 77 + struct page *scribble[group_width]; 78 + char page_is_read[data_devs]; 79 + } __a1pa[pages_in_unit]; 80 + } *_aab; 81 + struct __alloc_1p_arrays *__a1pa; 82 + struct __alloc_1p_arrays *__a1pa_end; 83 + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); 84 + unsigned num_a1pa, alloc_size, i; 85 + 86 + /* FIXME: check these numbers in ore_verify_layout */ 87 + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); 88 + BUG_ON(sizeof__a1pa > PAGE_SIZE); 89 + 90 + if (sizeof(*_aab) > PAGE_SIZE) { 91 + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; 92 + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; 93 + } else { 94 + num_a1pa = pages_in_unit; 95 + alloc_size = sizeof(*_aab); 96 + } 97 + 98 + _aab = kzalloc(alloc_size, GFP_KERNEL); 99 + if (unlikely(!_aab)) { 100 + ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); 101 + return -ENOMEM; 102 + } 103 + 104 + sp2d = &_aab->__asp2d.sp2d; 105 + *psp2d = sp2d; /* From here Just call _sp2d_free */ 106 + 107 + __a1pa = _aab->__a1pa; 108 + __a1pa_end = __a1pa + num_a1pa; 109 + 110 + for (i = 0; i < pages_in_unit; ++i) { 111 + if (unlikely(__a1pa >= __a1pa_end)) { 112 + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, 113 + pages_in_unit - i); 114 + 115 + __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); 116 + if (unlikely(!__a1pa)) { 117 + ORE_DBGMSG("!! 
Failed to _alloc_1p_arrays=%d\n", 118 + num_a1pa); 119 + return -ENOMEM; 120 + } 121 + __a1pa_end = __a1pa + num_a1pa; 122 + /* First *pages is marked for kfree of the buffer */ 123 + sp2d->_1p_stripes[i].alloc = true; 124 + } 125 + 126 + sp2d->_1p_stripes[i].pages = __a1pa->pages; 127 + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; 128 + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; 129 + ++__a1pa; 130 + } 131 + 132 + sp2d->parity = parity; 133 + sp2d->data_devs = data_devs; 134 + sp2d->pages_in_unit = pages_in_unit; 135 + return 0; 136 + } 137 + 138 + static void _sp2d_reset(struct __stripe_pages_2d *sp2d, 139 + const struct _ore_r4w_op *r4w, void *priv) 140 + { 141 + unsigned data_devs = sp2d->data_devs; 142 + unsigned group_width = data_devs + sp2d->parity; 143 + unsigned p; 144 + 145 + if (!sp2d->needed) 146 + return; 147 + 148 + for (p = 0; p < sp2d->pages_in_unit; p++) { 149 + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; 150 + 151 + if (_1ps->write_count < group_width) { 152 + unsigned c; 153 + 154 + for (c = 0; c < data_devs; c++) 155 + if (_1ps->page_is_read[c]) { 156 + struct page *page = _1ps->pages[c]; 157 + 158 + r4w->put_page(priv, page); 159 + _1ps->page_is_read[c] = false; 160 + } 161 + } 162 + 163 + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); 164 + _1ps->write_count = 0; 165 + _1ps->tx = NULL; 166 + } 167 + 168 + sp2d->needed = false; 169 + } 170 + 171 + static void _sp2d_free(struct __stripe_pages_2d *sp2d) 172 + { 173 + unsigned i; 174 + 175 + if (!sp2d) 176 + return; 177 + 178 + for (i = 0; i < sp2d->pages_in_unit; ++i) { 179 + if (sp2d->_1p_stripes[i].alloc) 180 + kfree(sp2d->_1p_stripes[i].pages); 181 + } 182 + 183 + kfree(sp2d); 184 + } 185 + 186 + static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) 187 + { 188 + unsigned p; 189 + 190 + for (p = 0; p < sp2d->pages_in_unit; p++) { 191 + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; 192 + 193 + if (_1ps->write_count) 194 + return p; 195 
+ } 196 + 197 + return ~0; 198 + } 199 + 200 + static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) 201 + { 202 + unsigned p; 203 + 204 + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { 205 + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; 206 + 207 + if (_1ps->write_count) 208 + return p; 209 + } 210 + 211 + return ~0; 212 + } 213 + 214 + static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) 215 + { 216 + unsigned p; 217 + for (p = 0; p < sp2d->pages_in_unit; p++) { 218 + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; 219 + 220 + if (!_1ps->write_count) 221 + continue; 222 + 223 + init_async_submit(&_1ps->submit, 224 + ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK, 225 + NULL, 226 + NULL, NULL, 227 + (addr_conv_t *)_1ps->scribble); 228 + 229 + /* TODO: raid6 */ 230 + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages, 231 + 0, sp2d->data_devs, PAGE_SIZE, 232 + &_1ps->submit); 233 + } 234 + 235 + for (p = 0; p < sp2d->pages_in_unit; p++) { 236 + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; 237 + /* NOTE: We wait for HW synchronously (I don't have such HW 238 + * to test with.) Is parallelism needed with today's multi 239 + * cores? 
240 + */ 241 + async_tx_issue_pending(_1ps->tx); 242 + } 243 + } 244 + 245 + void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, 246 + struct ore_striping_info *si, struct page *page) 247 + { 248 + struct __1_page_stripe *_1ps; 249 + 250 + sp2d->needed = true; 251 + 252 + _1ps = &sp2d->_1p_stripes[si->cur_pg]; 253 + _1ps->pages[si->cur_comp] = page; 254 + ++_1ps->write_count; 255 + 256 + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; 257 + /* si->cur_comp is advanced outside at main loop */ 32 258 } 33 259 34 260 void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, ··· 310 76 } 311 77 } 312 78 79 + static int _alloc_read_4_write(struct ore_io_state *ios) 80 + { 81 + struct ore_layout *layout = ios->layout; 82 + int ret; 83 + /* We want to only read those pages not in cache so worst case 84 + * is a stripe populated with every other page 85 + */ 86 + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; 87 + 88 + ret = _ore_get_io_state(layout, ios->oc, 89 + layout->group_width * layout->mirrors_p1, 90 + sgs_per_dev, 0, &ios->ios_read_4_write); 91 + return ret; 92 + } 93 + 94 + /* @si contains info of the to-be-inserted page. Update of @si should be 95 + * maintained by caller. Specificaly si->dev, si->obj_offset, ... 
96 + */ 97 + static int _add_to_read_4_write(struct ore_io_state *ios, 98 + struct ore_striping_info *si, struct page *page) 99 + { 100 + struct request_queue *q; 101 + struct ore_per_dev_state *per_dev; 102 + struct ore_io_state *read_ios; 103 + unsigned first_dev = si->dev - (si->dev % 104 + (ios->layout->group_width * ios->layout->mirrors_p1)); 105 + unsigned comp = si->dev - first_dev; 106 + unsigned added_len; 107 + 108 + if (!ios->ios_read_4_write) { 109 + int ret = _alloc_read_4_write(ios); 110 + 111 + if (unlikely(ret)) 112 + return ret; 113 + } 114 + 115 + read_ios = ios->ios_read_4_write; 116 + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; 117 + 118 + per_dev = &read_ios->per_dev[comp]; 119 + if (!per_dev->length) { 120 + per_dev->bio = bio_kmalloc(GFP_KERNEL, 121 + ios->sp2d->pages_in_unit); 122 + if (unlikely(!per_dev->bio)) { 123 + ORE_DBGMSG("Failed to allocate BIO size=%u\n", 124 + ios->sp2d->pages_in_unit); 125 + return -ENOMEM; 126 + } 127 + per_dev->offset = si->obj_offset; 128 + per_dev->dev = si->dev; 129 + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { 130 + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); 131 + 132 + _ore_add_sg_seg(per_dev, gap, true); 133 + } 134 + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); 135 + added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); 136 + if (unlikely(added_len != PAGE_SIZE)) { 137 + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", 138 + per_dev->bio->bi_vcnt); 139 + return -ENOMEM; 140 + } 141 + 142 + per_dev->length += PAGE_SIZE; 143 + return 0; 144 + } 145 + 146 + static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) 147 + { 148 + struct bio_vec *bv; 149 + unsigned i, d; 150 + 151 + /* loop on all devices all pages */ 152 + for (d = 0; d < ios->numdevs; d++) { 153 + struct bio *bio = ios->per_dev[d].bio; 154 + 155 + if (!bio) 156 + continue; 157 + 158 + __bio_for_each_segment(bv, 
bio, i, 0) { 159 + struct page *page = bv->bv_page; 160 + 161 + SetPageUptodate(page); 162 + if (PageError(page)) 163 + ClearPageError(page); 164 + } 165 + } 166 + } 167 + 168 + /* read_4_write is hacked to read the start of the first stripe and/or 169 + * the end of the last stripe. If needed, with an sg-gap at each device/page. 170 + * It is assumed to be called after the to_be_written pages of the first stripe 171 + * are populating ios->sp2d[][] 172 + * 173 + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations 174 + * These pages are held at sp2d[p].pages[c] but with 175 + * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are 176 + * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is 177 + * @uptodate=true, so we don't need to read it, only unlock, after IO. 178 + * 179 + * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then 180 + * to-be-written count, we should consider the xor-in-place mode. 181 + * need_to_read_pages_count is the actual number of pages not present in cache. 182 + * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough 183 + * approximation? In this mode the read pages are put in the empty places of 184 + * ios->sp2d[p][*], xor is calculated the same way. 
These pages are 185 + * allocated/freed and don't go through cache 186 + */ 187 + static int _read_4_write(struct ore_io_state *ios) 188 + { 189 + struct ore_io_state *ios_read; 190 + struct ore_striping_info read_si; 191 + struct __stripe_pages_2d *sp2d = ios->sp2d; 192 + u64 offset = ios->si.first_stripe_start; 193 + u64 last_stripe_end; 194 + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; 195 + unsigned i, c, p, min_p = sp2d->pages_in_unit, max_p = -1; 196 + int ret; 197 + 198 + if (offset == ios->offset) /* Go to start collect $200 */ 199 + goto read_last_stripe; 200 + 201 + min_p = _sp2d_min_pg(sp2d); 202 + max_p = _sp2d_max_pg(sp2d); 203 + 204 + for (c = 0; ; c++) { 205 + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); 206 + read_si.obj_offset += min_p * PAGE_SIZE; 207 + offset += min_p * PAGE_SIZE; 208 + for (p = min_p; p <= max_p; p++) { 209 + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; 210 + struct page **pp = &_1ps->pages[c]; 211 + bool uptodate; 212 + 213 + if (*pp) 214 + /* to-be-written pages start here */ 215 + goto read_last_stripe; 216 + 217 + *pp = ios->r4w->get_page(ios->private, offset, 218 + &uptodate); 219 + if (unlikely(!*pp)) 220 + return -ENOMEM; 221 + 222 + if (!uptodate) 223 + _add_to_read_4_write(ios, &read_si, *pp); 224 + 225 + /* Mark read-pages to be cache_released */ 226 + _1ps->page_is_read[c] = true; 227 + read_si.obj_offset += PAGE_SIZE; 228 + offset += PAGE_SIZE; 229 + } 230 + offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; 231 + } 232 + 233 + read_last_stripe: 234 + offset = ios->offset + (ios->length + PAGE_SIZE - 1) / 235 + PAGE_SIZE * PAGE_SIZE; 236 + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) 237 + * bytes_in_stripe; 238 + if (offset == last_stripe_end) /* Optimize for the aligned case */ 239 + goto read_it; 240 + 241 + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); 242 + p = read_si.unit_off / PAGE_SIZE; 243 + c = _dev_order(ios->layout->group_width * 
ios->layout->mirrors_p1, 244 + ios->layout->mirrors_p1, read_si.par_dev, read_si.dev); 245 + 246 + BUG_ON(ios->si.first_stripe_start + bytes_in_stripe != last_stripe_end); 247 + /* unaligned IO must be within a single stripe */ 248 + 249 + if (min_p == sp2d->pages_in_unit) { 250 + /* Didn't do it yet */ 251 + min_p = _sp2d_min_pg(sp2d); 252 + max_p = _sp2d_max_pg(sp2d); 253 + } 254 + 255 + while (offset < last_stripe_end) { 256 + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; 257 + 258 + if ((min_p <= p) && (p <= max_p)) { 259 + struct page *page; 260 + bool uptodate; 261 + 262 + BUG_ON(_1ps->pages[c]); 263 + page = ios->r4w->get_page(ios->private, offset, 264 + &uptodate); 265 + if (unlikely(!page)) 266 + return -ENOMEM; 267 + 268 + _1ps->pages[c] = page; 269 + /* Mark read-pages to be cache_released */ 270 + _1ps->page_is_read[c] = true; 271 + if (!uptodate) 272 + _add_to_read_4_write(ios, &read_si, page); 273 + } 274 + 275 + offset += PAGE_SIZE; 276 + if (p == (sp2d->pages_in_unit - 1)) { 277 + ++c; 278 + p = 0; 279 + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); 280 + } else { 281 + read_si.obj_offset += PAGE_SIZE; 282 + ++p; 283 + } 284 + } 285 + 286 + read_it: 287 + ios_read = ios->ios_read_4_write; 288 + if (!ios_read) 289 + return 0; 290 + 291 + /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change 292 + * to check for per_dev->bio 293 + */ 294 + ios_read->pages = ios->pages; 295 + 296 + /* Now read these devices */ 297 + for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { 298 + ret = _ore_read_mirror(ios_read, i); 299 + if (unlikely(ret)) 300 + return ret; 301 + } 302 + 303 + ret = ore_io_execute(ios_read); /* Synchronus execution */ 304 + if (unlikely(ret)) { 305 + ORE_DBGMSG("!! ore_io_execute => %d\n", ret); 306 + return ret; 307 + } 308 + 309 + _mark_read4write_pages_uptodate(ios_read, ret); 310 + return 0; 311 + } 312 + 313 313 /* In writes @cur_len means length left. 
.i.e cur_len==0 is the last parity U */ 314 314 int _ore_add_parity_unit(struct ore_io_state *ios, 315 315 struct ore_striping_info *si, ··· 554 86 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); 555 87 _ore_add_sg_seg(per_dev, cur_len, true); 556 88 } else { 89 + struct __stripe_pages_2d *sp2d = ios->sp2d; 557 90 struct page **pages = ios->parity_pages + ios->cur_par_page; 558 - unsigned num_pages = ios->layout->stripe_unit / PAGE_SIZE; 91 + unsigned num_pages; 559 92 unsigned array_start = 0; 560 93 unsigned i; 561 94 int ret; 95 + 96 + si->cur_pg = _sp2d_min_pg(sp2d); 97 + num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; 98 + 99 + if (!cur_len) /* If last stripe operate on parity comp */ 100 + si->cur_comp = sp2d->data_devs; 101 + 102 + if (!per_dev->length) { 103 + per_dev->offset += si->cur_pg * PAGE_SIZE; 104 + /* If first stripe, Read in all read4write pages 105 + * (if needed) before we calculate the first parity. 106 + */ 107 + _read_4_write(ios); 108 + } 562 109 563 110 for (i = 0; i < num_pages; i++) { 564 111 pages[i] = _raid_page_alloc(); ··· 581 98 return -ENOMEM; 582 99 583 100 ++(ios->cur_par_page); 584 - /* TODO: only read support for now */ 585 - clear_highpage(pages[i]); 586 101 } 587 102 588 - ORE_DBGMSG("writing dev=%d num_pages=%d cur_par_page=%d", 589 - per_dev->dev, num_pages, ios->cur_par_page); 103 + BUG_ON(si->cur_comp != sp2d->data_devs); 104 + BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); 590 105 591 106 ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, 592 107 per_dev, num_pages * PAGE_SIZE); 593 108 if (unlikely(ret)) 594 109 return ret; 110 + 111 + /* TODO: raid6 if (last_parity_dev) */ 112 + _gen_xor_unit(sp2d); 113 + _sp2d_reset(sp2d, ios->r4w, ios->private); 595 114 } 596 115 return 0; 597 116 } 598 117 599 118 int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) 600 119 { 601 - /*TODO: Only raid writes has stuff to add here */ 120 + struct ore_layout *layout = ios->layout; 121 + 122 + if (ios->parity_pages) { 123 + 
unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; 124 + unsigned stripe_size = ios->si.bytes_in_stripe; 125 + u64 last_stripe, first_stripe; 126 + 127 + if (_sp2d_alloc(pages_in_unit, layout->group_width, 128 + layout->parity, &ios->sp2d)) { 129 + return -ENOMEM; 130 + } 131 + 132 + BUG_ON(ios->offset % PAGE_SIZE); 133 + 134 + /* Round io down to last full strip */ 135 + first_stripe = div_u64(ios->offset, stripe_size); 136 + last_stripe = div_u64(ios->offset + ios->length, stripe_size); 137 + 138 + /* If an IO spans more then a single stripe it must end at 139 + * a stripe boundary. The reminder at the end is pushed into the 140 + * next IO. 141 + */ 142 + if (last_stripe != first_stripe) { 143 + ios->length = last_stripe * stripe_size - ios->offset; 144 + 145 + BUG_ON(!ios->length); 146 + ios->nr_pages = (ios->length + PAGE_SIZE - 1) / 147 + PAGE_SIZE; 148 + ios->si.length = ios->length; /*make it consistent */ 149 + } 150 + } 602 151 return 0; 603 152 } 604 153 605 154 void _ore_free_raid_stuff(struct ore_io_state *ios) 606 155 { 607 - if (ios->parity_pages) { /* writing and raid */ 156 + if (ios->sp2d) { /* writing and raid */ 608 157 unsigned i; 609 158 610 159 for (i = 0; i < ios->cur_par_page; i++) { ··· 647 132 } 648 133 if (ios->extra_part_alloc) 649 134 kfree(ios->parity_pages); 135 + /* If IO returned an error pages might need unlocking */ 136 + _sp2d_reset(ios->sp2d, ios->r4w, ios->private); 137 + _sp2d_free(ios->sp2d); 650 138 } else { 651 139 /* Will only be set if raid reading && sglist is big */ 652 140 if (ios->extra_part_alloc) 653 141 kfree(ios->per_dev[0].sglist); 654 142 } 143 + if (ios->ios_read_4_write) 144 + ore_put_io_state(ios->ios_read_4_write); 655 145 }

+15

fs/exofs/ore_raid.h

··· 57 57 bool not_last); 58 58 int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, 59 59 struct ore_per_dev_state *per_dev, unsigned cur_len); 60 + void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, 61 + struct ore_striping_info *si, struct page *page); 62 + static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, 63 + struct ore_striping_info *si, struct page *page) 64 + { 65 + if (!sp2d) /* Inline the fast path */ 66 + return; /* Hay no raid stuff */ 67 + _ore_add_stripe_page(sp2d, si, page); 68 + } 60 69 61 70 /* ios.c stuff needed by ios_raid.c */ 71 + int _ore_get_io_state(struct ore_layout *layout, 72 + struct ore_components *oc, unsigned numdevs, 73 + unsigned sgs_per_dev, unsigned num_par_pages, 74 + struct ore_io_state **pios); 62 75 int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, 63 76 unsigned pgbase, struct page **pages, 64 77 struct ore_per_dev_state *per_dev, int cur_len); 78 + int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); 79 + int ore_io_execute(struct ore_io_state *ios);

include/scsi/osd_ore.h

··· 99 99 unsigned dev; 100 100 unsigned par_dev; 101 101 unsigned unit_off; 102 + unsigned cur_pg; 102 103 unsigned cur_comp; 103 104 }; 104 105 105 106 struct ore_io_state; 106 107 typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); 108 + struct _ore_r4w_op { 109 + /* @Priv given here is passed ios->private */ 110 + struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate); 111 + void (*put_page)(void *priv, struct page *page); 112 + }; 107 113 108 114 struct ore_io_state { 109 115 struct kref kref; ··· 145 139 unsigned max_par_pages; 146 140 unsigned cur_par_page; 147 141 unsigned sgs_per_dev; 142 + struct __stripe_pages_2d *sp2d; 143 + struct ore_io_state *ios_read_4_write; 144 + const struct _ore_r4w_op *r4w; 148 145 149 146 /* Variable array of size numdevs */ 150 147 unsigned numdevs;