Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v3.6 1120 lines 30 kB view raw
1/* 2 * Copyright (C) 2005, 2006 3 * Avishay Traeger (avishay@gmail.com) 4 * Copyright (C) 2008, 2009 5 * Boaz Harrosh <bharrosh@panasas.com> 6 * 7 * This file is part of exofs. 8 * 9 * exofs is free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License as published by 11 * the Free Software Foundation. Since it is based on ext2, and the only 12 * valid version of GPL for the Linux kernel is version 2, the only valid 13 * version of GPL for exofs is version 2. 14 * 15 * exofs is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public License 21 * along with exofs; if not, write to the Free Software 22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 23 */ 24 25#include <linux/slab.h> 26#include <linux/module.h> 27#include <asm/div64.h> 28#include <linux/lcm.h> 29 30#include "ore_raid.h" 31 32MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); 33MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); 34MODULE_LICENSE("GPL"); 35 36/* ore_verify_layout does a couple of things: 37 * 1. Given a minimum number of needed parameters fixes up the rest of the 38 * members to be operatonals for the ore. The needed parameters are those 39 * that are defined by the pnfs-objects layout STD. 40 * 2. Check to see if the current ore code actually supports these parameters 41 * for example stripe_unit must be a multple of the system PAGE_SIZE, 42 * and etc... 43 * 3. Cache some havily used calculations that will be needed by users. 44 */ 45 46enum { BIO_MAX_PAGES_KMALLOC = 47 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; 48 49int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) 50{ 51 u64 stripe_length; 52 53 switch (layout->raid_algorithm) { 54 case PNFS_OSD_RAID_0: 55 layout->parity = 0; 56 break; 57 case PNFS_OSD_RAID_5: 58 layout->parity = 1; 59 break; 60 case PNFS_OSD_RAID_PQ: 61 case PNFS_OSD_RAID_4: 62 default: 63 ORE_ERR("Only RAID_0/5 for now\n"); 64 return -EINVAL; 65 } 66 if (0 != (layout->stripe_unit & ~PAGE_MASK)) { 67 ORE_ERR("Stripe Unit(0x%llx)" 68 " must be Multples of PAGE_SIZE(0x%lx)\n", 69 _LLU(layout->stripe_unit), PAGE_SIZE); 70 return -EINVAL; 71 } 72 if (layout->group_width) { 73 if (!layout->group_depth) { 74 ORE_ERR("group_depth == 0 && group_width != 0\n"); 75 return -EINVAL; 76 } 77 if (total_comps < (layout->group_width * layout->mirrors_p1)) { 78 ORE_ERR("Data Map wrong, " 79 "numdevs=%d < group_width=%d * mirrors=%d\n", 80 total_comps, layout->group_width, 81 layout->mirrors_p1); 82 return -EINVAL; 83 } 84 layout->group_count = total_comps / layout->mirrors_p1 / 85 layout->group_width; 86 } else { 87 if (layout->group_depth) { 88 printk(KERN_NOTICE "Warning: group_depth ignored " 89 "group_width == 0 && group_depth == %lld\n", 90 _LLU(layout->group_depth)); 91 } 92 layout->group_width = total_comps / layout->mirrors_p1; 93 layout->group_depth = -1; 94 layout->group_count = 1; 95 } 96 97 stripe_length = (u64)layout->group_width * layout->stripe_unit; 98 if (stripe_length >= (1ULL << 32)) { 99 ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", 100 _LLU(stripe_length)); 101 return -EINVAL; 102 } 103 104 layout->max_io_length = 105 (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * 106 layout->group_width; 107 if (layout->parity) { 108 unsigned stripe_length = 109 (layout->group_width - layout->parity) * 110 layout->stripe_unit; 111 112 layout->max_io_length /= stripe_length; 113 layout->max_io_length *= stripe_length; 114 } 115 return 0; 116} 117EXPORT_SYMBOL(ore_verify_layout); 118 119static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) 120{ 121 return ios->oc->comps[index & ios->oc->single_comp].cred; 122} 123 124static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) 125{ 126 return &ios->oc->comps[index & ios->oc->single_comp].obj; 127} 128 129static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) 130{ 131 ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", 132 ios->oc->first_dev, ios->oc->numdevs, index, 133 ios->oc->ods); 134 135 return ore_comp_dev(ios->oc, index); 136} 137 138int _ore_get_io_state(struct ore_layout *layout, 139 struct ore_components *oc, unsigned numdevs, 140 unsigned sgs_per_dev, unsigned num_par_pages, 141 struct ore_io_state **pios) 142{ 143 struct ore_io_state *ios; 144 struct page **pages; 145 struct osd_sg_entry *sgilist; 146 struct __alloc_all_io_state { 147 struct ore_io_state ios; 148 struct ore_per_dev_state per_dev[numdevs]; 149 union { 150 struct osd_sg_entry sglist[sgs_per_dev * numdevs]; 151 struct page *pages[num_par_pages]; 152 }; 153 } *_aios; 154 155 if (likely(sizeof(*_aios) <= PAGE_SIZE)) { 156 _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); 157 if (unlikely(!_aios)) { 158 ORE_DBGMSG("Failed kzalloc bytes=%zd\n", 159 sizeof(*_aios)); 160 *pios = NULL; 161 return -ENOMEM; 162 } 163 pages = num_par_pages ? _aios->pages : NULL; 164 sgilist = sgs_per_dev ? _aios->sglist : NULL; 165 ios = &_aios->ios; 166 } else { 167 struct __alloc_small_io_state { 168 struct ore_io_state ios; 169 struct ore_per_dev_state per_dev[numdevs]; 170 } *_aio_small; 171 union __extra_part { 172 struct osd_sg_entry sglist[sgs_per_dev * numdevs]; 173 struct page *pages[num_par_pages]; 174 } *extra_part; 175 176 _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); 177 if (unlikely(!_aio_small)) { 178 ORE_DBGMSG("Failed alloc first part bytes=%zd\n", 179 sizeof(*_aio_small)); 180 *pios = NULL; 181 return -ENOMEM; 182 } 183 extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); 184 if (unlikely(!extra_part)) { 185 ORE_DBGMSG("Failed alloc second part bytes=%zd\n", 186 sizeof(*extra_part)); 187 kfree(_aio_small); 188 *pios = NULL; 189 return -ENOMEM; 190 } 191 192 pages = num_par_pages ? extra_part->pages : NULL; 193 sgilist = sgs_per_dev ? extra_part->sglist : NULL; 194 /* In this case the per_dev[0].sgilist holds the pointer to 195 * be freed 196 */ 197 ios = &_aio_small->ios; 198 ios->extra_part_alloc = true; 199 } 200 201 if (pages) { 202 ios->parity_pages = pages; 203 ios->max_par_pages = num_par_pages; 204 } 205 if (sgilist) { 206 unsigned d; 207 208 for (d = 0; d < numdevs; ++d) { 209 ios->per_dev[d].sglist = sgilist; 210 sgilist += sgs_per_dev; 211 } 212 ios->sgs_per_dev = sgs_per_dev; 213 } 214 215 ios->layout = layout; 216 ios->oc = oc; 217 *pios = ios; 218 return 0; 219} 220 221/* Allocate an io_state for only a single group of devices 222 * 223 * If a user needs to call ore_read/write() this version must be used becase it 224 * allocates extra stuff for striping and raid. 225 * The ore might decide to only IO less then @length bytes do to alignmets 226 * and constrains as follows: 227 * - The IO cannot cross group boundary. 228 * - In raid5/6 The end of the IO must align at end of a stripe eg. 229 * (@offset + @length) % strip_size == 0. Or the complete range is within a 230 * single stripe. 231 * - Memory condition only permitted a shorter IO. (A user can use @length=~0 232 * And check the returned ios->length for max_io_size.) 233 * 234 * The caller must check returned ios->length (and/or ios->nr_pages) and 235 * re-issue these pages that fall outside of ios->length 236 */ 237int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, 238 bool is_reading, u64 offset, u64 length, 239 struct ore_io_state **pios) 240{ 241 struct ore_io_state *ios; 242 unsigned numdevs = layout->group_width * layout->mirrors_p1; 243 unsigned sgs_per_dev = 0, max_par_pages = 0; 244 int ret; 245 246 if (layout->parity && length) { 247 unsigned data_devs = layout->group_width - layout->parity; 248 unsigned stripe_size = layout->stripe_unit * data_devs; 249 unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; 250 u32 remainder; 251 u64 num_stripes; 252 u64 num_raid_units; 253 254 num_stripes = div_u64_rem(length, stripe_size, &remainder); 255 if (remainder) 256 ++num_stripes; 257 258 num_raid_units = num_stripes * layout->parity; 259 260 if (is_reading) { 261 /* For reads add per_dev sglist array */ 262 /* TODO: Raid 6 we need twice more. Actually: 263 * num_stripes / LCMdP(W,P); 264 * if (W%P != 0) num_stripes *= parity; 265 */ 266 267 /* first/last seg is split */ 268 num_raid_units += layout->group_width; 269 sgs_per_dev = div_u64(num_raid_units, data_devs) + 2; 270 } else { 271 /* For Writes add parity pages array. */ 272 max_par_pages = num_raid_units * pages_in_unit * 273 sizeof(struct page *); 274 } 275 } 276 277 ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, 278 pios); 279 if (unlikely(ret)) 280 return ret; 281 282 ios = *pios; 283 ios->reading = is_reading; 284 ios->offset = offset; 285 286 if (length) { 287 ore_calc_stripe_info(layout, offset, length, &ios->si); 288 ios->length = ios->si.length; 289 ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE; 290 if (layout->parity) 291 _ore_post_alloc_raid_stuff(ios); 292 } 293 294 return 0; 295} 296EXPORT_SYMBOL(ore_get_rw_state); 297 298/* Allocate an io_state for all the devices in the comps array 299 * 300 * This version of io_state allocation is used mostly by create/remove 301 * and trunc where we currently need all the devices. The only wastful 302 * bit is the read/write_attributes with no IO. Those sites should 303 * be converted to use ore_get_rw_state() with length=0 304 */ 305int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, 306 struct ore_io_state **pios) 307{ 308 return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); 309} 310EXPORT_SYMBOL(ore_get_io_state); 311 312void ore_put_io_state(struct ore_io_state *ios) 313{ 314 if (ios) { 315 unsigned i; 316 317 for (i = 0; i < ios->numdevs; i++) { 318 struct ore_per_dev_state *per_dev = &ios->per_dev[i]; 319 320 if (per_dev->or) 321 osd_end_request(per_dev->or); 322 if (per_dev->bio) 323 bio_put(per_dev->bio); 324 } 325 326 _ore_free_raid_stuff(ios); 327 kfree(ios); 328 } 329} 330EXPORT_SYMBOL(ore_put_io_state); 331 332static void _sync_done(struct ore_io_state *ios, void *p) 333{ 334 struct completion *waiting = p; 335 336 complete(waiting); 337} 338 339static void _last_io(struct kref *kref) 340{ 341 struct ore_io_state *ios = container_of( 342 kref, struct ore_io_state, kref); 343 344 ios->done(ios, ios->private); 345} 346 347static void _done_io(struct osd_request *or, void *p) 348{ 349 struct ore_io_state *ios = p; 350 351 kref_put(&ios->kref, _last_io); 352} 353 354int ore_io_execute(struct ore_io_state *ios) 355{ 356 DECLARE_COMPLETION_ONSTACK(wait); 357 bool sync = (ios->done == NULL); 358 int i, ret; 359 360 if (sync) { 361 ios->done = _sync_done; 362 ios->private = &wait; 363 } 364 365 for (i = 0; i < ios->numdevs; i++) { 366 struct osd_request *or = ios->per_dev[i].or; 367 if (unlikely(!or)) 368 continue; 369 370 ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL); 371 if (unlikely(ret)) { 372 ORE_DBGMSG("Failed to osd_finalize_request() => %d\n", 373 ret); 374 return ret; 375 } 376 } 377 378 kref_init(&ios->kref); 379 380 for (i = 0; i < ios->numdevs; i++) { 381 struct osd_request *or = ios->per_dev[i].or; 382 if (unlikely(!or)) 383 continue; 384 385 kref_get(&ios->kref); 386 osd_execute_request_async(or, _done_io, ios); 387 } 388 389 kref_put(&ios->kref, _last_io); 390 ret = 0; 391 392 if (sync) { 393 wait_for_completion(&wait); 394 ret = ore_check_io(ios, NULL); 395 } 396 return ret; 397} 398 399static void _clear_bio(struct bio *bio) 400{ 401 struct bio_vec *bv; 402 unsigned i; 403 404 __bio_for_each_segment(bv, bio, i, 0) { 405 unsigned this_count = bv->bv_len; 406 407 if (likely(PAGE_SIZE == this_count)) 408 clear_highpage(bv->bv_page); 409 else 410 zero_user(bv->bv_page, bv->bv_offset, this_count); 411 } 412} 413 414int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) 415{ 416 enum osd_err_priority acumulated_osd_err = 0; 417 int acumulated_lin_err = 0; 418 int i; 419 420 for (i = 0; i < ios->numdevs; i++) { 421 struct osd_sense_info osi; 422 struct ore_per_dev_state *per_dev = &ios->per_dev[i]; 423 struct osd_request *or = per_dev->or; 424 int ret; 425 426 if (unlikely(!or)) 427 continue; 428 429 ret = osd_req_decode_sense(or, &osi); 430 if (likely(!ret)) 431 continue; 432 433 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 434 /* start read offset passed endof file */ 435 _clear_bio(per_dev->bio); 436 ORE_DBGMSG("start read offset passed end of file " 437 "offset=0x%llx, length=0x%llx\n", 438 _LLU(per_dev->offset), 439 _LLU(per_dev->length)); 440 441 continue; /* we recovered */ 442 } 443 444 if (on_dev_error) { 445 u64 residual = ios->reading ? 446 or->in.residual : or->out.residual; 447 u64 offset = (ios->offset + ios->length) - residual; 448 unsigned dev = per_dev->dev - ios->oc->first_dev; 449 struct ore_dev *od = ios->oc->ods[dev]; 450 451 on_dev_error(ios, od, dev, osi.osd_err_pri, 452 offset, residual); 453 } 454 if (osi.osd_err_pri >= acumulated_osd_err) { 455 acumulated_osd_err = osi.osd_err_pri; 456 acumulated_lin_err = ret; 457 } 458 } 459 460 return acumulated_lin_err; 461} 462EXPORT_SYMBOL(ore_check_io); 463 464/* 465 * L - logical offset into the file 466 * 467 * D - number of Data devices 468 * D = group_width - parity 469 * 470 * U - The number of bytes in a stripe within a group 471 * U = stripe_unit * D 472 * 473 * T - The number of bytes striped within a group of component objects 474 * (before advancing to the next group) 475 * T = U * group_depth 476 * 477 * S - The number of bytes striped across all component objects 478 * before the pattern repeats 479 * S = T * group_count 480 * 481 * M - The "major" (i.e., across all components) cycle number 482 * M = L / S 483 * 484 * G - Counts the groups from the beginning of the major cycle 485 * G = (L - (M * S)) / T [or (L % S) / T] 486 * 487 * H - The byte offset within the group 488 * H = (L - (M * S)) % T [or (L % S) % T] 489 * 490 * N - The "minor" (i.e., across the group) stripe number 491 * N = H / U 492 * 493 * C - The component index coresponding to L 494 * 495 * C = (H - (N * U)) / stripe_unit + G * D 496 * [or (L % U) / stripe_unit + G * D] 497 * 498 * O - The component offset coresponding to L 499 * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit 500 * 501 * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity 502 * divide by parity 503 * LCMdP = lcm(group_width, parity) / parity 504 * 505 * R - The parity Rotation stripe 506 * (Note parity cycle always starts at a group's boundary) 507 * R = N % LCMdP 508 * 509 * I = the first parity device index 510 * I = (group_width + group_width - R*parity - parity) % group_width 511 * 512 * Craid - The component index Rotated 513 * Craid = (group_width + C - R*parity) % group_width 514 * (We add the group_width to avoid negative numbers modulo math) 515 */ 516void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, 517 u64 length, struct ore_striping_info *si) 518{ 519 u32 stripe_unit = layout->stripe_unit; 520 u32 group_width = layout->group_width; 521 u64 group_depth = layout->group_depth; 522 u32 parity = layout->parity; 523 524 u32 D = group_width - parity; 525 u32 U = D * stripe_unit; 526 u64 T = U * group_depth; 527 u64 S = T * layout->group_count; 528 u64 M = div64_u64(file_offset, S); 529 530 /* 531 G = (L - (M * S)) / T 532 H = (L - (M * S)) % T 533 */ 534 u64 LmodS = file_offset - M * S; 535 u32 G = div64_u64(LmodS, T); 536 u64 H = LmodS - G * T; 537 538 u32 N = div_u64(H, U); 539 540 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 541 u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; 542 543 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 544 545 si->obj_offset = si->unit_off + (N * stripe_unit) + 546 (M * group_depth * stripe_unit); 547 548 if (parity) { 549 u32 LCMdP = lcm(group_width, parity) / parity; 550 /* R = N % LCMdP; */ 551 u32 RxP = (N % LCMdP) * parity; 552 u32 first_dev = C - C % group_width; 553 554 si->par_dev = (group_width + group_width - parity - RxP) % 555 group_width + first_dev; 556 si->dev = (group_width + C - RxP) % group_width + first_dev; 557 si->bytes_in_stripe = U; 558 si->first_stripe_start = M * S + G * T + N * U; 559 } else { 560 /* Make the math correct see _prepare_one_group */ 561 si->par_dev = group_width; 562 si->dev = C; 563 } 564 565 si->dev *= layout->mirrors_p1; 566 si->par_dev *= layout->mirrors_p1; 567 si->offset = file_offset; 568 si->length = T - H; 569 if (si->length > length) 570 si->length = length; 571 si->M = M; 572} 573EXPORT_SYMBOL(ore_calc_stripe_info); 574 575int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, 576 unsigned pgbase, struct page **pages, 577 struct ore_per_dev_state *per_dev, int cur_len) 578{ 579 unsigned pg = *cur_pg; 580 struct request_queue *q = 581 osd_request_queue(_ios_od(ios, per_dev->dev)); 582 unsigned len = cur_len; 583 int ret; 584 585 if (per_dev->bio == NULL) { 586 unsigned pages_in_stripe = ios->layout->group_width * 587 (ios->layout->stripe_unit / PAGE_SIZE); 588 unsigned nr_pages = ios->nr_pages * ios->layout->group_width / 589 (ios->layout->group_width - 590 ios->layout->parity); 591 unsigned bio_size = (nr_pages + pages_in_stripe) / 592 ios->layout->group_width; 593 594 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 595 if (unlikely(!per_dev->bio)) { 596 ORE_DBGMSG("Failed to allocate BIO size=%u\n", 597 bio_size); 598 ret = -ENOMEM; 599 goto out; 600 } 601 } 602 603 while (cur_len > 0) { 604 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); 605 unsigned added_len; 606 607 cur_len -= pglen; 608 609 added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], 610 pglen, pgbase); 611 if (unlikely(pglen != added_len)) { 612 ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n", 613 per_dev->bio->bi_vcnt); 614 ret = -ENOMEM; 615 goto out; 616 } 617 _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); 618 619 pgbase = 0; 620 ++pg; 621 } 622 BUG_ON(cur_len); 623 624 per_dev->length += len; 625 *cur_pg = pg; 626 ret = 0; 627out: /* we fail the complete unit on an error eg don't advance 628 * per_dev->length and cur_pg. This means that we might have a bigger 629 * bio than the CDB requested length (per_dev->length). That's fine 630 * only the oposite is fatal. 631 */ 632 return ret; 633} 634 635static int _prepare_for_striping(struct ore_io_state *ios) 636{ 637 struct ore_striping_info *si = &ios->si; 638 unsigned stripe_unit = ios->layout->stripe_unit; 639 unsigned mirrors_p1 = ios->layout->mirrors_p1; 640 unsigned group_width = ios->layout->group_width; 641 unsigned devs_in_group = group_width * mirrors_p1; 642 unsigned dev = si->dev; 643 unsigned first_dev = dev - (dev % devs_in_group); 644 unsigned dev_order; 645 unsigned cur_pg = ios->pages_consumed; 646 u64 length = ios->length; 647 int ret = 0; 648 649 if (!ios->pages) { 650 ios->numdevs = ios->layout->mirrors_p1; 651 return 0; 652 } 653 654 BUG_ON(length > si->length); 655 656 dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev); 657 si->cur_comp = dev_order; 658 si->cur_pg = si->unit_off / PAGE_SIZE; 659 660 while (length) { 661 unsigned comp = dev - first_dev; 662 struct ore_per_dev_state *per_dev = &ios->per_dev[comp]; 663 unsigned cur_len, page_off = 0; 664 665 if (!per_dev->length) { 666 per_dev->dev = dev; 667 if (dev == si->dev) { 668 WARN_ON(dev == si->par_dev); 669 per_dev->offset = si->obj_offset; 670 cur_len = stripe_unit - si->unit_off; 671 page_off = si->unit_off & ~PAGE_MASK; 672 BUG_ON(page_off && (page_off != ios->pgbase)); 673 } else { 674 if (si->cur_comp > dev_order) 675 per_dev->offset = 676 si->obj_offset - si->unit_off; 677 else /* si->cur_comp < dev_order */ 678 per_dev->offset = 679 si->obj_offset + stripe_unit - 680 si->unit_off; 681 cur_len = stripe_unit; 682 } 683 } else { 684 cur_len = stripe_unit; 685 } 686 if (cur_len >= length) 687 cur_len = length; 688 689 ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, 690 per_dev, cur_len); 691 if (unlikely(ret)) 692 goto out; 693 694 dev += mirrors_p1; 695 dev = (dev % devs_in_group) + first_dev; 696 697 length -= cur_len; 698 699 si->cur_comp = (si->cur_comp + 1) % group_width; 700 if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { 701 if (!length && ios->sp2d) { 702 /* If we are writing and this is the very last 703 * stripe. then operate on parity dev. 704 */ 705 dev = si->par_dev; 706 } 707 if (ios->sp2d) 708 /* In writes cur_len just means if it's the 709 * last one. See _ore_add_parity_unit. 710 */ 711 cur_len = length; 712 per_dev = &ios->per_dev[dev - first_dev]; 713 if (!per_dev->length) { 714 /* Only/always the parity unit of the first 715 * stripe will be empty. So this is a chance to 716 * initialize the per_dev info. 717 */ 718 per_dev->dev = dev; 719 per_dev->offset = si->obj_offset - si->unit_off; 720 } 721 722 ret = _ore_add_parity_unit(ios, si, per_dev, cur_len); 723 if (unlikely(ret)) 724 goto out; 725 726 /* Rotate next par_dev backwards with wraping */ 727 si->par_dev = (devs_in_group + si->par_dev - 728 ios->layout->parity * mirrors_p1) % 729 devs_in_group + first_dev; 730 /* Next stripe, start fresh */ 731 si->cur_comp = 0; 732 si->cur_pg = 0; 733 } 734 } 735out: 736 ios->numdevs = devs_in_group; 737 ios->pages_consumed = cur_pg; 738 return ret; 739} 740 741int ore_create(struct ore_io_state *ios) 742{ 743 int i, ret; 744 745 for (i = 0; i < ios->oc->numdevs; i++) { 746 struct osd_request *or; 747 748 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); 749 if (unlikely(!or)) { 750 ORE_ERR("%s: osd_start_request failed\n", __func__); 751 ret = -ENOMEM; 752 goto out; 753 } 754 ios->per_dev[i].or = or; 755 ios->numdevs++; 756 757 osd_req_create_object(or, _ios_obj(ios, i)); 758 } 759 ret = ore_io_execute(ios); 760 761out: 762 return ret; 763} 764EXPORT_SYMBOL(ore_create); 765 766int ore_remove(struct ore_io_state *ios) 767{ 768 int i, ret; 769 770 for (i = 0; i < ios->oc->numdevs; i++) { 771 struct osd_request *or; 772 773 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); 774 if (unlikely(!or)) { 775 ORE_ERR("%s: osd_start_request failed\n", __func__); 776 ret = -ENOMEM; 777 goto out; 778 } 779 ios->per_dev[i].or = or; 780 ios->numdevs++; 781 782 osd_req_remove_object(or, _ios_obj(ios, i)); 783 } 784 ret = ore_io_execute(ios); 785 786out: 787 return ret; 788} 789EXPORT_SYMBOL(ore_remove); 790 791static int _write_mirror(struct ore_io_state *ios, int cur_comp) 792{ 793 struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp]; 794 unsigned dev = ios->per_dev[cur_comp].dev; 795 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 796 int ret = 0; 797 798 if (ios->pages && !master_dev->length) 799 return 0; /* Just an empty slot */ 800 801 for (; cur_comp < last_comp; ++cur_comp, ++dev) { 802 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 803 struct osd_request *or; 804 805 or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); 806 if (unlikely(!or)) { 807 ORE_ERR("%s: osd_start_request failed\n", __func__); 808 ret = -ENOMEM; 809 goto out; 810 } 811 per_dev->or = or; 812 813 if (ios->pages) { 814 struct bio *bio; 815 816 if (per_dev != master_dev) { 817 bio = bio_kmalloc(GFP_KERNEL, 818 master_dev->bio->bi_max_vecs); 819 if (unlikely(!bio)) { 820 ORE_DBGMSG( 821 "Failed to allocate BIO size=%u\n", 822 master_dev->bio->bi_max_vecs); 823 ret = -ENOMEM; 824 goto out; 825 } 826 827 __bio_clone(bio, master_dev->bio); 828 bio->bi_bdev = NULL; 829 bio->bi_next = NULL; 830 per_dev->offset = master_dev->offset; 831 per_dev->length = master_dev->length; 832 per_dev->bio = bio; 833 per_dev->dev = dev; 834 } else { 835 bio = master_dev->bio; 836 /* FIXME: bio_set_dir() */ 837 bio->bi_rw |= REQ_WRITE; 838 } 839 840 osd_req_write(or, _ios_obj(ios, cur_comp), 841 per_dev->offset, bio, per_dev->length); 842 ORE_DBGMSG("write(0x%llx) offset=0x%llx " 843 "length=0x%llx dev=%d\n", 844 _LLU(_ios_obj(ios, cur_comp)->id), 845 _LLU(per_dev->offset), 846 _LLU(per_dev->length), dev); 847 } else if (ios->kern_buff) { 848 per_dev->offset = ios->si.obj_offset; 849 per_dev->dev = ios->si.dev + dev; 850 851 /* no cross device without page array */ 852 BUG_ON((ios->layout->group_width > 1) && 853 (ios->si.unit_off + ios->length > 854 ios->layout->stripe_unit)); 855 856 ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp), 857 per_dev->offset, 858 ios->kern_buff, ios->length); 859 if (unlikely(ret)) 860 goto out; 861 ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " 862 "length=0x%llx dev=%d\n", 863 _LLU(_ios_obj(ios, cur_comp)->id), 864 _LLU(per_dev->offset), 865 _LLU(ios->length), per_dev->dev); 866 } else { 867 osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); 868 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", 869 _LLU(_ios_obj(ios, cur_comp)->id), 870 ios->out_attr_len, dev); 871 } 872 873 if (ios->out_attr) 874 osd_req_add_set_attr_list(or, ios->out_attr, 875 ios->out_attr_len); 876 877 if (ios->in_attr) 878 osd_req_add_get_attr_list(or, ios->in_attr, 879 ios->in_attr_len); 880 } 881 882out: 883 return ret; 884} 885 886int ore_write(struct ore_io_state *ios) 887{ 888 int i; 889 int ret; 890 891 if (unlikely(ios->sp2d && !ios->r4w)) { 892 /* A library is attempting a RAID-write without providing 893 * a pages lock interface. 894 */ 895 WARN_ON_ONCE(1); 896 return -ENOTSUPP; 897 } 898 899 ret = _prepare_for_striping(ios); 900 if (unlikely(ret)) 901 return ret; 902 903 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 904 ret = _write_mirror(ios, i); 905 if (unlikely(ret)) 906 return ret; 907 } 908 909 ret = ore_io_execute(ios); 910 return ret; 911} 912EXPORT_SYMBOL(ore_write); 913 914int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) 915{ 916 struct osd_request *or; 917 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 918 struct osd_obj_id *obj = _ios_obj(ios, cur_comp); 919 unsigned first_dev = (unsigned)obj->id; 920 921 if (ios->pages && !per_dev->length) 922 return 0; /* Just an empty slot */ 923 924 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; 925 or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); 926 if (unlikely(!or)) { 927 ORE_ERR("%s: osd_start_request failed\n", __func__); 928 return -ENOMEM; 929 } 930 per_dev->or = or; 931 932 if (ios->pages) { 933 if (per_dev->cur_sg) { 934 /* finalize the last sg_entry */ 935 _ore_add_sg_seg(per_dev, 0, false); 936 if (unlikely(!per_dev->cur_sg)) 937 return 0; /* Skip parity only device */ 938 939 osd_req_read_sg(or, obj, per_dev->bio, 940 per_dev->sglist, per_dev->cur_sg); 941 } else { 942 /* The no raid case */ 943 osd_req_read(or, obj, per_dev->offset, 944 per_dev->bio, per_dev->length); 945 } 946 947 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 948 " dev=%d sg_len=%d\n", _LLU(obj->id), 949 _LLU(per_dev->offset), _LLU(per_dev->length), 950 first_dev, per_dev->cur_sg); 951 } else { 952 BUG_ON(ios->kern_buff); 953 954 osd_req_get_attributes(or, obj); 955 ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", 956 _LLU(obj->id), 957 ios->in_attr_len, first_dev); 958 } 959 if (ios->out_attr) 960 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); 961 962 if (ios->in_attr) 963 osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); 964 965 return 0; 966} 967 968int ore_read(struct ore_io_state *ios) 969{ 970 int i; 971 int ret; 972 973 ret = _prepare_for_striping(ios); 974 if (unlikely(ret)) 975 return ret; 976 977 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 978 ret = _ore_read_mirror(ios, i); 979 if (unlikely(ret)) 980 return ret; 981 } 982 983 ret = ore_io_execute(ios); 984 return ret; 985} 986EXPORT_SYMBOL(ore_read); 987 988int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr) 989{ 990 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ 991 void *iter = NULL; 992 int nelem; 993 994 do { 995 nelem = 1; 996 osd_req_decode_get_attr_list(ios->per_dev[0].or, 997 &cur_attr, &nelem, &iter); 998 if ((cur_attr.attr_page == attr->attr_page) && 999 (cur_attr.attr_id == attr->attr_id)) { 1000 attr->len = cur_attr.len; 1001 attr->val_ptr = cur_attr.val_ptr; 1002 return 0; 1003 } 1004 } while (iter); 1005 1006 return -EIO; 1007} 1008EXPORT_SYMBOL(extract_attr_from_ios); 1009 1010static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, 1011 struct osd_attr *attr) 1012{ 1013 int last_comp = cur_comp + ios->layout->mirrors_p1; 1014 1015 for (; cur_comp < last_comp; ++cur_comp) { 1016 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 1017 struct osd_request *or; 1018 1019 or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); 1020 if (unlikely(!or)) { 1021 ORE_ERR("%s: osd_start_request failed\n", __func__); 1022 return -ENOMEM; 1023 } 1024 per_dev->or = or; 1025 1026 osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); 1027 osd_req_add_set_attr_list(or, attr, 1); 1028 } 1029 1030 return 0; 1031} 1032 1033struct _trunc_info { 1034 struct ore_striping_info si; 1035 u64 prev_group_obj_off; 1036 u64 next_group_obj_off; 1037 1038 unsigned first_group_dev; 1039 unsigned nex_group_dev; 1040}; 1041 1042static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, 1043 struct _trunc_info *ti) 1044{ 1045 unsigned stripe_unit = layout->stripe_unit; 1046 1047 ore_calc_stripe_info(layout, file_offset, 0, &ti->si); 1048 1049 ti->prev_group_obj_off = ti->si.M * stripe_unit; 1050 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; 1051 1052 ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); 1053 ti->nex_group_dev = ti->first_group_dev + layout->group_width; 1054} 1055 1056int ore_truncate(struct ore_layout *layout, struct ore_components *oc, 1057 u64 size) 1058{ 1059 struct ore_io_state *ios; 1060 struct exofs_trunc_attr { 1061 struct osd_attr attr; 1062 __be64 newsize; 1063 } *size_attrs; 1064 struct _trunc_info ti; 1065 int i, ret; 1066 1067 ret = ore_get_io_state(layout, oc, &ios); 1068 if (unlikely(ret)) 1069 return ret; 1070 1071 _calc_trunk_info(ios->layout, size, &ti); 1072 1073 size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), 1074 GFP_KERNEL); 1075 if (unlikely(!size_attrs)) { 1076 ret = -ENOMEM; 1077 goto out; 1078 } 1079 1080 ios->numdevs = ios->oc->numdevs; 1081 1082 for (i = 0; i < ios->numdevs; ++i) { 1083 struct exofs_trunc_attr *size_attr = &size_attrs[i]; 1084 u64 obj_size; 1085 1086 if (i < ti.first_group_dev) 1087 obj_size = ti.prev_group_obj_off; 1088 else if (i >= ti.nex_group_dev) 1089 obj_size = ti.next_group_obj_off; 1090 else if (i < ti.si.dev) /* dev within this group */ 1091 obj_size = ti.si.obj_offset + 1092 ios->layout->stripe_unit - ti.si.unit_off; 1093 else if (i == ti.si.dev) 1094 obj_size = ti.si.obj_offset; 1095 else /* i > ti.dev */ 1096 obj_size = ti.si.obj_offset - ti.si.unit_off; 1097 1098 size_attr->newsize = cpu_to_be64(obj_size); 1099 size_attr->attr = g_attr_logical_length; 1100 size_attr->attr.val_ptr = &size_attr->newsize; 1101 1102 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", 1103 _LLU(oc->comps->obj.id), _LLU(obj_size), i); 1104 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, 1105 &size_attr->attr); 1106 if (unlikely(ret)) 1107 goto out; 1108 } 1109 ret = ore_io_execute(ios); 1110 1111out: 1112 kfree(size_attrs); 1113 ore_put_io_state(ios); 1114 return ret; 1115} 1116EXPORT_SYMBOL(ore_truncate); 1117 1118const struct osd_attr g_attr_logical_length = ATTR_DEF( 1119 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 1120EXPORT_SYMBOL(g_attr_logical_length);