/*
 * Copyright (C) 2005, 2006
 * Avishay Traeger (avishay@gmail.com)
 * Copyright (C) 2008, 2009
 * Boaz Harrosh <bharrosh@panasas.com>
 *
 * This file is part of exofs.
 *
 * exofs is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation. Since it is based on ext2, and the only
 * valid version of GPL for the Linux kernel is version 2, the only valid
 * version of GPL for exofs is version 2.
 *
 * exofs is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with exofs; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <linux/slab.h>
#include <linux/module.h>
#include <asm/div64.h>
#include <linux/lcm.h>

#include "ore_raid.h"

MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
MODULE_LICENSE("GPL");

/* ore_verify_layout does a couple of things:
 * 1. Given a minimum number of needed parameters, fixes up the rest of the
 *    members to be operational for the ore. The needed parameters are those
 *    that are defined by the pnfs-objects layout STD.
 * 2. Checks that the current ore code actually supports these parameters,
 *    for example stripe_unit must be a multiple of the system PAGE_SIZE,
 *    etc.
 * 3. Caches some heavily used calculations that will be needed by users.
 */

enum { BIO_MAX_PAGES_KMALLOC =
	(PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),};

int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
{
	u64 stripe_length;

	switch (layout->raid_algorithm) {
	case PNFS_OSD_RAID_0:
		layout->parity = 0;
		break;
	case PNFS_OSD_RAID_5:
		layout->parity = 1;
		break;
	case PNFS_OSD_RAID_PQ:
	case PNFS_OSD_RAID_4:
	default:
		ORE_ERR("Only RAID_0/5 for now\n");
		return -EINVAL;
	}
	if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
		ORE_ERR("Stripe Unit(0x%llx)"
			" must be Multples of PAGE_SIZE(0x%lx)\n",
			_LLU(layout->stripe_unit), PAGE_SIZE);
		return -EINVAL;
	}
	if (layout->group_width) {
		if (!layout->group_depth) {
			ORE_ERR("group_depth == 0 && group_width != 0\n");
			return -EINVAL;
		}
		if (total_comps < (layout->group_width * layout->mirrors_p1)) {
			ORE_ERR("Data Map wrong, "
				"numdevs=%d < group_width=%d * mirrors=%d\n",
				total_comps, layout->group_width,
				layout->mirrors_p1);
			return -EINVAL;
		}
		layout->group_count = total_comps / layout->mirrors_p1 /
						layout->group_width;
	} else {
		if (layout->group_depth) {
			printk(KERN_NOTICE "Warning: group_depth ignored "
				"group_width == 0 && group_depth == %lld\n",
				_LLU(layout->group_depth));
		}
		layout->group_width = total_comps / layout->mirrors_p1;
		layout->group_depth = -1;
		layout->group_count = 1;
	}

	stripe_length = (u64)layout->group_width * layout->stripe_unit;
	if (stripe_length >= (1ULL << 32)) {
		ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n",
			_LLU(stripe_length));
		return -EINVAL;
	}

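	/* An illustrative sizing example (hypothetical numbers, not values
	 * mandated by the layout): with 4K pages BIO_MAX_PAGES_KMALLOC is
	 * roughly 250 pages, so for stripe_unit=64K, group_width=10,
	 * parity=1 the per-device bio budget is about 250 * 4K - 64K,
	 * multiplied by the 9 data devices. The rounding below then trims
	 * max_io_length down to a whole number of 9 * 64K data stripes, so
	 * a full RAID-5 write always ends on a stripe boundary.
	 */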
	layout->max_io_length =
		(BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
					(layout->group_width - layout->parity);
	if (layout->parity) {
		unsigned stripe_length =
				(layout->group_width - layout->parity) *
				layout->stripe_unit;

		layout->max_io_length /= stripe_length;
		layout->max_io_length *= stripe_length;
	}
	return 0;
}
EXPORT_SYMBOL(ore_verify_layout);

static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
{
	return ios->oc->comps[index & ios->oc->single_comp].cred;
}

static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
{
	return &ios->oc->comps[index & ios->oc->single_comp].obj;
}

static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
{
	ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n",
		    ios->oc->first_dev, ios->oc->numdevs, index,
		    ios->oc->ods);

	return ore_comp_dev(ios->oc, index);
}

int _ore_get_io_state(struct ore_layout *layout,
		      struct ore_components *oc, unsigned numdevs,
		      unsigned sgs_per_dev, unsigned num_par_pages,
		      struct ore_io_state **pios)
{
	struct ore_io_state *ios;
	struct page **pages;
	struct osd_sg_entry *sgilist;
	struct __alloc_all_io_state {
		struct ore_io_state ios;
		struct ore_per_dev_state per_dev[numdevs];
		union {
			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
			struct page *pages[num_par_pages];
		};
	} *_aios;

	if (likely(sizeof(*_aios) <= PAGE_SIZE)) {
		_aios = kzalloc(sizeof(*_aios), GFP_KERNEL);
		if (unlikely(!_aios)) {
			ORE_DBGMSG("Failed kzalloc bytes=%zd\n",
				   sizeof(*_aios));
			*pios = NULL;
			return -ENOMEM;
		}
		pages = num_par_pages ? _aios->pages : NULL;
		sgilist = sgs_per_dev ? _aios->sglist : NULL;
		ios = &_aios->ios;
	} else {
		struct __alloc_small_io_state {
			struct ore_io_state ios;
			struct ore_per_dev_state per_dev[numdevs];
		} *_aio_small;
		union __extra_part {
			struct osd_sg_entry sglist[sgs_per_dev * numdevs];
			struct page *pages[num_par_pages];
		} *extra_part;

		_aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL);
		if (unlikely(!_aio_small)) {
			ORE_DBGMSG("Failed alloc first part bytes=%zd\n",
				   sizeof(*_aio_small));
			*pios = NULL;
			return -ENOMEM;
		}
		extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL);
		if (unlikely(!extra_part)) {
			ORE_DBGMSG("Failed alloc second part bytes=%zd\n",
				   sizeof(*extra_part));
			kfree(_aio_small);
			*pios = NULL;
			return -ENOMEM;
		}

		pages = num_par_pages ? extra_part->pages : NULL;
		sgilist = sgs_per_dev ? extra_part->sglist : NULL;
		/* In this case the per_dev[0].sglist holds the pointer to
		 * be freed
		 */
		ios = &_aio_small->ios;
		ios->extra_part_alloc = true;
	}

	if (pages) {
		ios->parity_pages = pages;
		ios->max_par_pages = num_par_pages;
	}
	if (sgilist) {
		unsigned d;

		for (d = 0; d < numdevs; ++d) {
			ios->per_dev[d].sglist = sgilist;
			sgilist += sgs_per_dev;
		}
		ios->sgs_per_dev = sgs_per_dev;
	}

	ios->layout = layout;
	ios->oc = oc;
	*pios = ios;
	return 0;
}

/* Allocate an io_state for only a single group of devices
 *
 * If a user needs to call ore_read/write() this version must be used because
 * it allocates extra stuff for striping and raid.
 * The ore might decide to only IO less than @length bytes due to alignments
 * and constraints as follows:
 * - The IO cannot cross a group boundary.
 * - In raid5/6 the end of the IO must align at the end of a stripe, e.g.
 *   (@offset + @length) % stripe_size == 0, or the complete range is within a
 *   single stripe.
 * - Memory conditions only permitted a shorter IO. (A user can use @length=~0
 *   and check the returned ios->length for max_io_size.)
 *
 * The caller must check the returned ios->length (and/or ios->nr_pages) and
 * re-issue those pages that fall outside of ios->length
 */
int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
		     bool is_reading, u64 offset, u64 length,
		     struct ore_io_state **pios)
{
	struct ore_io_state *ios;
	unsigned numdevs = layout->group_width * layout->mirrors_p1;
	unsigned sgs_per_dev = 0, max_par_pages = 0;
	int ret;

	if (layout->parity && length) {
		unsigned data_devs = layout->group_width - layout->parity;
		unsigned stripe_size = layout->stripe_unit * data_devs;
		unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE;
		u32 remainder;
		u64 num_stripes;
		u64 num_raid_units;

		num_stripes = div_u64_rem(length, stripe_size, &remainder);
		if (remainder)
			++num_stripes;

		num_raid_units = num_stripes * layout->parity;

		if (is_reading) {
			/* For reads add per_dev sglist array */
			/* TODO: Raid 6 we need twice more. Actually:
			 * num_stripes / LCMdP(W,P);
			 * if (W%P != 0) num_stripes *= parity;
			 */

			/* first/last seg is split */
			num_raid_units += layout->group_width;
			sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
		} else {
			/* For Writes add parity pages array. */
			max_par_pages = num_raid_units * pages_in_unit *
						sizeof(struct page *);
		}
	}

	ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages,
				pios);
	if (unlikely(ret))
		return ret;

	ios = *pios;
	ios->reading = is_reading;
	ios->offset = offset;

	if (length) {
		ore_calc_stripe_info(layout, offset, length, &ios->si);
		ios->length = ios->si.length;
		ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
				 ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
		if (layout->parity)
			_ore_post_alloc_raid_stuff(ios);
	}

	return 0;
}
EXPORT_SYMBOL(ore_get_rw_state);

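/* A minimal usage sketch (hypothetical caller; the details are assumptions,
 * not code from this file):
 *
 *	struct ore_io_state *ios;
 *	int ret = ore_get_rw_state(layout, oc, true, offset, length, &ios);
 *
 *	if (likely(!ret)) {
 *		ios->pages = pages;	(page array covering ios->length)
 *		ret = ore_read(ios);	(or ore_write() when is_reading=false)
 *		ore_put_io_state(ios);
 *	}
 *
 * If the returned ios->length is shorter than the requested @length, the
 * remainder must be re-issued with a fresh io_state.
 */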
/* Allocate an io_state for all the devices in the comps array
 *
 * This version of io_state allocation is used mostly by create/remove
 * and trunc where we currently need all the devices. The only wasteful
 * bit is the read/write_attributes with no IO. Those sites should
 * be converted to use ore_get_rw_state() with length=0
 */
int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc,
		     struct ore_io_state **pios)
{
	return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios);
}
EXPORT_SYMBOL(ore_get_io_state);

void ore_put_io_state(struct ore_io_state *ios)
{
	if (ios) {
		unsigned i;

		for (i = 0; i < ios->numdevs; i++) {
			struct ore_per_dev_state *per_dev = &ios->per_dev[i];

			if (per_dev->or)
				osd_end_request(per_dev->or);
			if (per_dev->bio)
				bio_put(per_dev->bio);
		}

		_ore_free_raid_stuff(ios);
		kfree(ios);
	}
}
EXPORT_SYMBOL(ore_put_io_state);

static void _sync_done(struct ore_io_state *ios, void *p)
{
	struct completion *waiting = p;

	complete(waiting);
}

static void _last_io(struct kref *kref)
{
	struct ore_io_state *ios = container_of(
					kref, struct ore_io_state, kref);

	ios->done(ios, ios->private);
}

static void _done_io(struct osd_request *or, void *p)
{
	struct ore_io_state *ios = p;

	kref_put(&ios->kref, _last_io);
}

int ore_io_execute(struct ore_io_state *ios)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	bool sync = (ios->done == NULL);
	int i, ret;

	if (sync) {
		ios->done = _sync_done;
		ios->private = &wait;
	}

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_request *or = ios->per_dev[i].or;
		if (unlikely(!or))
			continue;

		ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
		if (unlikely(ret)) {
			ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
				   ret);
			return ret;
		}
	}

	kref_init(&ios->kref);

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_request *or = ios->per_dev[i].or;
		if (unlikely(!or))
			continue;

		kref_get(&ios->kref);
		osd_execute_request_async(or, _done_io, ios);
	}

	kref_put(&ios->kref, _last_io);
	ret = 0;

	if (sync) {
		wait_for_completion(&wait);
		ret = ore_check_io(ios, NULL);
	}
	return ret;
}

static void _clear_bio(struct bio *bio)
{
	struct bio_vec *bv;
	unsigned i;

	bio_for_each_segment_all(bv, bio, i) {
		unsigned this_count = bv->bv_len;

		if (likely(PAGE_SIZE == this_count))
			clear_highpage(bv->bv_page);
		else
			zero_user(bv->bv_page, bv->bv_offset, this_count);
	}
}

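/* Walk all devices of the io_state, decode the OSD sense data of any request
 * that failed, and return the worst error seen. A read that merely ran past
 * the end of the object (OSD_ERR_PRI_CLEAR_PAGES with a bio present) is
 * recovered by zeroing the returned pages and is not treated as an error.
 * When @on_dev_error is given it is called for every non-recovered failing
 * device with the failing offset and residual.
 */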
int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
{
	enum osd_err_priority acumulated_osd_err = 0;
	int acumulated_lin_err = 0;
	int i;

	for (i = 0; i < ios->numdevs; i++) {
		struct osd_sense_info osi;
		struct ore_per_dev_state *per_dev = &ios->per_dev[i];
		struct osd_request *or = per_dev->or;
		int ret;

		if (unlikely(!or))
			continue;

		ret = osd_req_decode_sense(or, &osi);
		if (likely(!ret))
			continue;

		if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
		    per_dev->bio) {
			/* start read offset passed end of file.
			 * Note: if we do not have a bio it means
			 * read-attributes. In this case we should return the
			 * error to the caller.
			 */
			_clear_bio(per_dev->bio);
			ORE_DBGMSG("start read offset passed end of file "
				"offset=0x%llx, length=0x%llx\n",
				_LLU(per_dev->offset),
				_LLU(per_dev->length));

			continue;	/* we recovered */
		}

		if (on_dev_error) {
			u64 residual = ios->reading ?
					or->in.residual : or->out.residual;
			u64 offset = (ios->offset + ios->length) - residual;
			unsigned dev = per_dev->dev - ios->oc->first_dev;
			struct ore_dev *od = ios->oc->ods[dev];

			on_dev_error(ios, od, dev, osi.osd_err_pri,
				     offset, residual);
		}
		if (osi.osd_err_pri >= acumulated_osd_err) {
			acumulated_osd_err = osi.osd_err_pri;
			acumulated_lin_err = ret;
		}
	}

	return acumulated_lin_err;
}
EXPORT_SYMBOL(ore_check_io);

/*
 * L - logical offset into the file
 *
 * D - number of Data devices
 *	D = group_width - parity
 *
 * U - The number of bytes in a stripe within a group
 *	U = stripe_unit * D
 *
 * T - The number of bytes striped within a group of component objects
 *     (before advancing to the next group)
 *	T = U * group_depth
 *
 * S - The number of bytes striped across all component objects
 *     before the pattern repeats
 *	S = T * group_count
 *
 * M - The "major" (i.e., across all components) cycle number
 *	M = L / S
 *
 * G - Counts the groups from the beginning of the major cycle
 *	G = (L - (M * S)) / T	[or (L % S) / T]
 *
 * H - The byte offset within the group
 *	H = (L - (M * S)) % T	[or (L % S) % T]
 *
 * N - The "minor" (i.e., across the group) stripe number
 *	N = H / U
 *
 * C - The component index corresponding to L
 *
 *	C = (H - (N * U)) / stripe_unit + G * D
 *	[or (L % U) / stripe_unit + G * D]
 *
 * O - The component offset corresponding to L
 *	O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit
 *
 * LCMdP - Parity cycle: Lowest Common Multiple of group_width, parity
 *	   divided by parity
 *	LCMdP = lcm(group_width, parity) / parity
 *
 * R - The parity Rotation stripe
 *     (Note: the parity cycle always starts at a group's boundary)
 *	R = N % LCMdP
 *
 * I - The first parity device index
 *	I = (group_width + group_width - R*parity - parity) % group_width
 *
 * Craid - The component index Rotated
 *	Craid = (group_width + C - R*parity) % group_width
 *	(We add the group_width to avoid negative numbers in the modulo math)
 */
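/* A worked example with hypothetical layout numbers (an illustration, not
 * values taken from this file or the spec):
 *	stripe_unit = 0x1000, group_width = 4, parity = 1, mirrors_p1 = 1,
 *	group_depth = 0x10, group_count = 1
 * so D = 3, U = 0x3000, T = S = 0x30000.
 *
 * For L = 0x8800:
 *	M = 0, G = 0, H = 0x8800
 *	N = H / U = 2,	unit_off = L % stripe_unit = 0x800
 *	C = (H - N * U) / stripe_unit = 2
 *	obj_offset = 0x800 + 2 * 0x1000 = 0x2800
 *	LCMdP = lcm(4, 1) / 1 = 4,  R = N % LCMdP = 2,  RxP = R * parity = 2
 *	par_dev = (4 + 4 - 1 - 2) % 4 = 1,  dev = (4 + 2 - 2) % 4 = 0
 *
 * That is, the byte at file offset 0x8800 maps to component 0 at object
 * offset 0x2800, and the parity unit of that stripe sits on component 1.
 */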
void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
			  u64 length, struct ore_striping_info *si)
{
	u32	stripe_unit = layout->stripe_unit;
	u32	group_width = layout->group_width;
	u64	group_depth = layout->group_depth;
	u32	parity = layout->parity;

	u32	D = group_width - parity;
	u32	U = D * stripe_unit;
	u64	T = U * group_depth;
	u64	S = T * layout->group_count;
	u64	M = div64_u64(file_offset, S);

	/*
	G = (L - (M * S)) / T
	H = (L - (M * S)) % T
	*/
	u64	LmodS = file_offset - M * S;
	u32	G = div64_u64(LmodS, T);
	u64	H = LmodS - G * T;

	u32	N = div_u64(H, U);
	u32	Nlast;

	/* "H - (N * U)" is just "H % U" so it's bound to u32 */
	u32	C = (u32)(H - (N * U)) / stripe_unit + G * group_width;

	div_u64_rem(file_offset, stripe_unit, &si->unit_off);

	si->obj_offset = si->unit_off + (N * stripe_unit) +
				 (M * group_depth * stripe_unit);

	if (parity) {
		u32 LCMdP = lcm(group_width, parity) / parity;
		/* R = N % LCMdP; */
		u32 RxP = (N % LCMdP) * parity;
		u32 first_dev = C - C % group_width;

		si->par_dev = (group_width + group_width - parity - RxP) %
			      group_width + first_dev;
		si->dev = (group_width + C - RxP) % group_width + first_dev;
		si->bytes_in_stripe = U;
		si->first_stripe_start = M * S + G * T + N * U;
	} else {
		/* Make the math correct, see _prepare_one_group */
		si->par_dev = group_width;
		si->dev = C;
	}

	si->dev *= layout->mirrors_p1;
	si->par_dev *= layout->mirrors_p1;
	si->offset = file_offset;
	si->length = T - H;
	if (si->length > length)
		si->length = length;

	Nlast = div_u64(H + si->length + U - 1, U);
	si->maxdevUnits = Nlast - N;

	si->M = M;
}
EXPORT_SYMBOL(ore_calc_stripe_info);

int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
			 unsigned pgbase, struct page **pages,
			 struct ore_per_dev_state *per_dev, int cur_len)
{
	unsigned pg = *cur_pg;
	struct request_queue *q =
			osd_request_queue(_ios_od(ios, per_dev->dev));
	unsigned len = cur_len;
	int ret;

	if (per_dev->bio == NULL) {
		unsigned bio_size;

		if (!ios->reading) {
			bio_size = ios->si.maxdevUnits;
		} else {
			bio_size = (ios->si.maxdevUnits + 1) *
			     (ios->layout->group_width - ios->layout->parity) /
			     ios->layout->group_width;
		}
		bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);

		per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
		if (unlikely(!per_dev->bio)) {
			ORE_DBGMSG("Failed to allocate BIO size=%u\n",
				   bio_size);
			ret = -ENOMEM;
			goto out;
		}
	}

	while (cur_len > 0) {
		unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
		unsigned added_len;

		cur_len -= pglen;

		added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
					    pglen, pgbase);
		if (unlikely(pglen != added_len)) {
			/* If bi_vcnt == bi_max then this is a SW BUG */
			ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
				   "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
				   per_dev->bio->bi_vcnt,
				   per_dev->bio->bi_max_vecs,
				   BIO_MAX_PAGES_KMALLOC, cur_len);
			ret = -ENOMEM;
			goto out;
		}
		_add_stripe_page(ios->sp2d, &ios->si, pages[pg]);

		pgbase = 0;
		++pg;
	}
	BUG_ON(cur_len);

	per_dev->length += len;
	*cur_pg = pg;
	ret = 0;
out:	/* we fail the complete unit on an error, e.g. don't advance
	 * per_dev->length and cur_pg. This means that we might have a bigger
	 * bio than the CDB requested length (per_dev->length). That's fine;
	 * only the opposite is fatal.
	 */
	return ret;
}

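/* Carve ios->pages into per-device requests for one group: walk the devices
 * of the group, handing each one at most stripe_unit worth of pages via
 * _ore_add_stripe_unit(), and, when parity is in use, emit a parity unit at
 * the end of every full stripe while rotating si->par_dev backwards for the
 * next stripe.
 */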
static int _prepare_for_striping(struct ore_io_state *ios)
{
	struct ore_striping_info *si = &ios->si;
	unsigned stripe_unit = ios->layout->stripe_unit;
	unsigned mirrors_p1 = ios->layout->mirrors_p1;
	unsigned group_width = ios->layout->group_width;
	unsigned devs_in_group = group_width * mirrors_p1;
	unsigned dev = si->dev;
	unsigned first_dev = dev - (dev % devs_in_group);
	unsigned dev_order;
	unsigned cur_pg = ios->pages_consumed;
	u64 length = ios->length;
	int ret = 0;

	if (!ios->pages) {
		ios->numdevs = ios->layout->mirrors_p1;
		return 0;
	}

	BUG_ON(length > si->length);

	dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
	si->cur_comp = dev_order;
	si->cur_pg = si->unit_off / PAGE_SIZE;

	while (length) {
		unsigned comp = dev - first_dev;
		struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
		unsigned cur_len, page_off = 0;

		if (!per_dev->length) {
			per_dev->dev = dev;
			if (dev == si->dev) {
				WARN_ON(dev == si->par_dev);
				per_dev->offset = si->obj_offset;
				cur_len = stripe_unit - si->unit_off;
				page_off = si->unit_off & ~PAGE_MASK;
				BUG_ON(page_off && (page_off != ios->pgbase));
			} else {
				if (si->cur_comp > dev_order)
					per_dev->offset =
						si->obj_offset - si->unit_off;
				else /* si->cur_comp < dev_order */
					per_dev->offset =
						si->obj_offset + stripe_unit -
								si->unit_off;
				cur_len = stripe_unit;
			}
		} else {
			cur_len = stripe_unit;
		}
		if (cur_len >= length)
			cur_len = length;

		ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages,
					   per_dev, cur_len);
		if (unlikely(ret))
			goto out;

		dev += mirrors_p1;
		dev = (dev % devs_in_group) + first_dev;

		length -= cur_len;

		si->cur_comp = (si->cur_comp + 1) % group_width;
		if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
			if (!length && ios->sp2d) {
				/* If we are writing and this is the very last
				 * stripe, then operate on the parity dev.
				 */
				dev = si->par_dev;
			}
			if (ios->sp2d)
				/* In writes cur_len just means if it's the
				 * last one. See _ore_add_parity_unit.
				 */
				cur_len = length;
			per_dev = &ios->per_dev[dev - first_dev];
			if (!per_dev->length) {
				/* Only/always the parity unit of the first
				 * stripe will be empty. So this is a chance to
				 * initialize the per_dev info.
				 */
				per_dev->dev = dev;
				per_dev->offset = si->obj_offset - si->unit_off;
			}

			ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
			if (unlikely(ret))
				goto out;

			/* Rotate next par_dev backwards with wrapping */
			si->par_dev = (devs_in_group + si->par_dev -
				       ios->layout->parity * mirrors_p1) %
				      devs_in_group + first_dev;
			/* Next stripe, start fresh */
			si->cur_comp = 0;
			si->cur_pg = 0;
		}
	}
out:
	ios->numdevs = devs_in_group;
	ios->pages_consumed = cur_pg;
	return ret;
}

int ore_create(struct ore_io_state *ios)
{
	int i, ret;

	for (i = 0; i < ios->oc->numdevs; i++) {
		struct osd_request *or;

		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
		if (unlikely(!or)) {
			ORE_ERR("%s: osd_start_request failed\n", __func__);
			ret = -ENOMEM;
			goto out;
		}
		ios->per_dev[i].or = or;
		ios->numdevs++;

		osd_req_create_object(or, _ios_obj(ios, i));
	}
	ret = ore_io_execute(ios);

out:
	return ret;
}
EXPORT_SYMBOL(ore_create);

int ore_remove(struct ore_io_state *ios)
{
	int i, ret;

	for (i = 0; i < ios->oc->numdevs; i++) {
		struct osd_request *or;

		or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
		if (unlikely(!or)) {
			ORE_ERR("%s: osd_start_request failed\n", __func__);
			ret = -ENOMEM;
			goto out;
		}
		ios->per_dev[i].or = or;
		ios->numdevs++;

		osd_req_remove_object(or, _ios_obj(ios, i));
	}
	ret = ore_io_execute(ios);

out:
	return ret;
}
EXPORT_SYMBOL(ore_remove);

static int _write_mirror(struct ore_io_state *ios, int cur_comp)
{
	struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
	unsigned dev = ios->per_dev[cur_comp].dev;
	unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
	int ret = 0;

	if (ios->pages && !master_dev->length)
		return 0; /* Just an empty slot */

	for (; cur_comp < last_comp; ++cur_comp, ++dev) {
		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
		struct osd_request *or;

		or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
		if (unlikely(!or)) {
			ORE_ERR("%s: osd_start_request failed\n", __func__);
			ret = -ENOMEM;
			goto out;
		}
		per_dev->or = or;

		if (ios->pages) {
			struct bio *bio;

			if (per_dev != master_dev) {
				bio = bio_clone_kmalloc(master_dev->bio,
							GFP_KERNEL);
				if (unlikely(!bio)) {
					ORE_DBGMSG(
					      "Failed to allocate BIO size=%u\n",
					      master_dev->bio->bi_max_vecs);
					ret = -ENOMEM;
					goto out;
				}

				bio->bi_bdev = NULL;
				bio->bi_next = NULL;
				per_dev->offset = master_dev->offset;
				per_dev->length = master_dev->length;
				per_dev->bio = bio;
				per_dev->dev = dev;
			} else {
				bio = master_dev->bio;
				/* FIXME: bio_set_dir() */
				bio->bi_rw |= REQ_WRITE;
			}

			osd_req_write(or, _ios_obj(ios, cur_comp),
				      per_dev->offset, bio, per_dev->length);
			ORE_DBGMSG("write(0x%llx) offset=0x%llx "
				   "length=0x%llx dev=%d\n",
				   _LLU(_ios_obj(ios, cur_comp)->id),
				   _LLU(per_dev->offset),
				   _LLU(per_dev->length), dev);
		} else if (ios->kern_buff) {
			per_dev->offset = ios->si.obj_offset;
			per_dev->dev = ios->si.dev + dev;

			/* no cross device without page array */
			BUG_ON((ios->layout->group_width > 1) &&
			       (ios->si.unit_off + ios->length >
				ios->layout->stripe_unit));

			ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
						 per_dev->offset,
						 ios->kern_buff, ios->length);
			if (unlikely(ret))
				goto out;
			ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
				    "length=0x%llx dev=%d\n",
				    _LLU(_ios_obj(ios, cur_comp)->id),
				    _LLU(per_dev->offset),
				    _LLU(ios->length), per_dev->dev);
		} else {
			osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
			ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
				    _LLU(_ios_obj(ios, cur_comp)->id),
				    ios->out_attr_len, dev);
		}

		if (ios->out_attr)
			osd_req_add_set_attr_list(or, ios->out_attr,
						  ios->out_attr_len);

		if (ios->in_attr)
			osd_req_add_get_attr_list(or, ios->in_attr,
						  ios->in_attr_len);
	}

out:
	return ret;
}

int ore_write(struct ore_io_state *ios)
{
	int i;
	int ret;

	if (unlikely(ios->sp2d && !ios->r4w)) {
		/* A library is attempting a RAID-write without providing
		 * a pages lock interface.
		 */
		WARN_ON_ONCE(1);
		return -ENOTSUPP;
	}

	ret = _prepare_for_striping(ios);
	if (unlikely(ret))
		return ret;

	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
		ret = _write_mirror(ios, i);
		if (unlikely(ret))
			return ret;
	}

	ret = ore_io_execute(ios);
	return ret;
}
EXPORT_SYMBOL(ore_write);

int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp)
{
	struct osd_request *or;
	struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
	struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
	unsigned first_dev = (unsigned)obj->id;

	if (ios->pages && !per_dev->length)
		return 0; /* Just an empty slot */

	first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
	or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
	if (unlikely(!or)) {
		ORE_ERR("%s: osd_start_request failed\n", __func__);
		return -ENOMEM;
	}
	per_dev->or = or;

	if (ios->pages) {
		if (per_dev->cur_sg) {
			/* finalize the last sg_entry */
			_ore_add_sg_seg(per_dev, 0, false);
			if (unlikely(!per_dev->cur_sg))
				return 0; /* Skip parity only device */

			osd_req_read_sg(or, obj, per_dev->bio,
					per_dev->sglist, per_dev->cur_sg);
		} else {
			/* The no raid case */
			osd_req_read(or, obj, per_dev->offset,
				     per_dev->bio, per_dev->length);
		}

		ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
			   " dev=%d sg_len=%d\n", _LLU(obj->id),
			   _LLU(per_dev->offset), _LLU(per_dev->length),
			   first_dev, per_dev->cur_sg);
	} else {
		BUG_ON(ios->kern_buff);

		osd_req_get_attributes(or, obj);
		ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
			    _LLU(obj->id),
			    ios->in_attr_len, first_dev);
	}
	if (ios->out_attr)
		osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);

	if (ios->in_attr)
		osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len);

	return 0;
}

int ore_read(struct ore_io_state *ios)
{
	int i;
	int ret;

	ret = _prepare_for_striping(ios);
	if (unlikely(ret))
		return ret;

	for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
		ret = _ore_read_mirror(ios, i);
		if (unlikely(ret))
			return ret;
	}

	ret = ore_io_execute(ios);
	return ret;
}
EXPORT_SYMBOL(ore_read);

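/* A minimal sketch of how a caller typically extracts an attribute returned
 * by a read-attributes io (hypothetical usage, the details are assumptions
 * and not code from this file):
 *
 *	struct osd_attr attr = g_attr_logical_length;
 *
 *	ios->in_attr = &attr;
 *	ios->in_attr_len = 1;
 *	ret = ore_read(ios);
 *	if (likely(!ret))
 *		ret = extract_attr_from_ios(ios, &attr);
 *
 * On success attr.val_ptr and attr.len describe the value returned by the
 * first device.
 */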
int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
{
	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
	void *iter = NULL;
	int nelem;

	do {
		nelem = 1;
		osd_req_decode_get_attr_list(ios->per_dev[0].or,
					     &cur_attr, &nelem, &iter);
		if ((cur_attr.attr_page == attr->attr_page) &&
		    (cur_attr.attr_id == attr->attr_id)) {
			attr->len = cur_attr.len;
			attr->val_ptr = cur_attr.val_ptr;
			return 0;
		}
	} while (iter);

	return -EIO;
}
EXPORT_SYMBOL(extract_attr_from_ios);

static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
			     struct osd_attr *attr)
{
	int last_comp = cur_comp + ios->layout->mirrors_p1;

	for (; cur_comp < last_comp; ++cur_comp) {
		struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
		struct osd_request *or;

		or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
		if (unlikely(!or)) {
			ORE_ERR("%s: osd_start_request failed\n", __func__);
			return -ENOMEM;
		}
		per_dev->or = or;

		osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
		osd_req_add_set_attr_list(or, attr, 1);
	}

	return 0;
}

struct _trunc_info {
	struct ore_striping_info si;
	u64 prev_group_obj_off;
	u64 next_group_obj_off;

	unsigned first_group_dev;
	unsigned nex_group_dev;
};

static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
			     struct _trunc_info *ti)
{
	unsigned stripe_unit = layout->stripe_unit;

	ore_calc_stripe_info(layout, file_offset, 0, &ti->si);

	ti->prev_group_obj_off = ti->si.M * stripe_unit;
	ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;

	ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
	ti->nex_group_dev = ti->first_group_dev + layout->group_width;
}

int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
		 u64 size)
{
	struct ore_io_state *ios;
	struct exofs_trunc_attr {
		struct osd_attr attr;
		__be64 newsize;
	} *size_attrs;
	struct _trunc_info ti;
	int i, ret;

	ret = ore_get_io_state(layout, oc, &ios);
	if (unlikely(ret))
		return ret;

	_calc_trunk_info(ios->layout, size, &ti);

	size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs),
			     GFP_KERNEL);
	if (unlikely(!size_attrs)) {
		ret = -ENOMEM;
		goto out;
	}

	ios->numdevs = ios->oc->numdevs;

	for (i = 0; i < ios->numdevs; ++i) {
		struct exofs_trunc_attr *size_attr = &size_attrs[i];
		u64 obj_size;

		if (i < ti.first_group_dev)
			obj_size = ti.prev_group_obj_off;
		else if (i >= ti.nex_group_dev)
			obj_size = ti.next_group_obj_off;
		else if (i < ti.si.dev) /* dev within this group */
			obj_size = ti.si.obj_offset +
				ios->layout->stripe_unit - ti.si.unit_off;
		else if (i == ti.si.dev)
			obj_size = ti.si.obj_offset;
		else /* i > ti.dev */
			obj_size = ti.si.obj_offset - ti.si.unit_off;

		size_attr->newsize = cpu_to_be64(obj_size);
		size_attr->attr = g_attr_logical_length;
		size_attr->attr.val_ptr = &size_attr->newsize;

		ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
			    _LLU(oc->comps->obj.id), _LLU(obj_size), i);
		ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
					&size_attr->attr);
		if (unlikely(ret))
			goto out;
	}
	ret = ore_io_execute(ios);

out:
	kfree(size_attrs);
	ore_put_io_state(ios);
	return ret;
}
EXPORT_SYMBOL(ore_truncate);

const struct osd_attr g_attr_logical_length = ATTR_DEF(
	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
EXPORT_SYMBOL(g_attr_logical_length);