Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ore: Must support non-PAGE-aligned IO

NFS might send us offsets that are not PAGE aligned. So
we must read in the remainder of the first/last pages, in case
we need it for parity calculations.

We only add sg segments to read the partial page. But
we don't mark it as read=true because it is a lock-for-write
page.

TODO: In some cases (IO spans a single unit) we can just
adjust the raid_unit offset/length, but this is left for
later kernels.

[Bug in 3.2.0 Kernel]
CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>

+60 -12
+60 -12
fs/exofs/ore_raid.c
··· 328 328 /* @si contains info of the to-be-inserted page. Update of @si should be 329 329 * maintained by caller. Specificaly si->dev, si->obj_offset, ... 330 330 */ 331 - static int _add_to_read_4_write(struct ore_io_state *ios, 332 - struct ore_striping_info *si, struct page *page) 331 + static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, 332 + struct page *page, unsigned pg_len) 333 333 { 334 334 struct request_queue *q; 335 335 struct ore_per_dev_state *per_dev; ··· 366 366 _ore_add_sg_seg(per_dev, gap, true); 367 367 } 368 368 q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); 369 - added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); 370 - if (unlikely(added_len != PAGE_SIZE)) { 369 + added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, 370 + si->obj_offset % PAGE_SIZE); 371 + if (unlikely(added_len != pg_len)) { 371 372 ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", 372 373 per_dev->bio->bi_vcnt); 373 374 return -ENOMEM; 374 375 } 375 376 376 - per_dev->length += PAGE_SIZE; 377 + per_dev->length += pg_len; 377 378 return 0; 379 + } 380 + 381 + /* read the beginning of an unaligned first page */ 382 + static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) 383 + { 384 + struct ore_striping_info si; 385 + unsigned pg_len; 386 + 387 + ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); 388 + 389 + pg_len = si.obj_offset % PAGE_SIZE; 390 + si.obj_offset -= pg_len; 391 + 392 + ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", 393 + _LLU(si.obj_offset), pg_len, page->index, si.dev); 394 + 395 + return _add_to_r4w(ios, &si, page, pg_len); 396 + } 397 + 398 + /* read the end of an incomplete last page */ 399 + static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) 400 + { 401 + struct ore_striping_info si; 402 + struct page *page; 403 + unsigned pg_len, p, c; 404 + 405 + ore_calc_stripe_info(ios->layout, *offset, 0, &si); 406 + 407 + p = 
si.unit_off / PAGE_SIZE; 408 + c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1, 409 + ios->layout->mirrors_p1, si.par_dev, si.dev); 410 + page = ios->sp2d->_1p_stripes[p].pages[c]; 411 + 412 + pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); 413 + *offset += pg_len; 414 + 415 + ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", 416 + p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); 417 + 418 + BUG_ON(!page); 419 + 420 + return _add_to_r4w(ios, &si, page, pg_len); 378 421 } 379 422 380 423 static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) ··· 487 444 struct page **pp = &_1ps->pages[c]; 488 445 bool uptodate; 489 446 490 - if (*pp) 447 + if (*pp) { 448 + if (ios->offset % PAGE_SIZE) 449 + /* Read the remainder of the page */ 450 + _add_to_r4w_first_page(ios, *pp); 491 451 /* to-be-written pages start here */ 492 452 goto read_last_stripe; 453 + } 493 454 494 455 *pp = ios->r4w->get_page(ios->private, offset, 495 456 &uptodate); ··· 501 454 return -ENOMEM; 502 455 503 456 if (!uptodate) 504 - _add_to_read_4_write(ios, &read_si, *pp); 457 + _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); 505 458 506 459 /* Mark read-pages to be cache_released */ 507 460 _1ps->page_is_read[c] = true; ··· 512 465 } 513 466 514 467 read_last_stripe: 515 - offset = ios->offset + (ios->length + PAGE_SIZE - 1) / 516 - PAGE_SIZE * PAGE_SIZE; 468 + offset = ios->offset + ios->length; 469 + if (offset % PAGE_SIZE) 470 + _add_to_r4w_last_page(ios, &offset); 471 + /* offset will be aligned to next page */ 472 + 517 473 last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) 518 474 * bytes_in_stripe; 519 475 if (offset == last_stripe_end) /* Optimize for the aligned case */ ··· 553 503 /* Mark read-pages to be cache_released */ 554 504 _1ps->page_is_read[c] = true; 555 505 if (!uptodate) 556 - _add_to_read_4_write(ios, &read_si, page); 506 + _add_to_r4w(ios, &read_si, page, PAGE_SIZE); 557 507 } 558 508 559 509 
offset += PAGE_SIZE; ··· 665 615 layout->parity, &ios->sp2d)) { 666 616 return -ENOMEM; 667 617 } 668 - 669 - BUG_ON(ios->offset % PAGE_SIZE); 670 618 671 619 /* Round io down to last full strip */ 672 620 first_stripe = div_u64(ios->offset, stripe_size);