ceph: make page alignment explicit in osd interface

We used to infer the alignment of IO data within a page from the file
offset, which assumed the in-page data offset and the file offset always
matched. This broke with direct IO that was not aligned to pages (e.g.,
512-byte aligned IO). We were also trusting the alignment specified in the
OSD reply, which could have been adjusted by the server.

Explicitly specify the page alignment when setting up OSD IO requests.

Signed-off-by: Sage Weil <sage@newdream.net>

Sage Weil b7495fc2 e98b6fed

+44 -19
+3 -3
fs/ceph/addr.c
··· 204 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 205 205 page->index << PAGE_CACHE_SHIFT, &len, 206 206 ci->i_truncate_seq, ci->i_truncate_size, 207 - &page, 1); 207 + &page, 1, 0); 208 208 if (err == -ENOENT) 209 209 err = 0; 210 210 if (err < 0) { ··· 287 287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 288 288 offset, &len, 289 289 ci->i_truncate_seq, ci->i_truncate_size, 290 - pages, nr_pages); 290 + pages, nr_pages, 0); 291 291 if (rc == -ENOENT) 292 292 rc = 0; 293 293 if (rc < 0) ··· 782 782 snapc, do_sync, 783 783 ci->i_truncate_seq, 784 784 ci->i_truncate_size, 785 - &inode->i_mtime, true, 1); 785 + &inode->i_mtime, true, 1, 0); 786 786 max_pages = req->r_num_pages; 787 787 788 788 alloc_page_vec(fsc, req);
+21 -5
fs/ceph/file.c
··· 282 282 static int striped_read(struct inode *inode, 283 283 u64 off, u64 len, 284 284 struct page **pages, int num_pages, 285 - int *checkeof) 285 + int *checkeof, bool align_to_pages) 286 286 { 287 287 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 288 288 struct ceph_inode_info *ci = ceph_inode(inode); 289 289 u64 pos, this_len; 290 + int io_align, page_align; 290 291 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 291 292 int left, pages_left; 292 293 int read; ··· 303 302 page_pos = pages; 304 303 pages_left = num_pages; 305 304 read = 0; 305 + io_align = off & ~PAGE_MASK; 306 306 307 307 more: 308 + if (align_to_pages) 309 + page_align = (pos - io_align) & ~PAGE_MASK; 310 + else 311 + page_align = pos & ~PAGE_MASK; 308 312 this_len = left; 309 313 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 310 314 &ci->i_layout, pos, &this_len, 311 315 ci->i_truncate_seq, 312 316 ci->i_truncate_size, 313 - page_pos, pages_left); 317 + page_pos, pages_left, page_align); 314 318 hit_stripe = this_len < left; 315 319 was_short = ret >= 0 && ret < this_len; 316 320 if (ret == -ENOENT) ··· 399 393 if (ret < 0) 400 394 goto done; 401 395 402 - ret = striped_read(inode, off, len, pages, num_pages, checkeof); 396 + ret = striped_read(inode, off, len, pages, num_pages, checkeof, 397 + file->f_flags & O_DIRECT); 403 398 404 399 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 405 400 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); ··· 455 448 int flags; 456 449 int do_sync = 0; 457 450 int check_caps = 0; 451 + int page_align, io_align; 458 452 int ret; 459 453 struct timespec mtime = CURRENT_TIME; 460 454 ··· 469 461 pos = i_size_read(inode); 470 462 else 471 463 pos = *offset; 464 + 465 + io_align = pos & ~PAGE_MASK; 472 466 473 467 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 474 468 if (ret < 0) ··· 496 486 */ 497 487 more: 498 488 len = left; 489 + if (file->f_flags & O_DIRECT) 490 + 
/* write from beginning of first page, regardless of 491 + io alignment */ 492 + page_align = (pos - io_align) & ~PAGE_MASK; 493 + else 494 + page_align = pos & ~PAGE_MASK; 499 495 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 500 496 ceph_vino(inode), pos, &len, 501 497 CEPH_OSD_OP_WRITE, flags, 502 498 ci->i_snap_realm->cached_context, 503 499 do_sync, 504 500 ci->i_truncate_seq, ci->i_truncate_size, 505 - &mtime, false, 2); 501 + &mtime, false, 2, page_align); 506 502 if (!req) 507 503 return -ENOMEM; 508 504 509 505 num_pages = calc_pages_for(pos, len); 510 506 511 507 if (file->f_flags & O_DIRECT) { 512 - pages = ceph_get_direct_page_vector(data, num_pages, pos, len); 508 + pages = ceph_get_direct_page_vector(data, num_pages); 513 509 if (IS_ERR(pages)) { 514 510 ret = PTR_ERR(pages); 515 511 goto out;
+1 -1
fs/ceph/inode.c
··· 1752 1752 return 0; 1753 1753 } 1754 1754 1755 - dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); 1755 + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); 1756 1756 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1757 1757 return 0; 1758 1758
+5 -2
include/linux/ceph/osd_client.h
··· 79 79 struct ceph_file_layout r_file_layout; 80 80 struct ceph_snap_context *r_snapc; /* snap context for writes */ 81 81 unsigned r_num_pages; /* size of page array (follows) */ 82 + unsigned r_page_alignment; /* io offset in first page */ 82 83 struct page **r_pages; /* pages for data payload */ 83 84 int r_pages_from_pool; 84 85 int r_own_pages; /* if true, i own page list */ ··· 195 194 int do_sync, u32 truncate_seq, 196 195 u64 truncate_size, 197 196 struct timespec *mtime, 198 - bool use_mempool, int num_reply); 197 + bool use_mempool, int num_reply, 198 + int page_align); 199 199 200 200 static inline void ceph_osdc_get_request(struct ceph_osd_request *req) 201 201 { ··· 220 218 struct ceph_file_layout *layout, 221 219 u64 off, u64 *plen, 222 220 u32 truncate_seq, u64 truncate_size, 223 - struct page **pages, int nr_pages); 221 + struct page **pages, int nr_pages, 222 + int page_align); 224 223 225 224 extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, 226 225 struct ceph_vino vino,
+14 -8
net/ceph/osd_client.c
··· 71 71 op->extent.length = objlen; 72 72 } 73 73 req->r_num_pages = calc_pages_for(off, *plen); 74 + req->r_page_alignment = off & ~PAGE_MASK; 74 75 if (op->op == CEPH_OSD_OP_WRITE) 75 76 op->payload_len = *plen; 76 77 ··· 420 419 u32 truncate_seq, 421 420 u64 truncate_size, 422 421 struct timespec *mtime, 423 - bool use_mempool, int num_reply) 422 + bool use_mempool, int num_reply, 423 + int page_align) 424 424 { 425 425 struct ceph_osd_req_op ops[3]; 426 426 struct ceph_osd_request *req; ··· 448 446 /* calculate max write size */ 449 447 calc_layout(osdc, vino, layout, off, plen, req, ops); 450 448 req->r_file_layout = *layout; /* keep a copy */ 449 + 450 + /* in case it differs from natural alignment that calc_layout 451 + filled in for us */ 452 + req->r_page_alignment = page_align; 451 453 452 454 ceph_osdc_build_request(req, off, plen, ops, 453 455 snapc, ··· 1495 1489 struct ceph_vino vino, struct ceph_file_layout *layout, 1496 1490 u64 off, u64 *plen, 1497 1491 u32 truncate_seq, u64 truncate_size, 1498 - struct page **pages, int num_pages) 1492 + struct page **pages, int num_pages, int page_align) 1499 1493 { 1500 1494 struct ceph_osd_request *req; 1501 1495 int rc = 0; ··· 1505 1499 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1506 1500 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1507 1501 NULL, 0, truncate_seq, truncate_size, NULL, 1508 - false, 1); 1502 + false, 1, page_align); 1509 1503 if (!req) 1510 1504 return -ENOMEM; 1511 1505 1512 1506 /* it may be a short read due to an object boundary */ 1513 1507 req->r_pages = pages; 1514 1508 1515 - dout("readpages final extent is %llu~%llu (%d pages)\n", 1516 - off, *plen, req->r_num_pages); 1509 + dout("readpages final extent is %llu~%llu (%d pages align %d)\n", 1510 + off, *plen, req->r_num_pages, page_align); 1517 1511 1518 1512 rc = ceph_osdc_start_request(osdc, req, false); 1519 1513 if (!rc) ··· 1539 1533 { 1540 1534 struct ceph_osd_request *req; 1541 1535 int rc = 0; 1536 + int page_align = 
off & ~PAGE_MASK; 1542 1537 1543 1538 BUG_ON(vino.snap != CEPH_NOSNAP); 1544 1539 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, ··· 1548 1541 CEPH_OSD_FLAG_WRITE, 1549 1542 snapc, do_sync, 1550 1543 truncate_seq, truncate_size, mtime, 1551 - nofail, 1); 1544 + nofail, 1, page_align); 1552 1545 if (!req) 1553 1546 return -ENOMEM; 1554 1547 ··· 1645 1638 m = ceph_msg_get(req->r_reply); 1646 1639 1647 1640 if (data_len > 0) { 1648 - unsigned data_off = le16_to_cpu(hdr->data_off); 1649 - int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); 1641 + int want = calc_pages_for(req->r_page_alignment, data_len); 1650 1642 1651 1643 if (unlikely(req->r_num_pages < want)) { 1652 1644 pr_warning("tid %lld reply %d > expected %d pages\n",