ceph: make page alignment explicit in osd interface

We used to infer the alignment of IOs within a page from the file offset,
which assumed that the data's offset within its first page matched the file
offset's. This broke with direct IO that was not aligned to pages (e.g.,
512-byte aligned IO). We were also trusting the alignment specified in the
OSD reply, which could have been adjusted by the server.

Explicitly specify the page alignment when setting up OSD IO requests.

Signed-off-by: Sage Weil <sage@newdream.net>

Sage Weil b7495fc2 e98b6fed

+44 -19
+3 -3
fs/ceph/addr.c
··· 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 205 page->index << PAGE_CACHE_SHIFT, &len, 206 ci->i_truncate_seq, ci->i_truncate_size, 207 - &page, 1); 208 if (err == -ENOENT) 209 err = 0; 210 if (err < 0) { ··· 287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 288 offset, &len, 289 ci->i_truncate_seq, ci->i_truncate_size, 290 - pages, nr_pages); 291 if (rc == -ENOENT) 292 rc = 0; 293 if (rc < 0) ··· 782 snapc, do_sync, 783 ci->i_truncate_seq, 784 ci->i_truncate_size, 785 - &inode->i_mtime, true, 1); 786 max_pages = req->r_num_pages; 787 788 alloc_page_vec(fsc, req);
··· 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 205 page->index << PAGE_CACHE_SHIFT, &len, 206 ci->i_truncate_seq, ci->i_truncate_size, 207 + &page, 1, 0); 208 if (err == -ENOENT) 209 err = 0; 210 if (err < 0) { ··· 287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 288 offset, &len, 289 ci->i_truncate_seq, ci->i_truncate_size, 290 + pages, nr_pages, 0); 291 if (rc == -ENOENT) 292 rc = 0; 293 if (rc < 0) ··· 782 snapc, do_sync, 783 ci->i_truncate_seq, 784 ci->i_truncate_size, 785 + &inode->i_mtime, true, 1, 0); 786 max_pages = req->r_num_pages; 787 788 alloc_page_vec(fsc, req);
+21 -5
fs/ceph/file.c
··· 282 static int striped_read(struct inode *inode, 283 u64 off, u64 len, 284 struct page **pages, int num_pages, 285 - int *checkeof) 286 { 287 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 288 struct ceph_inode_info *ci = ceph_inode(inode); 289 u64 pos, this_len; 290 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 291 int left, pages_left; 292 int read; ··· 303 page_pos = pages; 304 pages_left = num_pages; 305 read = 0; 306 307 more: 308 this_len = left; 309 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 310 &ci->i_layout, pos, &this_len, 311 ci->i_truncate_seq, 312 ci->i_truncate_size, 313 - page_pos, pages_left); 314 hit_stripe = this_len < left; 315 was_short = ret >= 0 && ret < this_len; 316 if (ret == -ENOENT) ··· 399 if (ret < 0) 400 goto done; 401 402 - ret = striped_read(inode, off, len, pages, num_pages, checkeof); 403 404 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 405 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); ··· 455 int flags; 456 int do_sync = 0; 457 int check_caps = 0; 458 int ret; 459 struct timespec mtime = CURRENT_TIME; 460 ··· 469 pos = i_size_read(inode); 470 else 471 pos = *offset; 472 473 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 474 if (ret < 0) ··· 496 */ 497 more: 498 len = left; 499 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 500 ceph_vino(inode), pos, &len, 501 CEPH_OSD_OP_WRITE, flags, 502 ci->i_snap_realm->cached_context, 503 do_sync, 504 ci->i_truncate_seq, ci->i_truncate_size, 505 - &mtime, false, 2); 506 if (!req) 507 return -ENOMEM; 508 509 num_pages = calc_pages_for(pos, len); 510 511 if (file->f_flags & O_DIRECT) { 512 - pages = ceph_get_direct_page_vector(data, num_pages, pos, len); 513 if (IS_ERR(pages)) { 514 ret = PTR_ERR(pages); 515 goto out;
··· 282 static int striped_read(struct inode *inode, 283 u64 off, u64 len, 284 struct page **pages, int num_pages, 285 + int *checkeof, bool align_to_pages) 286 { 287 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 288 struct ceph_inode_info *ci = ceph_inode(inode); 289 u64 pos, this_len; 290 + int io_align, page_align; 291 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 292 int left, pages_left; 293 int read; ··· 302 page_pos = pages; 303 pages_left = num_pages; 304 read = 0; 305 + io_align = off & ~PAGE_MASK; 306 307 more: 308 + if (align_to_pages) 309 + page_align = (pos - io_align) & ~PAGE_MASK; 310 + else 311 + page_align = pos & ~PAGE_MASK; 312 this_len = left; 313 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 314 &ci->i_layout, pos, &this_len, 315 ci->i_truncate_seq, 316 ci->i_truncate_size, 317 + page_pos, pages_left, page_align); 318 hit_stripe = this_len < left; 319 was_short = ret >= 0 && ret < this_len; 320 if (ret == -ENOENT) ··· 393 if (ret < 0) 394 goto done; 395 396 + ret = striped_read(inode, off, len, pages, num_pages, checkeof, 397 + file->f_flags & O_DIRECT); 398 399 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 400 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); ··· 448 int flags; 449 int do_sync = 0; 450 int check_caps = 0; 451 + int page_align, io_align; 452 int ret; 453 struct timespec mtime = CURRENT_TIME; 454 ··· 461 pos = i_size_read(inode); 462 else 463 pos = *offset; 464 + 465 + io_align = pos & ~PAGE_MASK; 466 467 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 468 if (ret < 0) ··· 486 */ 487 more: 488 len = left; 489 + if (file->f_flags & O_DIRECT) 490 + /* write from beginning of first page, regardless of 491 + io alignment */ 492 + page_align = (pos - io_align) & ~PAGE_MASK; 493 + else 494 + page_align = pos & ~PAGE_MASK; 495 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 496 ceph_vino(inode), pos, &len, 497 CEPH_OSD_OP_WRITE, 
flags, 498 ci->i_snap_realm->cached_context, 499 do_sync, 500 ci->i_truncate_seq, ci->i_truncate_size, 501 + &mtime, false, 2, page_align); 502 if (!req) 503 return -ENOMEM; 504 505 num_pages = calc_pages_for(pos, len); 506 507 if (file->f_flags & O_DIRECT) { 508 + pages = ceph_get_direct_page_vector(data, num_pages); 509 if (IS_ERR(pages)) { 510 ret = PTR_ERR(pages); 511 goto out;
+1 -1
fs/ceph/inode.c
··· 1752 return 0; 1753 } 1754 1755 - dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); 1756 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1757 return 0; 1758
··· 1752 return 0; 1753 } 1754 1755 + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); 1756 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1757 return 0; 1758
+5 -2
include/linux/ceph/osd_client.h
··· 79 struct ceph_file_layout r_file_layout; 80 struct ceph_snap_context *r_snapc; /* snap context for writes */ 81 unsigned r_num_pages; /* size of page array (follows) */ 82 struct page **r_pages; /* pages for data payload */ 83 int r_pages_from_pool; 84 int r_own_pages; /* if true, i own page list */ ··· 195 int do_sync, u32 truncate_seq, 196 u64 truncate_size, 197 struct timespec *mtime, 198 - bool use_mempool, int num_reply); 199 200 static inline void ceph_osdc_get_request(struct ceph_osd_request *req) 201 { ··· 220 struct ceph_file_layout *layout, 221 u64 off, u64 *plen, 222 u32 truncate_seq, u64 truncate_size, 223 - struct page **pages, int nr_pages); 224 225 extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, 226 struct ceph_vino vino,
··· 79 struct ceph_file_layout r_file_layout; 80 struct ceph_snap_context *r_snapc; /* snap context for writes */ 81 unsigned r_num_pages; /* size of page array (follows) */ 82 + unsigned r_page_alignment; /* io offset in first page */ 83 struct page **r_pages; /* pages for data payload */ 84 int r_pages_from_pool; 85 int r_own_pages; /* if true, i own page list */ ··· 194 int do_sync, u32 truncate_seq, 195 u64 truncate_size, 196 struct timespec *mtime, 197 + bool use_mempool, int num_reply, 198 + int page_align); 199 200 static inline void ceph_osdc_get_request(struct ceph_osd_request *req) 201 { ··· 218 struct ceph_file_layout *layout, 219 u64 off, u64 *plen, 220 u32 truncate_seq, u64 truncate_size, 221 + struct page **pages, int nr_pages, 222 + int page_align); 223 224 extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, 225 struct ceph_vino vino,
+14 -8
net/ceph/osd_client.c
··· 71 op->extent.length = objlen; 72 } 73 req->r_num_pages = calc_pages_for(off, *plen); 74 if (op->op == CEPH_OSD_OP_WRITE) 75 op->payload_len = *plen; 76 ··· 420 u32 truncate_seq, 421 u64 truncate_size, 422 struct timespec *mtime, 423 - bool use_mempool, int num_reply) 424 { 425 struct ceph_osd_req_op ops[3]; 426 struct ceph_osd_request *req; ··· 448 /* calculate max write size */ 449 calc_layout(osdc, vino, layout, off, plen, req, ops); 450 req->r_file_layout = *layout; /* keep a copy */ 451 452 ceph_osdc_build_request(req, off, plen, ops, 453 snapc, ··· 1495 struct ceph_vino vino, struct ceph_file_layout *layout, 1496 u64 off, u64 *plen, 1497 u32 truncate_seq, u64 truncate_size, 1498 - struct page **pages, int num_pages) 1499 { 1500 struct ceph_osd_request *req; 1501 int rc = 0; ··· 1505 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1506 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1507 NULL, 0, truncate_seq, truncate_size, NULL, 1508 - false, 1); 1509 if (!req) 1510 return -ENOMEM; 1511 1512 /* it may be a short read due to an object boundary */ 1513 req->r_pages = pages; 1514 1515 - dout("readpages final extent is %llu~%llu (%d pages)\n", 1516 - off, *plen, req->r_num_pages); 1517 1518 rc = ceph_osdc_start_request(osdc, req, false); 1519 if (!rc) ··· 1539 { 1540 struct ceph_osd_request *req; 1541 int rc = 0; 1542 1543 BUG_ON(vino.snap != CEPH_NOSNAP); 1544 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, ··· 1548 CEPH_OSD_FLAG_WRITE, 1549 snapc, do_sync, 1550 truncate_seq, truncate_size, mtime, 1551 - nofail, 1); 1552 if (!req) 1553 return -ENOMEM; 1554 ··· 1645 m = ceph_msg_get(req->r_reply); 1646 1647 if (data_len > 0) { 1648 - unsigned data_off = le16_to_cpu(hdr->data_off); 1649 - int want = calc_pages_for(data_off & ~PAGE_MASK, data_len); 1650 1651 if (unlikely(req->r_num_pages < want)) { 1652 pr_warning("tid %lld reply %d > expected %d pages\n",
··· 71 op->extent.length = objlen; 72 } 73 req->r_num_pages = calc_pages_for(off, *plen); 74 + req->r_page_alignment = off & ~PAGE_MASK; 75 if (op->op == CEPH_OSD_OP_WRITE) 76 op->payload_len = *plen; 77 ··· 419 u32 truncate_seq, 420 u64 truncate_size, 421 struct timespec *mtime, 422 + bool use_mempool, int num_reply, 423 + int page_align) 424 { 425 struct ceph_osd_req_op ops[3]; 426 struct ceph_osd_request *req; ··· 446 /* calculate max write size */ 447 calc_layout(osdc, vino, layout, off, plen, req, ops); 448 req->r_file_layout = *layout; /* keep a copy */ 449 + 450 + /* in case it differs from natural alignment that calc_layout 451 + filled in for us */ 452 + req->r_page_alignment = page_align; 453 454 ceph_osdc_build_request(req, off, plen, ops, 455 snapc, ··· 1489 struct ceph_vino vino, struct ceph_file_layout *layout, 1490 u64 off, u64 *plen, 1491 u32 truncate_seq, u64 truncate_size, 1492 + struct page **pages, int num_pages, int page_align) 1493 { 1494 struct ceph_osd_request *req; 1495 int rc = 0; ··· 1499 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1500 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1501 NULL, 0, truncate_seq, truncate_size, NULL, 1502 + false, 1, page_align); 1503 if (!req) 1504 return -ENOMEM; 1505 1506 /* it may be a short read due to an object boundary */ 1507 req->r_pages = pages; 1508 1509 + dout("readpages final extent is %llu~%llu (%d pages align %d)\n", 1510 + off, *plen, req->r_num_pages, page_align); 1511 1512 rc = ceph_osdc_start_request(osdc, req, false); 1513 if (!rc) ··· 1533 { 1534 struct ceph_osd_request *req; 1535 int rc = 0; 1536 + int page_align = off & ~PAGE_MASK; 1537 1538 BUG_ON(vino.snap != CEPH_NOSNAP); 1539 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, ··· 1541 CEPH_OSD_FLAG_WRITE, 1542 snapc, do_sync, 1543 truncate_seq, truncate_size, mtime, 1544 + nofail, 1, page_align); 1545 if (!req) 1546 return -ENOMEM; 1547 ··· 1638 m = ceph_msg_get(req->r_reply); 1639 1640 if (data_len > 0) { 1641 + int 
want = calc_pages_for(req->r_page_alignment, data_len); 1642 1643 if (unlikely(req->r_num_pages < want)) { 1644 pr_warning("tid %lld reply %d > expected %d pages\n",