Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"The highlights are:

- several changes to how snap context and snap realms are tracked
(Xiubo Li). In particular, this should resolve a long-standing
issue of high kworker CPU usage and various stalls caused by
needless iteration over all inodes in the snap realm.

- async create fixes to address hangs in some edge cases (Jeff
Layton)

- support for getvxattr MDS op for querying server-side xattrs, such
as file/directory layouts and ephemeral pins (Milind Changire)

- average latency is now maintained for all metrics (Venky Shankar)

- some tweaks around handling inline data to make it fit better with
netfs helper library (David Howells)

Also a couple of memory leaks got plugged along with a few assorted
fixups. Last but not least, Xiubo has stepped up to serve as a CephFS
co-maintainer."

* tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client: (27 commits)
ceph: fix memory leak in ceph_readdir when note_last_dentry returns error
ceph: uninitialized variable in debug output
ceph: use tracked average r/w/m latencies to display metrics in debugfs
ceph: include average/stdev r/w/m latency in mds metrics
ceph: track average r/w/m latency
ceph: use ktime_to_timespec64() rather than jiffies_to_timespec64()
ceph: assign the ci only when the inode isn't NULL
ceph: fix inode reference leakage in ceph_get_snapdir()
ceph: misc fix for code style and logs
ceph: allocate capsnap memory outside of ceph_queue_cap_snap()
ceph: do not release the global snaprealm until unmounting
ceph: remove incorrect and unused CEPH_INO_DOTDOT macro
MAINTAINERS: add Xiubo Li as cephfs co-maintainer
ceph: eliminate the recursion when rebuilding the snap context
ceph: do not update snapshot context when there is no new snapshot
ceph: zero the dir_entries memory when allocating it
ceph: move to a dedicated slabcache for ceph_cap_snap
ceph: add getvxattr op
libceph: drop else branches in prepare_read_data{,_cont}
ceph: fix comments mentioning i_mutex
...

+575 -374
+2
MAINTAINERS
··· 4456 4456 CEPH COMMON CODE (LIBCEPH) 4457 4457 M: Ilya Dryomov <idryomov@gmail.com> 4458 4458 M: Jeff Layton <jlayton@kernel.org> 4459 + M: Xiubo Li <xiubli@redhat.com> 4459 4460 L: ceph-devel@vger.kernel.org 4460 4461 S: Supported 4461 4462 W: http://ceph.com/ ··· 4467 4466 4468 4467 CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH) 4469 4468 M: Jeff Layton <jlayton@kernel.org> 4469 + M: Xiubo Li <xiubli@redhat.com> 4470 4470 M: Ilya Dryomov <idryomov@gmail.com> 4471 4471 L: ceph-devel@vger.kernel.org 4472 4472 S: Supported
+112 -128
fs/ceph/addr.c
··· 184 184 185 185 static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) 186 186 { 187 - struct inode *inode = rreq->mapping->host; 187 + struct inode *inode = rreq->inode; 188 188 struct ceph_inode_info *ci = ceph_inode(inode); 189 189 struct ceph_file_layout *lo = &ci->i_layout; 190 190 u32 blockoff; ··· 201 201 202 202 static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) 203 203 { 204 - struct inode *inode = subreq->rreq->mapping->host; 204 + struct inode *inode = subreq->rreq->inode; 205 205 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 206 206 struct ceph_inode_info *ci = ceph_inode(inode); 207 207 u64 objno, objoff; ··· 244 244 iput(req->r_inode); 245 245 } 246 246 247 + static bool ceph_netfs_issue_op_inline(struct netfs_read_subrequest *subreq) 248 + { 249 + struct netfs_read_request *rreq = subreq->rreq; 250 + struct inode *inode = rreq->inode; 251 + struct ceph_mds_reply_info_parsed *rinfo; 252 + struct ceph_mds_reply_info_in *iinfo; 253 + struct ceph_mds_request *req; 254 + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 255 + struct ceph_inode_info *ci = ceph_inode(inode); 256 + struct iov_iter iter; 257 + ssize_t err = 0; 258 + size_t len; 259 + 260 + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); 261 + __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); 262 + 263 + if (subreq->start >= inode->i_size) 264 + goto out; 265 + 266 + /* We need to fetch the inline data. 
*/ 267 + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 268 + if (IS_ERR(req)) { 269 + err = PTR_ERR(req); 270 + goto out; 271 + } 272 + req->r_ino1 = ci->i_vino; 273 + req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA); 274 + req->r_num_caps = 2; 275 + 276 + err = ceph_mdsc_do_request(mdsc, NULL, req); 277 + if (err < 0) 278 + goto out; 279 + 280 + rinfo = &req->r_reply_info; 281 + iinfo = &rinfo->targeti; 282 + if (iinfo->inline_version == CEPH_INLINE_NONE) { 283 + /* The data got uninlined */ 284 + ceph_mdsc_put_request(req); 285 + return false; 286 + } 287 + 288 + len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len); 289 + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); 290 + err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter); 291 + if (err == 0) 292 + err = -EFAULT; 293 + 294 + ceph_mdsc_put_request(req); 295 + out: 296 + netfs_subreq_terminated(subreq, err, false); 297 + return true; 298 + } 299 + 247 300 static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) 248 301 { 249 302 struct netfs_read_request *rreq = subreq->rreq; 250 - struct inode *inode = rreq->mapping->host; 303 + struct inode *inode = rreq->inode; 251 304 struct ceph_inode_info *ci = ceph_inode(inode); 252 305 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 253 306 struct ceph_osd_request *req; ··· 310 257 size_t page_off; 311 258 int err = 0; 312 259 u64 len = subreq->len; 260 + 261 + if (ci->i_inline_version != CEPH_INLINE_NONE && 262 + ceph_netfs_issue_op_inline(subreq)) 263 + return; 313 264 314 265 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, 315 266 0, 1, CEPH_OSD_OP_READ, ··· 383 326 size_t len = folio_size(folio); 384 327 u64 off = folio_file_pos(folio); 385 328 386 - if (ci->i_inline_version != CEPH_INLINE_NONE) { 387 - /* 388 - * Uptodate inline data should have been added 389 - * into page cache while getting Fcr caps. 
390 - */ 391 - if (off == 0) { 392 - folio_unlock(folio); 393 - return -EINVAL; 394 - } 395 - zero_user_segment(&folio->page, 0, folio_size(folio)); 396 - folio_mark_uptodate(folio); 397 - folio_unlock(folio); 398 - return 0; 399 - } 400 - 401 - dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n", 402 - vino.ino, vino.snap, file, off, len, folio, folio_index(folio)); 329 + dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n inline %d", 330 + vino.ino, vino.snap, file, off, len, folio, folio_index(folio), 331 + ci->i_inline_version != CEPH_INLINE_NONE); 403 332 404 333 return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL); 405 334 } ··· 1324 1281 struct page **pagep, void **fsdata) 1325 1282 { 1326 1283 struct inode *inode = file_inode(file); 1327 - struct ceph_inode_info *ci = ceph_inode(inode); 1328 1284 struct folio *folio = NULL; 1329 - pgoff_t index = pos >> PAGE_SHIFT; 1330 1285 int r; 1331 - 1332 - /* 1333 - * Uninlining should have already been done and everything updated, EXCEPT 1334 - * for inline_version sent to the MDS. 1335 - */ 1336 - if (ci->i_inline_version != CEPH_INLINE_NONE) { 1337 - unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; 1338 - if (aop_flags & AOP_FLAG_NOFS) 1339 - fgp_flags |= FGP_NOFS; 1340 - folio = __filemap_get_folio(mapping, index, fgp_flags, 1341 - mapping_gfp_mask(mapping)); 1342 - if (!folio) 1343 - return -ENOMEM; 1344 - 1345 - /* 1346 - * The inline_version on a new inode is set to 1. If that's the 1347 - * case, then the folio is brand new and isn't yet Uptodate. 
1348 - */ 1349 - r = 0; 1350 - if (index == 0 && ci->i_inline_version != 1) { 1351 - if (!folio_test_uptodate(folio)) { 1352 - WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", 1353 - ci->i_inline_version); 1354 - r = -EINVAL; 1355 - } 1356 - goto out; 1357 - } 1358 - zero_user_segment(&folio->page, 0, folio_size(folio)); 1359 - folio_mark_uptodate(folio); 1360 - goto out; 1361 - } 1362 1286 1363 1287 r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL, 1364 1288 &ceph_netfs_read_ops, NULL); 1365 - out: 1366 1289 if (r == 0) 1367 1290 folio_wait_fscache(folio); 1368 1291 if (r < 0) { ··· 1524 1515 sb_start_pagefault(inode->i_sb); 1525 1516 ceph_block_sigs(&oldset); 1526 1517 1527 - if (ci->i_inline_version != CEPH_INLINE_NONE) { 1528 - struct page *locked_page = NULL; 1529 - if (off == 0) { 1530 - lock_page(page); 1531 - locked_page = page; 1532 - } 1533 - err = ceph_uninline_data(vma->vm_file, locked_page); 1534 - if (locked_page) 1535 - unlock_page(locked_page); 1536 - if (err < 0) 1537 - goto out_free; 1538 - } 1539 - 1540 1518 if (off + thp_size(page) <= size) 1541 1519 len = thp_size(page); 1542 1520 else ··· 1580 1584 ceph_put_snap_context(snapc); 1581 1585 } while (err == 0); 1582 1586 1583 - if (ret == VM_FAULT_LOCKED || 1584 - ci->i_inline_version != CEPH_INLINE_NONE) { 1587 + if (ret == VM_FAULT_LOCKED) { 1585 1588 int dirty; 1586 1589 spin_lock(&ci->i_ceph_lock); 1587 - ci->i_inline_version = CEPH_INLINE_NONE; 1588 1590 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 1589 1591 &prealloc_cf); 1590 1592 spin_unlock(&ci->i_ceph_lock); ··· 1646 1652 } 1647 1653 } 1648 1654 1649 - int ceph_uninline_data(struct file *filp, struct page *locked_page) 1655 + int ceph_uninline_data(struct file *file) 1650 1656 { 1651 - struct inode *inode = file_inode(filp); 1657 + struct inode *inode = file_inode(file); 1652 1658 struct ceph_inode_info *ci = ceph_inode(inode); 1653 1659 struct ceph_fs_client *fsc = 
ceph_inode_to_client(inode); 1654 1660 struct ceph_osd_request *req; 1655 - struct page *page = NULL; 1656 - u64 len, inline_version; 1661 + struct ceph_cap_flush *prealloc_cf; 1662 + struct folio *folio = NULL; 1663 + u64 inline_version = CEPH_INLINE_NONE; 1664 + struct page *pages[1]; 1657 1665 int err = 0; 1658 - bool from_pagecache = false; 1666 + u64 len; 1667 + 1668 + prealloc_cf = ceph_alloc_cap_flush(); 1669 + if (!prealloc_cf) 1670 + return -ENOMEM; 1671 + 1672 + folio = read_mapping_folio(inode->i_mapping, 0, file); 1673 + if (IS_ERR(folio)) { 1674 + err = PTR_ERR(folio); 1675 + goto out; 1676 + } 1677 + 1678 + folio_lock(folio); 1659 1679 1660 1680 spin_lock(&ci->i_ceph_lock); 1661 1681 inline_version = ci->i_inline_version; ··· 1680 1672 1681 1673 if (inline_version == 1 || /* initial version, no data */ 1682 1674 inline_version == CEPH_INLINE_NONE) 1683 - goto out; 1675 + goto out_unlock; 1684 1676 1685 - if (locked_page) { 1686 - page = locked_page; 1687 - WARN_ON(!PageUptodate(page)); 1688 - } else if (ceph_caps_issued(ci) & 1689 - (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { 1690 - page = find_get_page(inode->i_mapping, 0); 1691 - if (page) { 1692 - if (PageUptodate(page)) { 1693 - from_pagecache = true; 1694 - lock_page(page); 1695 - } else { 1696 - put_page(page); 1697 - page = NULL; 1698 - } 1699 - } 1700 - } 1701 - 1702 - if (page) { 1703 - len = i_size_read(inode); 1704 - if (len > PAGE_SIZE) 1705 - len = PAGE_SIZE; 1706 - } else { 1707 - page = __page_cache_alloc(GFP_NOFS); 1708 - if (!page) { 1709 - err = -ENOMEM; 1710 - goto out; 1711 - } 1712 - err = __ceph_do_getattr(inode, page, 1713 - CEPH_STAT_CAP_INLINE_DATA, true); 1714 - if (err < 0) { 1715 - /* no inline data */ 1716 - if (err == -ENODATA) 1717 - err = 0; 1718 - goto out; 1719 - } 1720 - len = err; 1721 - } 1677 + len = i_size_read(inode); 1678 + if (len > folio_size(folio)) 1679 + len = folio_size(folio); 1722 1680 1723 1681 req = ceph_osdc_new_request(&fsc->client->osdc, 
&ci->i_layout, 1724 1682 ceph_vino(inode), 0, &len, 0, 1, ··· 1692 1718 NULL, 0, 0, false); 1693 1719 if (IS_ERR(req)) { 1694 1720 err = PTR_ERR(req); 1695 - goto out; 1721 + goto out_unlock; 1696 1722 } 1697 1723 1698 1724 req->r_mtime = inode->i_mtime; ··· 1701 1727 err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1702 1728 ceph_osdc_put_request(req); 1703 1729 if (err < 0) 1704 - goto out; 1730 + goto out_unlock; 1705 1731 1706 1732 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1707 1733 ceph_vino(inode), 0, &len, 1, 3, ··· 1710 1736 ci->i_truncate_size, false); 1711 1737 if (IS_ERR(req)) { 1712 1738 err = PTR_ERR(req); 1713 - goto out; 1739 + goto out_unlock; 1714 1740 } 1715 1741 1716 - osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); 1742 + pages[0] = folio_page(folio, 0); 1743 + osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false); 1717 1744 1718 1745 { 1719 1746 __le64 xattr_buf = cpu_to_le64(inline_version); ··· 1724 1749 CEPH_OSD_CMPXATTR_OP_GT, 1725 1750 CEPH_OSD_CMPXATTR_MODE_U64); 1726 1751 if (err) 1727 - goto out_put; 1752 + goto out_put_req; 1728 1753 } 1729 1754 1730 1755 { ··· 1735 1760 "inline_version", 1736 1761 xattr_buf, xattr_len, 0, 0); 1737 1762 if (err) 1738 - goto out_put; 1763 + goto out_put_req; 1739 1764 } 1740 1765 1741 1766 req->r_mtime = inode->i_mtime; ··· 1746 1771 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 1747 1772 req->r_end_latency, len, err); 1748 1773 1749 - out_put: 1774 + if (!err) { 1775 + int dirty; 1776 + 1777 + /* Set to CAP_INLINE_NONE and dirty the caps */ 1778 + down_read(&fsc->mdsc->snap_rwsem); 1779 + spin_lock(&ci->i_ceph_lock); 1780 + ci->i_inline_version = CEPH_INLINE_NONE; 1781 + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf); 1782 + spin_unlock(&ci->i_ceph_lock); 1783 + up_read(&fsc->mdsc->snap_rwsem); 1784 + if (dirty) 1785 + __mark_inode_dirty(inode, dirty); 1786 + } 1787 + out_put_req: 1750 1788 
ceph_osdc_put_request(req); 1751 1789 if (err == -ECANCELED) 1752 1790 err = 0; 1791 + out_unlock: 1792 + folio_unlock(folio); 1793 + folio_put(folio); 1753 1794 out: 1754 - if (page && page != locked_page) { 1755 - if (from_pagecache) { 1756 - unlock_page(page); 1757 - put_page(page); 1758 - } else 1759 - __free_pages(page, 0); 1760 - } 1761 - 1795 + ceph_free_cap_flush(prealloc_cf); 1762 1796 dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", 1763 1797 inode, ceph_vinop(inode), inline_version, err); 1764 1798 return err;
+15 -1
fs/ceph/caps.c
··· 1915 1915 ceph_get_mds_session(session); 1916 1916 1917 1917 spin_lock(&ci->i_ceph_lock); 1918 + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { 1919 + /* Don't send messages until we get async create reply */ 1920 + spin_unlock(&ci->i_ceph_lock); 1921 + ceph_put_mds_session(session); 1922 + return; 1923 + } 1924 + 1918 1925 if (ci->i_ceph_flags & CEPH_I_FLUSH) 1919 1926 flags |= CHECK_CAPS_FLUSH; 1920 1927 retry: ··· 2416 2409 dout("write_inode %p wait=%d\n", inode, wait); 2417 2410 ceph_fscache_unpin_writeback(inode, wbc); 2418 2411 if (wait) { 2412 + err = ceph_wait_on_async_create(inode); 2413 + if (err) 2414 + return err; 2419 2415 dirty = try_flush_caps(inode, &flush_tid); 2420 2416 if (dirty) 2421 2417 err = wait_event_interruptible(ci->i_cap_wq, ··· 2448 2438 int ret; 2449 2439 u64 first_tid = 0; 2450 2440 u64 last_snap_flush = 0; 2441 + 2442 + /* Don't do anything until create reply comes in */ 2443 + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) 2444 + return; 2451 2445 2452 2446 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; 2453 2447 ··· 4166 4152 4167 4153 /* lookup ino */ 4168 4154 inode = ceph_find_inode(mdsc->fsc->sb, vino); 4169 - ci = ceph_inode(inode); 4170 4155 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, 4171 4156 vino.snap, inode); 4172 4157 ··· 4191 4178 } 4192 4179 goto flush_cap_releases; 4193 4180 } 4181 + ci = ceph_inode(inode); 4194 4182 4195 4183 /* these will work even if we don't have a cap yet */ 4196 4184 switch (op) {
+2 -3
fs/ceph/debugfs.c
··· 175 175 struct ceph_fs_client *fsc = s->private; 176 176 struct ceph_client_metric *cm = &fsc->mdsc->metric; 177 177 struct ceph_metric *m; 178 - s64 total, sum, avg, min, max, sq; 178 + s64 total, avg, min, max, sq; 179 179 int i; 180 180 181 181 seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); ··· 185 185 m = &cm->metric[i]; 186 186 spin_lock(&m->lock); 187 187 total = m->total; 188 - sum = m->latency_sum; 189 - avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; 188 + avg = m->latency_avg; 190 189 min = m->latency_min; 191 190 max = m->latency_max; 192 191 sq = m->latency_sq_sum;
+13 -4
fs/ceph/dir.c
··· 145 145 return ERR_PTR(-EAGAIN); 146 146 } 147 147 /* reading/filling the cache are serialized by 148 - i_mutex, no need to use page lock */ 148 + i_rwsem, no need to use page lock */ 149 149 unlock_page(cache_ctl->page); 150 150 cache_ctl->dentries = kmap(cache_ctl->page); 151 151 } ··· 155 155 rcu_read_lock(); 156 156 spin_lock(&parent->d_lock); 157 157 /* check i_size again here, because empty directory can be 158 - * marked as complete while not holding the i_mutex. */ 158 + * marked as complete while not holding the i_rwsem. */ 159 159 if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) 160 160 dentry = cache_ctl->dentries[cache_ctl->index]; 161 161 else ··· 478 478 2 : (fpos_off(rde->offset) + 1); 479 479 err = note_last_dentry(dfi, rde->name, rde->name_len, 480 480 next_offset); 481 - if (err) 481 + if (err) { 482 + ceph_mdsc_put_request(dfi->last_readdir); 483 + dfi->last_readdir = NULL; 482 484 return err; 485 + } 483 486 } else if (req->r_reply_info.dir_end) { 484 487 dfi->next_offset = 2; 485 488 /* keep last name */ ··· 523 520 if (!dir_emit(ctx, rde->name, rde->name_len, 524 521 ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)), 525 522 le32_to_cpu(rde->inode.in->mode) >> 12)) { 523 + /* 524 + * NOTE: Here no need to put the 'dfi->last_readdir', 525 + * because when dir_emit stops us it's most likely 526 + * doesn't have enough memory, etc. So for next readdir 527 + * it will continue. 528 + */ 526 529 dout("filldir stopping us...\n"); 527 530 return 0; 528 531 } ··· 680 671 struct dentry *dentry) 681 672 { 682 673 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 683 - struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ 674 + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */ 684 675 685 676 /* .snap dir? */ 686 677 if (ceph_snap(parent) == CEPH_NOSNAP &&
+49 -34
fs/ceph/file.c
··· 207 207 struct ceph_mount_options *opt = 208 208 ceph_inode_to_client(&ci->vfs_inode)->mount_options; 209 209 struct ceph_file_info *fi; 210 + int ret; 210 211 211 212 dout("%s %p %p 0%o (%s)\n", __func__, inode, file, 212 213 inode->i_mode, isdir ? "dir" : "regular"); ··· 241 240 INIT_LIST_HEAD(&fi->rw_contexts); 242 241 fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); 243 242 243 + if ((file->f_mode & FMODE_WRITE) && 244 + ci->i_inline_version != CEPH_INLINE_NONE) { 245 + ret = ceph_uninline_data(file); 246 + if (ret < 0) 247 + goto error; 248 + } 249 + 244 250 return 0; 251 + 252 + error: 253 + ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); 254 + ceph_put_fmode(ci, fi->fmode, 1); 255 + kmem_cache_free(ceph_file_cachep, fi); 256 + /* wake up anyone waiting for caps on this inode */ 257 + wake_up_all(&ci->i_cap_wq); 258 + return ret; 245 259 } 246 260 247 261 /* ··· 532 516 } 533 517 } 534 518 519 + static void wake_async_create_waiters(struct inode *inode, 520 + struct ceph_mds_session *session) 521 + { 522 + struct ceph_inode_info *ci = ceph_inode(inode); 523 + 524 + spin_lock(&ci->i_ceph_lock); 525 + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { 526 + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; 527 + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); 528 + } 529 + ceph_kick_flushing_inode_caps(session, ci); 530 + spin_unlock(&ci->i_ceph_lock); 531 + } 532 + 535 533 static void ceph_async_create_cb(struct ceph_mds_client *mdsc, 536 534 struct ceph_mds_request *req) 537 535 { 536 + struct dentry *dentry = req->r_dentry; 537 + struct inode *dinode = d_inode(dentry); 538 + struct inode *tinode = req->r_target_inode; 538 539 int result = req->r_err ? 
req->r_err : 539 540 le32_to_cpu(req->r_reply_info.head->result); 540 541 542 + WARN_ON_ONCE(dinode && tinode && dinode != tinode); 543 + 544 + /* MDS changed -- caller must resubmit */ 541 545 if (result == -EJUKEBOX) 542 546 goto out; 543 547 544 548 mapping_set_error(req->r_parent->i_mapping, result); 545 549 546 550 if (result) { 547 - struct dentry *dentry = req->r_dentry; 548 - struct inode *inode = d_inode(dentry); 549 551 int pathlen = 0; 550 552 u64 base = 0; 551 553 char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 552 554 &base, 0); 553 555 556 + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", 557 + base, IS_ERR(path) ? "<<bad>>" : path, result); 558 + ceph_mdsc_free_path(path, pathlen); 559 + 554 560 ceph_dir_clear_complete(req->r_parent); 555 561 if (!d_unhashed(dentry)) 556 562 d_drop(dentry); 557 563 558 - ceph_inode_shutdown(inode); 559 - 560 - pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", 561 - base, IS_ERR(path) ? "<<bad>>" : path, result); 562 - ceph_mdsc_free_path(path, pathlen); 564 + if (dinode) { 565 + mapping_set_error(dinode->i_mapping, result); 566 + ceph_inode_shutdown(dinode); 567 + wake_async_create_waiters(dinode, req->r_session); 568 + } 563 569 } 564 570 565 - if (req->r_target_inode) { 566 - struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); 567 - u64 ino = ceph_vino(req->r_target_inode).ino; 571 + if (tinode) { 572 + u64 ino = ceph_vino(tinode).ino; 568 573 569 574 if (req->r_deleg_ino != ino) 570 575 pr_warn("%s: inode number mismatch! 
err=%d deleg_ino=0x%llx target=0x%llx\n", 571 576 __func__, req->r_err, req->r_deleg_ino, ino); 572 - mapping_set_error(req->r_target_inode->i_mapping, result); 573 577 574 - spin_lock(&ci->i_ceph_lock); 575 - if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { 576 - ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; 577 - wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); 578 - } 579 - ceph_kick_flushing_inode_caps(req->r_session, ci); 580 - spin_unlock(&ci->i_ceph_lock); 578 + mapping_set_error(tinode->i_mapping, result); 579 + wake_async_create_waiters(tinode, req->r_session); 581 580 } else if (!result) { 582 581 pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, 583 582 req->r_deleg_ino); ··· 1072 1041 } 1073 1042 1074 1043 spin_lock(&ci->i_ceph_lock); 1075 - ci->i_inline_version = CEPH_INLINE_NONE; 1076 1044 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 1077 1045 &aio_req->prealloc_cf); 1078 1046 spin_unlock(&ci->i_ceph_lock); ··· 1808 1778 if (err) 1809 1779 goto out; 1810 1780 1811 - if (ci->i_inline_version != CEPH_INLINE_NONE) { 1812 - err = ceph_uninline_data(file, NULL); 1813 - if (err < 0) 1814 - goto out; 1815 - } 1816 - 1817 1781 dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", 1818 1782 inode, ceph_vinop(inode), pos, count, i_size_read(inode)); 1819 1783 if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) ··· 1879 1855 int dirty; 1880 1856 1881 1857 spin_lock(&ci->i_ceph_lock); 1882 - ci->i_inline_version = CEPH_INLINE_NONE; 1883 1858 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 1884 1859 &prealloc_cf); 1885 1860 spin_unlock(&ci->i_ceph_lock); ··· 2132 2109 goto unlock; 2133 2110 } 2134 2111 2135 - if (ci->i_inline_version != CEPH_INLINE_NONE) { 2136 - ret = ceph_uninline_data(file, NULL); 2137 - if (ret < 0) 2138 - goto unlock; 2139 - } 2140 - 2141 2112 size = i_size_read(inode); 2142 2113 2143 2114 /* Are we punching a hole beyond EOF? 
*/ ··· 2156 2139 2157 2140 if (!ret) { 2158 2141 spin_lock(&ci->i_ceph_lock); 2159 - ci->i_inline_version = CEPH_INLINE_NONE; 2160 2142 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 2161 2143 &prealloc_cf); 2162 2144 spin_unlock(&ci->i_ceph_lock); ··· 2548 2532 } 2549 2533 /* Mark Fw dirty */ 2550 2534 spin_lock(&dst_ci->i_ceph_lock); 2551 - dst_ci->i_inline_version = CEPH_INLINE_NONE; 2552 2535 dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); 2553 2536 spin_unlock(&dst_ci->i_ceph_lock); 2554 2537 if (dirty)
+61 -4
fs/ceph/inode.c
··· 87 87 if (!S_ISDIR(parent->i_mode)) { 88 88 pr_warn_once("bad snapdir parent type (mode=0%o)\n", 89 89 parent->i_mode); 90 - return ERR_PTR(-ENOTDIR); 90 + goto err; 91 91 } 92 92 93 93 if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { 94 94 pr_warn_once("bad snapdir inode type (mode=0%o)\n", 95 95 inode->i_mode); 96 - return ERR_PTR(-ENOTDIR); 96 + goto err; 97 97 } 98 98 99 99 inode->i_mode = parent->i_mode; ··· 113 113 } 114 114 115 115 return inode; 116 + err: 117 + if ((inode->i_state & I_NEW)) 118 + discard_new_inode(inode); 119 + else 120 + iput(inode); 121 + return ERR_PTR(-ENOTDIR); 116 122 } 117 123 118 124 const struct inode_operations ceph_file_iops = { ··· 1207 1201 1208 1202 /* 1209 1203 * splice a dentry to an inode. 1210 - * caller must hold directory i_mutex for this to be safe. 1204 + * caller must hold directory i_rwsem for this to be safe. 1211 1205 */ 1212 1206 static int splice_dentry(struct dentry **pdn, struct inode *in) 1213 1207 { ··· 1604 1598 return idx == 0 ? 
-ENOMEM : 0; 1605 1599 } 1606 1600 /* reading/filling the cache are serialized by 1607 - * i_mutex, no need to use page lock */ 1601 + * i_rwsem, no need to use page lock */ 1608 1602 unlock_page(ctl->page); 1609 1603 ctl->dentries = kmap(ctl->page); 1610 1604 if (idx == 0) ··· 2304 2298 } 2305 2299 ceph_mdsc_put_request(req); 2306 2300 dout("do_getattr result=%d\n", err); 2301 + return err; 2302 + } 2303 + 2304 + int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, 2305 + size_t size) 2306 + { 2307 + struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 2308 + struct ceph_mds_client *mdsc = fsc->mdsc; 2309 + struct ceph_mds_request *req; 2310 + int mode = USE_AUTH_MDS; 2311 + int err; 2312 + char *xattr_value; 2313 + size_t xattr_value_len; 2314 + 2315 + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode); 2316 + if (IS_ERR(req)) { 2317 + err = -ENOMEM; 2318 + goto out; 2319 + } 2320 + 2321 + req->r_path2 = kstrdup(name, GFP_NOFS); 2322 + if (!req->r_path2) { 2323 + err = -ENOMEM; 2324 + goto put; 2325 + } 2326 + 2327 + ihold(inode); 2328 + req->r_inode = inode; 2329 + err = ceph_mdsc_do_request(mdsc, NULL, req); 2330 + if (err < 0) 2331 + goto put; 2332 + 2333 + xattr_value = req->r_reply_info.xattr_info.xattr_value; 2334 + xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len; 2335 + 2336 + dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size); 2337 + 2338 + err = (int)xattr_value_len; 2339 + if (size == 0) 2340 + goto put; 2341 + 2342 + if (xattr_value_len > size) { 2343 + err = -ERANGE; 2344 + goto put; 2345 + } 2346 + 2347 + memcpy(value, xattr_value, xattr_value_len); 2348 + put: 2349 + ceph_mdsc_put_request(req); 2350 + out: 2351 + dout("do_getvxattr result=%d\n", err); 2307 2352 return err; 2308 2353 } 2309 2354
+4 -4
fs/ceph/locks.c
··· 111 111 req->r_args.filelock_change.length = cpu_to_le64(length); 112 112 req->r_args.filelock_change.wait = wait; 113 113 114 - if (wait) 115 - req->r_wait_for_completion = ceph_lock_wait_for_completion; 116 - 117 - err = ceph_mdsc_do_request(mdsc, inode, req); 114 + err = ceph_mdsc_submit_request(mdsc, inode, req); 115 + if (!err) 116 + err = ceph_mdsc_wait_request(mdsc, req, wait ? 117 + ceph_lock_wait_for_completion : NULL); 118 118 if (!err && operation == CEPH_MDS_OP_GETFILELOCK) { 119 119 fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); 120 120 if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
+33 -36
fs/ceph/mds_client.c
··· 555 555 return -EIO; 556 556 } 557 557 558 + static int parse_reply_info_getvxattr(void **p, void *end, 559 + struct ceph_mds_reply_info_parsed *info, 560 + u64 features) 561 + { 562 + u32 value_len; 563 + 564 + ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */ 565 + ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */ 566 + ceph_decode_skip_32(p, end, bad); /* skip payload length */ 567 + 568 + ceph_decode_32_safe(p, end, value_len, bad); 569 + 570 + if (value_len == end - *p) { 571 + info->xattr_info.xattr_value = *p; 572 + info->xattr_info.xattr_value_len = value_len; 573 + *p = end; 574 + return value_len; 575 + } 576 + bad: 577 + return -EIO; 578 + } 579 + 558 580 /* 559 581 * parse extra results 560 582 */ ··· 592 570 return parse_reply_info_readdir(p, end, info, features); 593 571 else if (op == CEPH_MDS_OP_CREATE) 594 572 return parse_reply_info_create(p, end, info, features, s); 573 + else if (op == CEPH_MDS_OP_GETVXATTR) 574 + return parse_reply_info_getvxattr(p, end, info, features); 595 575 else 596 576 return -EIO; 597 577 } ··· 2202 2178 order = get_order(size * num_entries); 2203 2179 while (order >= 0) { 2204 2180 rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | 2205 - __GFP_NOWARN, 2181 + __GFP_NOWARN | 2182 + __GFP_ZERO, 2206 2183 order); 2207 2184 if (rinfo->dir_entries) 2208 2185 break; ··· 2971 2946 return err; 2972 2947 } 2973 2948 2974 - static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, 2975 - struct ceph_mds_request *req) 2949 + int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, 2950 + struct ceph_mds_request *req, 2951 + ceph_mds_request_wait_callback_t wait_func) 2976 2952 { 2977 2953 int err; 2978 2954 2979 2955 /* wait */ 2980 2956 dout("do_request waiting\n"); 2981 - if (!req->r_timeout && req->r_wait_for_completion) { 2982 - err = req->r_wait_for_completion(mdsc, req); 2957 + if (wait_func) { 2958 + err = wait_func(mdsc, req); 2983 2959 } else { 2984 2960 long timeleft = 
wait_for_completion_killable_timeout( 2985 2961 &req->r_completion, ··· 3037 3011 /* issue */ 3038 3012 err = ceph_mdsc_submit_request(mdsc, dir, req); 3039 3013 if (!err) 3040 - err = ceph_mdsc_wait_request(mdsc, req); 3014 + err = ceph_mdsc_wait_request(mdsc, req, NULL); 3041 3015 dout("do_request %p done, result %d\n", req, err); 3042 3016 return err; 3043 3017 } ··· 3122 3096 } 3123 3097 3124 3098 result = le32_to_cpu(head->result); 3125 - 3126 - /* 3127 - * Handle an ESTALE 3128 - * if we're not talking to the authority, send to them 3129 - * if the authority has changed while we weren't looking, 3130 - * send to new authority 3131 - * Otherwise we just have to return an ESTALE 3132 - */ 3133 - if (result == -ESTALE) { 3134 - dout("got ESTALE on request %llu\n", req->r_tid); 3135 - req->r_resend_mds = -1; 3136 - if (req->r_direct_mode != USE_AUTH_MDS) { 3137 - dout("not using auth, setting for that now\n"); 3138 - req->r_direct_mode = USE_AUTH_MDS; 3139 - __do_request(mdsc, req); 3140 - mutex_unlock(&mdsc->mutex); 3141 - goto out; 3142 - } else { 3143 - int mds = __choose_mds(mdsc, req, NULL); 3144 - if (mds >= 0 && mds != req->r_session->s_mds) { 3145 - dout("but auth changed, so resending\n"); 3146 - __do_request(mdsc, req); 3147 - mutex_unlock(&mdsc->mutex); 3148 - goto out; 3149 - } 3150 - } 3151 - dout("have to return ESTALE on request %llu\n", req->r_tid); 3152 - } 3153 - 3154 3099 3155 3100 if (head->safe) { 3156 3101 set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); ··· 4838 4841 mutex_unlock(&mdsc->mutex); 4839 4842 4840 4843 ceph_cleanup_snapid_map(mdsc); 4841 - ceph_cleanup_empty_realms(mdsc); 4844 + ceph_cleanup_global_and_empty_realms(mdsc); 4842 4845 4843 4846 cancel_work_sync(&mdsc->cap_reclaim_work); 4844 4847 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+11 -4
fs/ceph/mds_client.h
··· 100 100 loff_t offset; 101 101 }; 102 102 103 + struct ceph_mds_reply_xattr { 104 + char *xattr_value; 105 + size_t xattr_value_len; 106 + }; 107 + 103 108 /* 104 109 * parsed info about an mds reply, including information about 105 110 * either: 1) the target inode and/or its parent directory and dentry, ··· 120 115 char *dname; 121 116 u32 dname_len; 122 117 struct ceph_mds_reply_lease *dlease; 118 + struct ceph_mds_reply_xattr xattr_info; 123 119 124 120 /* extra */ 125 121 union { ··· 280 274 281 275 union ceph_mds_request_args r_args; 282 276 int r_fmode; /* file mode, if expecting cap */ 283 - const struct cred *r_cred; 284 277 int r_request_release_offset; 278 + const struct cred *r_cred; 285 279 struct timespec64 r_stamp; 286 280 287 281 /* for choosing which mds to send this request to */ ··· 302 296 struct ceph_msg *r_reply; 303 297 struct ceph_mds_reply_info_parsed r_reply_info; 304 298 int r_err; 305 - 299 + u32 r_readdir_offset; 306 300 307 301 struct page *r_locked_page; 308 302 int r_dir_caps; 309 303 int r_num_caps; 310 - u32 r_readdir_offset; 311 304 312 305 unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ 313 306 unsigned long r_started; /* start time to measure timeout against */ ··· 334 329 struct completion r_completion; 335 330 struct completion r_safe_completion; 336 331 ceph_mds_request_callback_t r_callback; 337 - ceph_mds_request_wait_callback_t r_wait_for_completion; 338 332 struct list_head r_unsafe_item; /* per-session unsafe list item */ 339 333 340 334 long long r_dir_release_cnt; ··· 511 507 extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 512 508 struct inode *dir, 513 509 struct ceph_mds_request *req); 510 + int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, 511 + struct ceph_mds_request *req, 512 + ceph_mds_request_wait_callback_t wait_func); 514 513 extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, 515 514 struct inode *dir, 516 515 struct ceph_mds_request *req);
+35 -26
fs/ceph/metric.c
··· 8 8 #include "metric.h" 9 9 #include "mds_client.h" 10 10 11 + static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val) 12 + { 13 + struct timespec64 t = ktime_to_timespec64(val); 14 + ceph_encode_timespec64(ts, &t); 15 + } 16 + 11 17 static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, 12 18 struct ceph_mds_session *s) 13 19 { ··· 32 26 u64 nr_caps = atomic64_read(&m->total_caps); 33 27 u32 header_len = sizeof(struct ceph_metric_header); 34 28 struct ceph_msg *msg; 35 - struct timespec64 ts; 36 29 s64 sum; 37 30 s32 items = 0; 38 31 s32 len; ··· 64 59 /* encode the read latency metric */ 65 60 read = (struct ceph_metric_read_latency *)(cap + 1); 66 61 read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY); 67 - read->header.ver = 1; 62 + read->header.ver = 2; 68 63 read->header.compat = 1; 69 64 read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); 70 65 sum = m->metric[METRIC_READ].latency_sum; 71 - jiffies_to_timespec64(sum, &ts); 72 - read->sec = cpu_to_le32(ts.tv_sec); 73 - read->nsec = cpu_to_le32(ts.tv_nsec); 66 + ktime_to_ceph_timespec(&read->lat, sum); 67 + ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg); 68 + read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum); 69 + read->count = cpu_to_le64(m->metric[METRIC_READ].total); 74 70 items++; 75 71 76 72 /* encode the write latency metric */ 77 73 write = (struct ceph_metric_write_latency *)(read + 1); 78 74 write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY); 79 - write->header.ver = 1; 75 + write->header.ver = 2; 80 76 write->header.compat = 1; 81 77 write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); 82 78 sum = m->metric[METRIC_WRITE].latency_sum; 83 - jiffies_to_timespec64(sum, &ts); 84 - write->sec = cpu_to_le32(ts.tv_sec); 85 - write->nsec = cpu_to_le32(ts.tv_nsec); 79 + ktime_to_ceph_timespec(&write->lat, sum); 80 + ktime_to_ceph_timespec(&write->avg, 
m->metric[METRIC_WRITE].latency_avg); 81 + write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum); 82 + write->count = cpu_to_le64(m->metric[METRIC_WRITE].total); 86 83 items++; 87 84 88 85 /* encode the metadata latency metric */ 89 86 meta = (struct ceph_metric_metadata_latency *)(write + 1); 90 87 meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY); 91 - meta->header.ver = 1; 88 + meta->header.ver = 2; 92 89 meta->header.compat = 1; 93 90 meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); 94 91 sum = m->metric[METRIC_METADATA].latency_sum; 95 - jiffies_to_timespec64(sum, &ts); 96 - meta->sec = cpu_to_le32(ts.tv_sec); 97 - meta->nsec = cpu_to_le32(ts.tv_nsec); 92 + ktime_to_ceph_timespec(&meta->lat, sum); 93 + ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg); 94 + meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum); 95 + meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total); 98 96 items++; 99 97 100 98 /* encode the dentry lease metric */ ··· 258 250 metric->size_max = 0; 259 251 metric->total = 0; 260 252 metric->latency_sum = 0; 253 + metric->latency_avg = 0; 261 254 metric->latency_sq_sum = 0; 262 255 metric->latency_min = KTIME_MAX; 263 256 metric->latency_max = 0; ··· 316 307 max = new; \ 317 308 } 318 309 319 - static inline void __update_stdev(ktime_t total, ktime_t lsum, 320 - ktime_t *sq_sump, ktime_t lat) 310 + static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg, 311 + ktime_t *sq_sump, ktime_t lat) 321 312 { 322 - ktime_t avg, sq; 313 + ktime_t avg; 323 314 324 - if (unlikely(total == 1)) 325 - return; 326 - 327 - /* the sq is (lat - old_avg) * (lat - new_avg) */ 328 - avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1)); 329 - sq = lat - avg; 330 - avg = DIV64_U64_ROUND_CLOSEST(lsum, total); 331 - sq = sq * (lat - avg); 332 - *sq_sump += sq; 315 + if (unlikely(total == 1)) { 316 + *lavg = lat; 317 + } else { 318 + /* the sq is (lat - 
old_avg) * (lat - new_avg) */ 319 + avg = *lavg + div64_s64(lat - *lavg, total); 320 + *sq_sump += (lat - *lavg)*(lat - avg); 321 + *lavg = avg; 322 + } 333 323 } 334 324 335 325 void ceph_update_metrics(struct ceph_metric *m, ··· 347 339 METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size); 348 340 m->latency_sum += lat; 349 341 METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat); 350 - __update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat); 342 + __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum, 343 + lat); 351 344 spin_unlock(&m->lock); 352 345 }
+40 -21
fs/ceph/metric.h
··· 2 2 #ifndef _FS_CEPH_MDS_METRIC_H 3 3 #define _FS_CEPH_MDS_METRIC_H 4 4 5 - #include <linux/types.h> 5 + #include <linux/ceph/types.h> 6 6 #include <linux/percpu_counter.h> 7 7 #include <linux/ktime.h> 8 8 ··· 19 19 CLIENT_METRIC_TYPE_OPENED_INODES, 20 20 CLIENT_METRIC_TYPE_READ_IO_SIZES, 21 21 CLIENT_METRIC_TYPE_WRITE_IO_SIZES, 22 + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, 23 + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, 24 + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, 25 + CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, 26 + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, 27 + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, 22 28 23 - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES, 29 + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, 24 30 }; 25 31 26 32 /* 27 33 * This will always have the highest metric bit value 28 34 * as the last element of the array. 29 35 */ 30 - #define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ 31 - CLIENT_METRIC_TYPE_CAP_INFO, \ 32 - CLIENT_METRIC_TYPE_READ_LATENCY, \ 33 - CLIENT_METRIC_TYPE_WRITE_LATENCY, \ 34 - CLIENT_METRIC_TYPE_METADATA_LATENCY, \ 35 - CLIENT_METRIC_TYPE_DENTRY_LEASE, \ 36 - CLIENT_METRIC_TYPE_OPENED_FILES, \ 37 - CLIENT_METRIC_TYPE_PINNED_ICAPS, \ 38 - CLIENT_METRIC_TYPE_OPENED_INODES, \ 39 - CLIENT_METRIC_TYPE_READ_IO_SIZES, \ 40 - CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ 41 - \ 42 - CLIENT_METRIC_TYPE_MAX, \ 36 + #define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \ 37 + CLIENT_METRIC_TYPE_CAP_INFO, \ 38 + CLIENT_METRIC_TYPE_READ_LATENCY, \ 39 + CLIENT_METRIC_TYPE_WRITE_LATENCY, \ 40 + CLIENT_METRIC_TYPE_METADATA_LATENCY, \ 41 + CLIENT_METRIC_TYPE_DENTRY_LEASE, \ 42 + CLIENT_METRIC_TYPE_OPENED_FILES, \ 43 + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ 44 + CLIENT_METRIC_TYPE_OPENED_INODES, \ 45 + CLIENT_METRIC_TYPE_READ_IO_SIZES, \ 46 + CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \ 47 + CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \ 48 + CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \ 49 + CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \ 50 + 
CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \ 51 + CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \ 52 + CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \ 53 + \ 54 + CLIENT_METRIC_TYPE_MAX, \ 43 55 } 44 56 45 57 struct ceph_metric_header { ··· 72 60 /* metric read latency header */ 73 61 struct ceph_metric_read_latency { 74 62 struct ceph_metric_header header; 75 - __le32 sec; 76 - __le32 nsec; 63 + struct ceph_timespec lat; 64 + struct ceph_timespec avg; 65 + __le64 sq_sum; 66 + __le64 count; 77 67 } __packed; 78 68 79 69 /* metric write latency header */ 80 70 struct ceph_metric_write_latency { 81 71 struct ceph_metric_header header; 82 - __le32 sec; 83 - __le32 nsec; 72 + struct ceph_timespec lat; 73 + struct ceph_timespec avg; 74 + __le64 sq_sum; 75 + __le64 count; 84 76 } __packed; 85 77 86 78 /* metric metadata latency header */ 87 79 struct ceph_metric_metadata_latency { 88 80 struct ceph_metric_header header; 89 - __le32 sec; 90 - __le32 nsec; 81 + struct ceph_timespec lat; 82 + struct ceph_timespec avg; 83 + __le64 sq_sum; 84 + __le64 count; 91 85 } __packed; 92 86 93 87 /* metric dentry lease header */ ··· 158 140 u64 size_min; 159 141 u64 size_max; 160 142 ktime_t latency_sum; 143 + ktime_t latency_avg; 161 144 ktime_t latency_sq_sum; 162 145 ktime_t latency_min; 163 146 ktime_t latency_max;
+167 -96
fs/ceph/snap.c
··· 121 121 if (!realm) 122 122 return ERR_PTR(-ENOMEM); 123 123 124 - atomic_set(&realm->nref, 1); /* for caller */ 124 + /* Do not release the global dummy snaprealm until unmouting */ 125 + if (ino == CEPH_INO_GLOBAL_SNAPREALM) 126 + atomic_set(&realm->nref, 2); 127 + else 128 + atomic_set(&realm->nref, 1); 125 129 realm->ino = ino; 126 130 INIT_LIST_HEAD(&realm->children); 127 131 INIT_LIST_HEAD(&realm->child_item); 128 132 INIT_LIST_HEAD(&realm->empty_item); 129 133 INIT_LIST_HEAD(&realm->dirty_item); 134 + INIT_LIST_HEAD(&realm->rebuild_item); 130 135 INIT_LIST_HEAD(&realm->inodes_with_caps); 131 136 spin_lock_init(&realm->inodes_with_caps_lock); 132 137 __insert_snap_realm(&mdsc->snap_realms, realm); 133 138 mdsc->num_snap_realms++; 134 139 135 - dout("create_snap_realm %llx %p\n", realm->ino, realm); 140 + dout("%s %llx %p\n", __func__, realm->ino, realm); 136 141 return realm; 137 142 } 138 143 ··· 161 156 else if (ino > r->ino) 162 157 n = n->rb_right; 163 158 else { 164 - dout("lookup_snap_realm %llx %p\n", r->ino, r); 159 + dout("%s %llx %p\n", __func__, r->ino, r); 165 160 return r; 166 161 } 167 162 } ··· 189 184 { 190 185 lockdep_assert_held_write(&mdsc->snap_rwsem); 191 186 192 - dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); 187 + dout("%s %p %llx\n", __func__, realm, realm->ino); 193 188 194 189 rb_erase(&realm->node, &mdsc->snap_realms); 195 190 mdsc->num_snap_realms--; ··· 265 260 spin_unlock(&mdsc->snap_empty_lock); 266 261 } 267 262 268 - void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc) 263 + void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc) 269 264 { 265 + struct ceph_snap_realm *global_realm; 266 + 270 267 down_write(&mdsc->snap_rwsem); 268 + global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM); 269 + if (global_realm) 270 + ceph_put_snap_realm(mdsc, global_realm); 271 271 __cleanup_empty_realms(mdsc); 272 272 up_write(&mdsc->snap_rwsem); 273 273 } ··· 302 292 if (IS_ERR(parent)) 
303 293 return PTR_ERR(parent); 304 294 } 305 - dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n", 306 - realm->ino, realm, realm->parent_ino, realm->parent, 307 - parentino, parent); 295 + dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino, 296 + realm, realm->parent_ino, realm->parent, parentino, parent); 308 297 if (realm->parent) { 309 298 list_del_init(&realm->child_item); 310 299 ceph_put_snap_realm(mdsc, realm->parent); ··· 329 320 * build the snap context for a given realm. 330 321 */ 331 322 static int build_snap_context(struct ceph_snap_realm *realm, 332 - struct list_head* dirty_realms) 323 + struct list_head *realm_queue, 324 + struct list_head *dirty_realms) 333 325 { 334 326 struct ceph_snap_realm *parent = realm->parent; 335 327 struct ceph_snap_context *snapc; ··· 344 334 */ 345 335 if (parent) { 346 336 if (!parent->cached_context) { 347 - err = build_snap_context(parent, dirty_realms); 348 - if (err) 349 - goto fail; 337 + /* add to the queue head */ 338 + list_add(&parent->rebuild_item, realm_queue); 339 + return 1; 350 340 } 351 341 num += parent->cached_context->num_snaps; 352 342 } ··· 359 349 realm->cached_context->seq == realm->seq && 360 350 (!parent || 361 351 realm->cached_context->seq >= parent->cached_context->seq)) { 362 - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)" 363 - " (unchanged)\n", 364 - realm->ino, realm, realm->cached_context, 352 + dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n", 353 + __func__, realm->ino, realm, realm->cached_context, 365 354 realm->cached_context->seq, 366 355 (unsigned int)realm->cached_context->num_snaps); 367 356 return 0; ··· 399 390 400 391 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); 401 392 snapc->num_snaps = num; 402 - dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n", 403 - realm->ino, realm, snapc, snapc->seq, 404 - (unsigned int) snapc->num_snaps); 393 + dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino, 394 + 
realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); 405 395 406 396 ceph_put_snap_context(realm->cached_context); 407 397 realm->cached_context = snapc; ··· 417 409 ceph_put_snap_context(realm->cached_context); 418 410 realm->cached_context = NULL; 419 411 } 420 - pr_err("build_snap_context %llx %p fail %d\n", realm->ino, 421 - realm, err); 412 + pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err); 422 413 return err; 423 414 } 424 415 ··· 427 420 static void rebuild_snap_realms(struct ceph_snap_realm *realm, 428 421 struct list_head *dirty_realms) 429 422 { 430 - struct ceph_snap_realm *child; 423 + LIST_HEAD(realm_queue); 424 + int last = 0; 425 + bool skip = false; 431 426 432 - dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); 433 - build_snap_context(realm, dirty_realms); 427 + list_add_tail(&realm->rebuild_item, &realm_queue); 434 428 435 - list_for_each_entry(child, &realm->children, child_item) 436 - rebuild_snap_realms(child, dirty_realms); 429 + while (!list_empty(&realm_queue)) { 430 + struct ceph_snap_realm *_realm, *child; 431 + 432 + _realm = list_first_entry(&realm_queue, 433 + struct ceph_snap_realm, 434 + rebuild_item); 435 + 436 + /* 437 + * If the last building failed dues to memory 438 + * issue, just empty the realm_queue and return 439 + * to avoid infinite loop. 440 + */ 441 + if (last < 0) { 442 + list_del_init(&_realm->rebuild_item); 443 + continue; 444 + } 445 + 446 + last = build_snap_context(_realm, &realm_queue, dirty_realms); 447 + dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm, 448 + last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); 449 + 450 + /* is any child in the list ? 
*/ 451 + list_for_each_entry(child, &_realm->children, child_item) { 452 + if (!list_empty(&child->rebuild_item)) { 453 + skip = true; 454 + break; 455 + } 456 + } 457 + 458 + if (!skip) { 459 + list_for_each_entry(child, &_realm->children, child_item) 460 + list_add_tail(&child->rebuild_item, &realm_queue); 461 + } 462 + 463 + /* last == 1 means need to build parent first */ 464 + if (last <= 0) 465 + list_del_init(&_realm->rebuild_item); 466 + } 437 467 } 438 468 439 469 ··· 518 474 * Caller must hold snap_rwsem for read (i.e., the realm topology won't 519 475 * change). 520 476 */ 521 - static void ceph_queue_cap_snap(struct ceph_inode_info *ci) 477 + static void ceph_queue_cap_snap(struct ceph_inode_info *ci, 478 + struct ceph_cap_snap **pcapsnap) 522 479 { 523 480 struct inode *inode = &ci->vfs_inode; 524 - struct ceph_cap_snap *capsnap; 525 481 struct ceph_snap_context *old_snapc, *new_snapc; 482 + struct ceph_cap_snap *capsnap = *pcapsnap; 526 483 struct ceph_buffer *old_blob = NULL; 527 484 int used, dirty; 528 - 529 - capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); 530 - if (!capsnap) { 531 - pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); 532 - return; 533 - } 534 - capsnap->cap_flush.is_capsnap = true; 535 - INIT_LIST_HEAD(&capsnap->cap_flush.i_list); 536 - INIT_LIST_HEAD(&capsnap->cap_flush.g_list); 537 485 538 486 spin_lock(&ci->i_ceph_lock); 539 487 used = __ceph_caps_used(ci); ··· 547 511 as no new writes are allowed to start when pending, so any 548 512 writes in progress now were started before the previous 549 513 cap_snap. lucky us. 
*/ 550 - dout("queue_cap_snap %p already pending\n", inode); 514 + dout("%s %p %llx.%llx already pending\n", 515 + __func__, inode, ceph_vinop(inode)); 551 516 goto update_snapc; 552 517 } 553 518 if (ci->i_wrbuffer_ref_head == 0 && 554 519 !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { 555 - dout("queue_cap_snap %p nothing dirty|writing\n", inode); 520 + dout("%s %p %llx.%llx nothing dirty|writing\n", 521 + __func__, inode, ceph_vinop(inode)); 556 522 goto update_snapc; 557 523 } 558 524 ··· 574 536 } else { 575 537 if (!(used & CEPH_CAP_FILE_WR) && 576 538 ci->i_wrbuffer_ref_head == 0) { 577 - dout("queue_cap_snap %p " 578 - "no new_snap|dirty_page|writing\n", inode); 539 + dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n", 540 + __func__, inode, ceph_vinop(inode)); 579 541 goto update_snapc; 580 542 } 581 543 } 582 544 583 - dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", 584 - inode, capsnap, old_snapc, ceph_cap_string(dirty), 585 - capsnap->need_flush ? "" : "no_flush"); 545 + dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n", 546 + __func__, inode, ceph_vinop(inode), capsnap, old_snapc, 547 + ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); 586 548 ihold(inode); 587 - 588 - refcount_set(&capsnap->nref, 1); 589 - INIT_LIST_HEAD(&capsnap->ci_item); 590 549 591 550 capsnap->follows = old_snapc->seq; 592 551 capsnap->issued = __ceph_caps_issued(ci, NULL); ··· 614 579 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 615 580 616 581 if (used & CEPH_CAP_FILE_WR) { 617 - dout("queue_cap_snap %p cap_snap %p snapc %p" 618 - " seq %llu used WR, now pending\n", inode, 582 + dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR," 583 + " now pending\n", __func__, inode, ceph_vinop(inode), 619 584 capsnap, old_snapc, old_snapc->seq); 620 585 capsnap->writing = 1; 621 586 } else { 622 587 /* note mtime, size NOW. 
*/ 623 588 __ceph_finish_cap_snap(ci, capsnap); 624 589 } 625 - capsnap = NULL; 590 + *pcapsnap = NULL; 626 591 old_snapc = NULL; 627 592 628 593 update_snapc: 629 - if (ci->i_wrbuffer_ref_head == 0 && 630 - ci->i_wr_ref == 0 && 631 - ci->i_dirty_caps == 0 && 632 - ci->i_flushing_caps == 0) { 633 - ci->i_head_snapc = NULL; 634 - } else { 594 + if (ci->i_wrbuffer_ref_head == 0 && 595 + ci->i_wr_ref == 0 && 596 + ci->i_dirty_caps == 0 && 597 + ci->i_flushing_caps == 0) { 598 + ci->i_head_snapc = NULL; 599 + } else { 635 600 ci->i_head_snapc = ceph_get_snap_context(new_snapc); 636 601 dout(" new snapc is %p\n", new_snapc); 637 602 } 638 603 spin_unlock(&ci->i_ceph_lock); 639 604 640 605 ceph_buffer_put(old_blob); 641 - kfree(capsnap); 642 606 ceph_put_snap_context(old_snapc); 643 607 } 644 608 ··· 666 632 capsnap->truncate_size = ci->i_truncate_size; 667 633 capsnap->truncate_seq = ci->i_truncate_seq; 668 634 if (capsnap->dirty_pages) { 669 - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " 670 - "still has %d dirty pages\n", inode, capsnap, 671 - capsnap->context, capsnap->context->seq, 672 - ceph_cap_string(capsnap->dirty), capsnap->size, 673 - capsnap->dirty_pages); 635 + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " 636 + "still has %d dirty pages\n", __func__, inode, 637 + ceph_vinop(inode), capsnap, capsnap->context, 638 + capsnap->context->seq, ceph_cap_string(capsnap->dirty), 639 + capsnap->size, capsnap->dirty_pages); 674 640 return 0; 675 641 } 676 642 677 643 /* Fb cap still in use, delay it */ 678 644 if (ci->i_wb_ref) { 679 - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " 680 - "used WRBUFFER, delaying\n", inode, capsnap, 681 - capsnap->context, capsnap->context->seq, 682 - ceph_cap_string(capsnap->dirty), capsnap->size); 645 + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " 646 + "used WRBUFFER, delaying\n", __func__, inode, 647 + ceph_vinop(inode), capsnap, capsnap->context, 648 + 
capsnap->context->seq, ceph_cap_string(capsnap->dirty), 649 + capsnap->size); 683 650 capsnap->writing = 1; 684 651 return 0; 685 652 } 686 653 687 654 ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; 688 - dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", 689 - inode, capsnap, capsnap->context, 655 + dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", 656 + __func__, inode, ceph_vinop(inode), capsnap, capsnap->context, 690 657 capsnap->context->seq, ceph_cap_string(capsnap->dirty), 691 658 capsnap->size); 692 659 ··· 706 671 { 707 672 struct ceph_inode_info *ci; 708 673 struct inode *lastinode = NULL; 674 + struct ceph_cap_snap *capsnap = NULL; 709 675 710 - dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); 676 + dout("%s %p %llx inode\n", __func__, realm, realm->ino); 711 677 712 678 spin_lock(&realm->inodes_with_caps_lock); 713 679 list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { ··· 718 682 spin_unlock(&realm->inodes_with_caps_lock); 719 683 iput(lastinode); 720 684 lastinode = inode; 721 - ceph_queue_cap_snap(ci); 685 + 686 + /* 687 + * Allocate the capsnap memory outside of ceph_queue_cap_snap() 688 + * to reduce very possible but unnecessary frequently memory 689 + * allocate/free in this loop. 
690 + */ 691 + if (!capsnap) { 692 + capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); 693 + if (!capsnap) { 694 + pr_err("ENOMEM allocating ceph_cap_snap on %p\n", 695 + inode); 696 + return; 697 + } 698 + } 699 + capsnap->cap_flush.is_capsnap = true; 700 + refcount_set(&capsnap->nref, 1); 701 + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); 702 + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); 703 + INIT_LIST_HEAD(&capsnap->ci_item); 704 + 705 + ceph_queue_cap_snap(ci, &capsnap); 722 706 spin_lock(&realm->inodes_with_caps_lock); 723 707 } 724 708 spin_unlock(&realm->inodes_with_caps_lock); 725 709 iput(lastinode); 726 710 727 - dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); 711 + if (capsnap) 712 + kmem_cache_free(ceph_cap_snap_cachep, capsnap); 713 + dout("%s %p %llx done\n", __func__, realm, realm->ino); 728 714 } 729 715 730 716 /* ··· 765 707 __le64 *prior_parent_snaps; /* encoded */ 766 708 struct ceph_snap_realm *realm = NULL; 767 709 struct ceph_snap_realm *first_realm = NULL; 768 - int invalidate = 0; 710 + struct ceph_snap_realm *realm_to_rebuild = NULL; 711 + int rebuild_snapcs; 769 712 int err = -ENOMEM; 770 713 LIST_HEAD(dirty_realms); 771 714 772 715 lockdep_assert_held_write(&mdsc->snap_rwsem); 773 716 774 - dout("update_snap_trace deletion=%d\n", deletion); 717 + dout("%s deletion=%d\n", __func__, deletion); 775 718 more: 719 + rebuild_snapcs = 0; 776 720 ceph_decode_need(&p, e, sizeof(*ri), bad); 777 721 ri = p; 778 722 p += sizeof(*ri); ··· 798 738 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); 799 739 if (err < 0) 800 740 goto fail; 801 - invalidate += err; 741 + rebuild_snapcs += err; 802 742 803 743 if (le64_to_cpu(ri->seq) > realm->seq) { 804 - dout("update_snap_trace updating %llx %p %lld -> %lld\n", 744 + dout("%s updating %llx %p %lld -> %lld\n", __func__, 805 745 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); 806 746 /* update realm parameters, snap lists */ 807 747 realm->seq = 
le64_to_cpu(ri->seq); ··· 823 763 if (realm->seq > mdsc->last_snap_seq) 824 764 mdsc->last_snap_seq = realm->seq; 825 765 826 - invalidate = 1; 766 + rebuild_snapcs = 1; 827 767 } else if (!realm->cached_context) { 828 - dout("update_snap_trace %llx %p seq %lld new\n", 768 + dout("%s %llx %p seq %lld new\n", __func__, 829 769 realm->ino, realm, realm->seq); 830 - invalidate = 1; 770 + rebuild_snapcs = 1; 831 771 } else { 832 - dout("update_snap_trace %llx %p seq %lld unchanged\n", 772 + dout("%s %llx %p seq %lld unchanged\n", __func__, 833 773 realm->ino, realm, realm->seq); 834 774 } 835 775 836 - dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, 837 - realm, invalidate, p, e); 776 + dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, 777 + realm, rebuild_snapcs, p, e); 838 778 839 - /* invalidate when we reach the _end_ (root) of the trace */ 840 - if (invalidate && p >= e) 841 - rebuild_snap_realms(realm, &dirty_realms); 779 + /* 780 + * this will always track the uppest parent realm from which 781 + * we need to rebuild the snapshot contexts _downward_ in 782 + * hierarchy. 
783 + */ 784 + if (rebuild_snapcs) 785 + realm_to_rebuild = realm; 786 + 787 + /* rebuild_snapcs when we reach the _end_ (root) of the trace */ 788 + if (realm_to_rebuild && p >= e) 789 + rebuild_snap_realms(realm_to_rebuild, &dirty_realms); 842 790 843 791 if (!first_realm) 844 792 first_realm = realm; ··· 882 814 ceph_put_snap_realm(mdsc, realm); 883 815 if (first_realm) 884 816 ceph_put_snap_realm(mdsc, first_realm); 885 - pr_err("update_snap_trace error %d\n", err); 817 + pr_err("%s error %d\n", __func__, err); 886 818 return err; 887 819 } 888 820 ··· 899 831 struct inode *inode; 900 832 struct ceph_mds_session *session = NULL; 901 833 902 - dout("flush_snaps\n"); 834 + dout("%s\n", __func__); 903 835 spin_lock(&mdsc->snap_flush_lock); 904 836 while (!list_empty(&mdsc->snap_flush_list)) { 905 837 ci = list_first_entry(&mdsc->snap_flush_list, ··· 914 846 spin_unlock(&mdsc->snap_flush_lock); 915 847 916 848 ceph_put_mds_session(session); 917 - dout("flush_snaps done\n"); 849 + dout("%s done\n", __func__); 918 850 } 919 851 920 852 /** ··· 996 928 trace_len = le32_to_cpu(h->trace_len); 997 929 p += sizeof(*h); 998 930 999 - dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds, 1000 - ceph_snap_op_name(op), split, trace_len); 931 + dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, 932 + mds, ceph_snap_op_name(op), split, trace_len); 1001 933 1002 934 mutex_lock(&session->s_mutex); 1003 935 inc_session_sequence(session); ··· 1057 989 */ 1058 990 if (ci->i_snap_realm->created > 1059 991 le64_to_cpu(ri->created)) { 1060 - dout(" leaving %p in newer realm %llx %p\n", 1061 - inode, ci->i_snap_realm->ino, 992 + dout(" leaving %p %llx.%llx in newer realm %llx %p\n", 993 + inode, ceph_vinop(inode), ci->i_snap_realm->ino, 1062 994 ci->i_snap_realm); 1063 995 goto skip_inode; 1064 996 } 1065 - dout(" will move %p to split realm %llx %p\n", 1066 - inode, realm->ino, realm); 997 + dout(" will move %p %llx.%llx to split realm %llx %p\n", 998 + inode, 
ceph_vinop(inode), realm->ino, realm); 1067 999 1068 1000 ceph_get_snap_realm(mdsc, realm); 1069 1001 ceph_change_snap_realm(inode, realm); ··· 1106 1038 return; 1107 1039 1108 1040 bad: 1109 - pr_err("corrupt snap message from mds%d\n", mds); 1041 + pr_err("%s corrupt snap message from mds%d\n", __func__, mds); 1110 1042 ceph_msg_dump(msg); 1111 1043 out: 1112 1044 if (locked_rwsem) ··· 1139 1071 } 1140 1072 spin_unlock(&mdsc->snapid_map_lock); 1141 1073 if (exist) { 1142 - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); 1074 + dout("%s found snapid map %llx -> %x\n", __func__, 1075 + exist->snap, exist->dev); 1143 1076 return exist; 1144 1077 } 1145 1078 ··· 1184 1115 if (exist) { 1185 1116 free_anon_bdev(sm->dev); 1186 1117 kfree(sm); 1187 - dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); 1118 + dout("%s found snapid map %llx -> %x\n", __func__, 1119 + exist->snap, exist->dev); 1188 1120 return exist; 1189 1121 } 1190 1122 1191 - dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); 1123 + dout("%s create snapid map %llx -> %x\n", __func__, 1124 + sm->snap, sm->dev); 1192 1125 return sm; 1193 1126 } 1194 1127
+1
fs/ceph/strings.c
··· 60 60 case CEPH_MDS_OP_LOOKUPINO: return "lookupino"; 61 61 case CEPH_MDS_OP_LOOKUPNAME: return "lookupname"; 62 62 case CEPH_MDS_OP_GETATTR: return "getattr"; 63 + case CEPH_MDS_OP_GETVXATTR: return "getvxattr"; 63 64 case CEPH_MDS_OP_SETXATTR: return "setxattr"; 64 65 case CEPH_MDS_OP_SETATTR: return "setattr"; 65 66 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+7
fs/ceph/super.c
··· 865 865 */ 866 866 struct kmem_cache *ceph_inode_cachep; 867 867 struct kmem_cache *ceph_cap_cachep; 868 + struct kmem_cache *ceph_cap_snap_cachep; 868 869 struct kmem_cache *ceph_cap_flush_cachep; 869 870 struct kmem_cache *ceph_dentry_cachep; 870 871 struct kmem_cache *ceph_file_cachep; ··· 894 893 ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); 895 894 if (!ceph_cap_cachep) 896 895 goto bad_cap; 896 + ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD); 897 + if (!ceph_cap_snap_cachep) 898 + goto bad_cap_snap; 897 899 ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, 898 900 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); 899 901 if (!ceph_cap_flush_cachep) ··· 936 932 bad_dentry: 937 933 kmem_cache_destroy(ceph_cap_flush_cachep); 938 934 bad_cap_flush: 935 + kmem_cache_destroy(ceph_cap_snap_cachep); 936 + bad_cap_snap: 939 937 kmem_cache_destroy(ceph_cap_cachep); 940 938 bad_cap: 941 939 kmem_cache_destroy(ceph_inode_cachep); ··· 954 948 955 949 kmem_cache_destroy(ceph_inode_cachep); 956 950 kmem_cache_destroy(ceph_cap_cachep); 951 + kmem_cache_destroy(ceph_cap_snap_cachep); 957 952 kmem_cache_destroy(ceph_cap_flush_cachep); 958 953 kmem_cache_destroy(ceph_dentry_cachep); 959 954 kmem_cache_destroy(ceph_file_cachep);
+6 -3
fs/ceph/super.h
··· 231 231 if (refcount_dec_and_test(&capsnap->nref)) { 232 232 if (capsnap->xattr_blob) 233 233 ceph_buffer_put(capsnap->xattr_blob); 234 - kfree(capsnap); 234 + kmem_cache_free(ceph_cap_snap_cachep, capsnap); 235 235 } 236 236 } 237 237 ··· 884 884 885 885 struct list_head dirty_item; /* if realm needs new context */ 886 886 887 + struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */ 888 + 887 889 /* the current set of snaps for this realm */ 888 890 struct ceph_snap_context *cached_context; 889 891 ··· 941 939 struct ceph_msg *msg); 942 940 extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 943 941 struct ceph_cap_snap *capsnap); 944 - extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 942 + extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc); 945 943 946 944 extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, 947 945 u64 snap); ··· 1051 1049 1052 1050 /* xattr.c */ 1053 1051 int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); 1052 + int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size); 1054 1053 ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); 1055 1054 extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); 1056 1055 extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); ··· 1217 1214 /* addr.c */ 1218 1215 extern const struct address_space_operations ceph_aops; 1219 1216 extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); 1220 - extern int ceph_uninline_data(struct file *filp, struct page *locked_page); 1217 + extern int ceph_uninline_data(struct file *file); 1221 1218 extern int ceph_pool_perm_check(struct inode *inode, int need); 1222 1219 extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); 1223 1220 int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
+11 -2
fs/ceph/xattr.c
··· 923 923 { 924 924 struct ceph_inode_info *ci = ceph_inode(inode); 925 925 struct ceph_inode_xattr *xattr; 926 - struct ceph_vxattr *vxattr = NULL; 926 + struct ceph_vxattr *vxattr; 927 927 int req_mask; 928 928 ssize_t err; 929 + 930 + if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) 931 + goto handle_non_vxattrs; 929 932 930 933 /* let's see if a virtual xattr was requested */ 931 934 vxattr = ceph_match_vxattr(inode, name); ··· 948 945 err = -ERANGE; 949 946 } 950 947 return err; 948 + } else { 949 + err = ceph_do_getvxattr(inode, name, value, size); 950 + /* this would happen with a new client and old server combo */ 951 + if (err == -EOPNOTSUPP) 952 + err = -ENODATA; 953 + return err; 951 954 } 952 - 955 + handle_non_vxattrs: 953 956 req_mask = __get_request_mask(inode); 954 957 955 958 spin_lock(&ci->i_ceph_lock);
+3 -2
include/linux/ceph/ceph_fs.h
··· 28 28 29 29 30 30 #define CEPH_INO_ROOT 1 31 - #define CEPH_INO_CEPH 2 /* hidden .ceph dir */ 32 - #define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */ 31 + #define CEPH_INO_CEPH 2 /* hidden .ceph dir */ 32 + #define CEPH_INO_GLOBAL_SNAPREALM 3 /* global dummy snaprealm */ 33 33 34 34 /* arbitrary limit on max # of monitors (cluster of 3 is typical) */ 35 35 #define CEPH_MAX_MON 31 ··· 328 328 CEPH_MDS_OP_LOOKUPPARENT = 0x00103, 329 329 CEPH_MDS_OP_LOOKUPINO = 0x00104, 330 330 CEPH_MDS_OP_LOOKUPNAME = 0x00105, 331 + CEPH_MDS_OP_GETVXATTR = 0x00106, 331 332 332 333 CEPH_MDS_OP_SETXATTR = 0x01105, 333 334 CEPH_MDS_OP_RMXATTR = 0x01106,
+1
include/linux/ceph/libceph.h
··· 284 284 285 285 extern struct kmem_cache *ceph_inode_cachep; 286 286 extern struct kmem_cache *ceph_cap_cachep; 287 + extern struct kmem_cache *ceph_cap_snap_cachep; 287 288 extern struct kmem_cache *ceph_cap_flush_cachep; 288 289 extern struct kmem_cache *ceph_dentry_cachep; 289 290 extern struct kmem_cache *ceph_file_cachep;
+2 -6
net/ceph/messenger_v2.c
··· 1773 1773 1774 1774 bv.bv_page = con->bounce_page; 1775 1775 bv.bv_offset = 0; 1776 - set_in_bvec(con, &bv); 1777 - } else { 1778 - set_in_bvec(con, &bv); 1779 1776 } 1777 + set_in_bvec(con, &bv); 1780 1778 con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT; 1781 1779 return 0; 1782 1780 } ··· 1805 1807 if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) { 1806 1808 bv.bv_page = con->bounce_page; 1807 1809 bv.bv_offset = 0; 1808 - set_in_bvec(con, &bv); 1809 - } else { 1810 - set_in_bvec(con, &bv); 1811 1810 } 1811 + set_in_bvec(con, &bv); 1812 1812 WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT); 1813 1813 return; 1814 1814 }