Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ceph: fix possible deadlock when holding Fwb to get inline_data

1, mount with wsync.
2, create a file with O_RDWR, and the request was sent to mds.0:

ceph_atomic_open()-->
ceph_mdsc_do_request(openc)
finish_open(file, dentry, ceph_open)-->
ceph_open()-->
ceph_init_file()-->
ceph_init_file_info()-->
ceph_uninline_data()-->
{
...
if (inline_version == 1 || /* initial version, no data */
inline_version == CEPH_INLINE_NONE)
goto out_unlock;
...
}

The inline_version will be 1, which is the initial version for a
newly created file. And here the ci->i_inline_version will stay at 1,
which is buggy.

3, buffer write to the file immediately:

ceph_write_iter()-->
ceph_get_caps(file, need=Fw, want=Fb, ...);
generic_perform_write()-->
a_ops->write_begin()-->
ceph_write_begin()-->
netfs_write_begin()-->
netfs_begin_read()-->
netfs_rreq_submit_slice()-->
netfs_read_from_server()-->
rreq->netfs_ops->issue_read()-->
ceph_netfs_issue_read()-->
{
...
if (ci->i_inline_version != CEPH_INLINE_NONE &&
ceph_netfs_issue_op_inline(subreq))
return;
...
}
ceph_put_cap_refs(ci, Fwb);

The ceph_netfs_issue_op_inline() will send a getattr(Fsr) request to
mds.1.

4, then mds.1 will request the rd lock for CInode::filelock from
the auth mds.0, and mds.0 will do the CInode::filelock state transition
from excl --> sync, but it needs to revoke the Fxwb caps back from the
clients.

Meanwhile, the kernel client already holds the Fwb caps and is waiting
for the getattr(Fsr) to complete.

It's a deadlock!

URL: https://tracker.ceph.com/issues/55377
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

authored by

Xiubo Li and committed by
Ilya Dryomov
825978fd 3459bd0c

+23 -18
+23 -18
fs/ceph/addr.c
··· 1648 1648 struct inode *inode = file_inode(file); 1649 1649 struct ceph_inode_info *ci = ceph_inode(inode); 1650 1650 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1651 - struct ceph_osd_request *req; 1651 + struct ceph_osd_request *req = NULL; 1652 1652 struct ceph_cap_flush *prealloc_cf; 1653 1653 struct folio *folio = NULL; 1654 1654 u64 inline_version = CEPH_INLINE_NONE; 1655 1655 struct page *pages[1]; 1656 1656 int err = 0; 1657 1657 u64 len; 1658 - 1659 - prealloc_cf = ceph_alloc_cap_flush(); 1660 - if (!prealloc_cf) 1661 - return -ENOMEM; 1662 - 1663 - folio = read_mapping_folio(inode->i_mapping, 0, file); 1664 - if (IS_ERR(folio)) { 1665 - err = PTR_ERR(folio); 1666 - goto out; 1667 - } 1668 - 1669 - folio_lock(folio); 1670 1658 1671 1659 spin_lock(&ci->i_ceph_lock); 1672 1660 inline_version = ci->i_inline_version; ··· 1663 1675 dout("uninline_data %p %llx.%llx inline_version %llu\n", 1664 1676 inode, ceph_vinop(inode), inline_version); 1665 1677 1666 - if (inline_version == 1 || /* initial version, no data */ 1667 - inline_version == CEPH_INLINE_NONE) 1668 - goto out_unlock; 1678 + if (inline_version == CEPH_INLINE_NONE) 1679 + return 0; 1680 + 1681 + prealloc_cf = ceph_alloc_cap_flush(); 1682 + if (!prealloc_cf) 1683 + return -ENOMEM; 1684 + 1685 + if (inline_version == 1) /* initial version, no data */ 1686 + goto out_uninline; 1687 + 1688 + folio = read_mapping_folio(inode->i_mapping, 0, file); 1689 + if (IS_ERR(folio)) { 1690 + err = PTR_ERR(folio); 1691 + goto out; 1692 + } 1693 + 1694 + folio_lock(folio); 1669 1695 1670 1696 len = i_size_read(inode); 1671 1697 if (len > folio_size(folio)) ··· 1745 1743 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 1746 1744 req->r_end_latency, len, err); 1747 1745 1746 + out_uninline: 1748 1747 if (!err) { 1749 1748 int dirty; 1750 1749 ··· 1764 1761 if (err == -ECANCELED) 1765 1762 err = 0; 1766 1763 out_unlock: 1767 - folio_unlock(folio); 1768 - folio_put(folio); 1764 + if 
(folio) { 1765 + folio_unlock(folio); 1766 + folio_put(folio); 1767 + } 1769 1768 out: 1770 1769 ceph_free_cap_flush(prealloc_cf); 1771 1770 dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",