Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

pNFS: Fix a deadlock when returning a delegation during open()

Ben Coddington reports seeing a hang with the following stack trace:
0 [ffffd0b50e1774e0] __schedule at ffffffff9ca05415
1 [ffffd0b50e177548] schedule at ffffffff9ca05717
2 [ffffd0b50e177558] bit_wait at ffffffff9ca061e1
3 [ffffd0b50e177568] __wait_on_bit at ffffffff9ca05cfb
4 [ffffd0b50e1775c8] out_of_line_wait_on_bit at ffffffff9ca05ea5
5 [ffffd0b50e177618] pnfs_roc at ffffffffc154207b [nfsv4]
6 [ffffd0b50e1776b8] _nfs4_proc_delegreturn at ffffffffc1506586 [nfsv4]
7 [ffffd0b50e177788] nfs4_proc_delegreturn at ffffffffc1507480 [nfsv4]
8 [ffffd0b50e1777f8] nfs_do_return_delegation at ffffffffc1523e41 [nfsv4]
9 [ffffd0b50e177838] nfs_inode_set_delegation at ffffffffc1524a75 [nfsv4]
10 [ffffd0b50e177888] nfs4_process_delegation at ffffffffc14f41dd [nfsv4]
11 [ffffd0b50e1778a0] _nfs4_opendata_to_nfs4_state at ffffffffc1503edf [nfsv4]
12 [ffffd0b50e1778c0] _nfs4_open_and_get_state at ffffffffc1504e56 [nfsv4]
13 [ffffd0b50e177978] _nfs4_do_open at ffffffffc15051b8 [nfsv4]
14 [ffffd0b50e1779f8] nfs4_do_open at ffffffffc150559c [nfsv4]
15 [ffffd0b50e177a80] nfs4_atomic_open at ffffffffc15057fb [nfsv4]
16 [ffffd0b50e177ad0] nfs4_file_open at ffffffffc15219be [nfsv4]
17 [ffffd0b50e177b78] do_dentry_open at ffffffff9c09e6ea
18 [ffffd0b50e177ba8] vfs_open at ffffffff9c0a082e
19 [ffffd0b50e177bd0] dentry_open at ffffffff9c0a0935

The issue is that the delegreturn is being asked to wait for a layout
return that cannot complete because a state recovery was initiated. The
state recovery, in turn, cannot complete until the open() finishes
processing the delegations it was given, so each side ends up waiting
on the other.

The solution is to propagate the callers' existing flags, which indicate
whether the call may block, down to pnfs_roc(), so that it knows not to
wait in this situation.

Reported-by: Benjamin Coddington <bcodding@hammerspace.com>
Fixes: 29ade5db1293 ("pNFS: Wait on outstanding layoutreturns to complete in pnfs_roc()")
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>

+51 -30
+3 -3
fs/nfs/nfs4proc.c
··· 3894 3894 calldata->res.seqid = calldata->arg.seqid; 3895 3895 calldata->res.server = server; 3896 3896 calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; 3897 - calldata->lr.roc = pnfs_roc(state->inode, 3898 - &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred); 3897 + calldata->lr.roc = pnfs_roc(state->inode, &calldata->lr.arg, 3898 + &calldata->lr.res, msg.rpc_cred, wait); 3899 3899 if (calldata->lr.roc) { 3900 3900 calldata->arg.lr_args = &calldata->lr.arg; 3901 3901 calldata->res.lr_res = &calldata->lr.res; ··· 7005 7005 data->inode = nfs_igrab_and_active(inode); 7006 7006 if (data->inode || issync) { 7007 7007 data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, 7008 - cred); 7008 + cred, issync); 7009 7009 if (data->lr.roc) { 7010 7010 data->args.lr_args = &data->lr.arg; 7011 7011 data->res.lr_res = &data->lr.res;
+41 -17
fs/nfs/pnfs.c
··· 1533 1533 PNFS_FL_LAYOUTRETURN_PRIVILEGED); 1534 1534 } 1535 1535 1536 - bool pnfs_roc(struct inode *ino, 1537 - struct nfs4_layoutreturn_args *args, 1538 - struct nfs4_layoutreturn_res *res, 1539 - const struct cred *cred) 1536 + bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, 1537 + struct nfs4_layoutreturn_res *res, const struct cred *cred, 1538 + bool sync) 1540 1539 { 1541 1540 struct nfs_inode *nfsi = NFS_I(ino); 1542 1541 struct nfs_open_context *ctx; ··· 1546 1547 nfs4_stateid stateid; 1547 1548 enum pnfs_iomode iomode = 0; 1548 1549 bool layoutreturn = false, roc = false; 1549 - bool skip_read = false; 1550 + bool skip_read; 1550 1551 1551 1552 if (!nfs_have_layout(ino)) 1552 1553 return false; ··· 1559 1560 lo = NULL; 1560 1561 goto out_noroc; 1561 1562 } 1562 - pnfs_get_layout_hdr(lo); 1563 - if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { 1564 - spin_unlock(&ino->i_lock); 1565 - rcu_read_unlock(); 1566 - wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, 1567 - TASK_UNINTERRUPTIBLE); 1568 - pnfs_put_layout_hdr(lo); 1569 - goto retry; 1570 - } 1571 1563 1572 1564 /* no roc if we hold a delegation */ 1565 + skip_read = false; 1573 1566 if (nfs4_check_delegation(ino, FMODE_READ)) { 1574 - if (nfs4_check_delegation(ino, FMODE_WRITE)) 1567 + if (nfs4_check_delegation(ino, FMODE_WRITE)) { 1568 + lo = NULL; 1575 1569 goto out_noroc; 1570 + } 1576 1571 skip_read = true; 1577 1572 } 1578 1573 ··· 1575 1582 if (state == NULL) 1576 1583 continue; 1577 1584 /* Don't return layout if there is open file state */ 1578 - if (state->state & FMODE_WRITE) 1585 + if (state->state & FMODE_WRITE) { 1586 + lo = NULL; 1579 1587 goto out_noroc; 1588 + } 1580 1589 if (state->state & FMODE_READ) 1581 1590 skip_read = true; 1582 1591 } 1583 1592 1593 + if (skip_read) { 1594 + bool writes = false; 1595 + 1596 + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1597 + if (lseg->pls_range.iomode != IOMODE_READ) { 1598 + writes = true; 1599 + break; 
1600 + } 1601 + } 1602 + if (!writes) { 1603 + lo = NULL; 1604 + goto out_noroc; 1605 + } 1606 + } 1607 + 1608 + pnfs_get_layout_hdr(lo); 1609 + if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { 1610 + if (!sync) { 1611 + pnfs_set_plh_return_info( 1612 + lo, skip_read ? IOMODE_RW : IOMODE_ANY, 0); 1613 + goto out_noroc; 1614 + } 1615 + spin_unlock(&ino->i_lock); 1616 + rcu_read_unlock(); 1617 + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, 1618 + TASK_UNINTERRUPTIBLE); 1619 + pnfs_put_layout_hdr(lo); 1620 + goto retry; 1621 + } 1584 1622 1585 1623 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) { 1586 1624 if (skip_read && lseg->pls_range.iomode == IOMODE_READ) ··· 1651 1627 out_noroc: 1652 1628 spin_unlock(&ino->i_lock); 1653 1629 rcu_read_unlock(); 1654 - pnfs_layoutcommit_inode(ino, true); 1630 + pnfs_layoutcommit_inode(ino, sync); 1655 1631 if (roc) { 1656 1632 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; 1657 1633 if (ld->prepare_layoutreturn)
+7 -10
fs/nfs/pnfs.h
··· 303 303 u32 seq); 304 304 int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, 305 305 struct list_head *lseg_list); 306 - bool pnfs_roc(struct inode *ino, 307 - struct nfs4_layoutreturn_args *args, 308 - struct nfs4_layoutreturn_res *res, 309 - const struct cred *cred); 306 + bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, 307 + struct nfs4_layoutreturn_res *res, const struct cred *cred, 308 + bool sync); 310 309 int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, 311 310 struct nfs4_layoutreturn_res **respp, int *ret); 312 311 void pnfs_roc_release(struct nfs4_layoutreturn_args *args, ··· 772 773 return false; 773 774 } 774 775 775 - 776 - static inline bool 777 - pnfs_roc(struct inode *ino, 778 - struct nfs4_layoutreturn_args *args, 779 - struct nfs4_layoutreturn_res *res, 780 - const struct cred *cred) 776 + static inline bool pnfs_roc(struct inode *ino, 777 + struct nfs4_layoutreturn_args *args, 778 + struct nfs4_layoutreturn_res *res, 779 + const struct cred *cred, bool sync) 781 780 { 782 781 return false; 783 782 }