Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'nfs-for-4.14-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs

Pull NFS client updates from Trond Myklebust:
"Hightlights include:

Stable bugfixes:
- Fix mirror allocation in the writeback code to avoid a use after
free
- Fix the O_DSYNC writes to use the correct byte range
- Fix 2 use after free issues in the I/O code

Features:
- Writeback fixes to split up the inode->i_lock in order to reduce
contention
- RPC client receive fixes to reduce the amount of time the
xprt->transport_lock is held when receiving data from a socket into
an XDR buffer.
- Ditto fixes to reduce contention between call side users of the
rdma rb_lock, and its use in rpcrdma_reply_handler.
- Re-arrange rdma stats to reduce false cacheline sharing.
- Various rdma cleanups and optimisations.
- Refactor the NFSv4.1 exchange id code and clean up the code.
- Const-ify all instances of struct rpc_xprt_ops

Bugfixes:
- Fix the NFSv2 'sec=' mount option.
- NFSv4.1: don't use machine credentials for CLOSE when using
'sec=sys'
- Fix the NFSv3 GRANT callback when the port changes on the server.
- Fix livelock issues with COMMIT
- NFSv4: Use correct inode in _nfs4_opendata_to_nfs4_state() when
doing an NFSv4.1 open by filehandle"

* tag 'nfs-for-4.14-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (69 commits)
NFS: Count the bytes of skipped subrequests in nfs_lock_and_join_requests()
NFS: Don't hold the group lock when calling nfs_release_request()
NFS: Remove pnfs_generic_transfer_commit_list()
NFS: nfs_lock_and_join_requests and nfs_scan_commit_list can deadlock
NFS: Fix 2 use after free issues in the I/O code
NFS: Sync the correct byte range during synchronous writes
lockd: Delete an error message for a failed memory allocation in reclaimer()
NFS: remove jiffies field from access cache
NFS: flush data when locking a file to ensure cache coherence for mmap.
SUNRPC: remove some dead code.
NFS: don't expect errors from mempool_alloc().
xprtrdma: Use xprt_pin_rqst in rpcrdma_reply_handler
xprtrdma: Re-arrange struct rx_stats
NFS: Fix NFSv2 security settings
NFSv4.1: don't use machine credentials for CLOSE when using 'sec=sys'
SUNRPC: ECONNREFUSED should cause a rebind.
NFS: Remove unused parameter gfp_flags from nfs_pageio_init()
NFSv4: Fix up mirror allocation
SUNRPC: Add a separate spinlock to protect the RPC request receive list
SUNRPC: Cleanup xs_tcp_read_common()
...

+1248 -1153
+1 -5
fs/lockd/clntlock.c
··· 235 235 struct net *net = host->net; 236 236 237 237 req = kmalloc(sizeof(*req), GFP_KERNEL); 238 - if (!req) { 239 - printk(KERN_ERR "lockd: reclaimer unable to alloc memory." 240 - " Locks for %s won't be reclaimed!\n", 241 - host->h_name); 238 + if (!req) 242 239 return 0; 243 - } 244 240 245 241 allow_signal(SIGKILL); 246 242
+1 -1
fs/nfs/callback_proc.c
··· 51 51 goto out_iput; 52 52 res->size = i_size_read(inode); 53 53 res->change_attr = delegation->change_attr; 54 - if (nfsi->nrequests != 0) 54 + if (nfs_have_writebacks(inode)) 55 55 res->change_attr++; 56 56 res->ctime = inode->i_ctime; 57 57 res->mtime = inode->i_mtime;
+1 -1
fs/nfs/delegation.c
··· 1089 1089 delegation = rcu_dereference(nfsi->delegation); 1090 1090 if (delegation == NULL || !(delegation->type & FMODE_WRITE)) 1091 1091 goto out; 1092 - if (nfsi->nrequests < delegation->pagemod_limit) 1092 + if (atomic_long_read(&nfsi->nrequests) < delegation->pagemod_limit) 1093 1093 ret = false; 1094 1094 out: 1095 1095 rcu_read_unlock();
-4
fs/nfs/dir.c
··· 2260 2260 spin_lock(&inode->i_lock); 2261 2261 retry = false; 2262 2262 } 2263 - res->jiffies = cache->jiffies; 2264 2263 res->cred = cache->cred; 2265 2264 res->mask = cache->mask; 2266 2265 list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); ··· 2295 2296 goto out; 2296 2297 if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) 2297 2298 goto out; 2298 - res->jiffies = cache->jiffies; 2299 2299 res->cred = cache->cred; 2300 2300 res->mask = cache->mask; 2301 2301 err = 0; ··· 2342 2344 if (cache == NULL) 2343 2345 return; 2344 2346 RB_CLEAR_NODE(&cache->rb_node); 2345 - cache->jiffies = set->jiffies; 2346 2347 cache->cred = get_rpccred(set->cred); 2347 2348 cache->mask = set->mask; 2348 2349 ··· 2429 2432 cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE 2430 2433 | NFS_MAY_WRITE | NFS_MAY_READ; 2431 2434 cache.cred = cred; 2432 - cache.jiffies = jiffies; 2433 2435 status = NFS_PROTO(inode)->access(inode, &cache); 2434 2436 if (status != 0) { 2435 2437 if (status == -ESTALE) {
+2 -2
fs/nfs/direct.c
··· 616 616 struct list_head *list, 617 617 struct nfs_commit_info *cinfo) 618 618 { 619 - spin_lock(&cinfo->inode->i_lock); 619 + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); 620 620 #ifdef CONFIG_NFS_V4_1 621 621 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) 622 622 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); 623 623 #endif 624 624 nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); 625 - spin_unlock(&cinfo->inode->i_lock); 625 + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); 626 626 } 627 627 628 628 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
+10 -7
fs/nfs/file.c
··· 631 631 if (result <= 0) 632 632 goto out; 633 633 634 - result = generic_write_sync(iocb, result); 635 - if (result < 0) 636 - goto out; 637 634 written = result; 638 635 iocb->ki_pos += written; 636 + result = generic_write_sync(iocb, written); 637 + if (result < 0) 638 + goto out; 639 639 640 640 /* Return error values */ 641 641 if (nfs_need_check_write(file, inode)) { ··· 744 744 goto out; 745 745 746 746 /* 747 - * Revalidate the cache if the server has time stamps granular 748 - * enough to detect subsecond changes. Otherwise, clear the 749 - * cache to prevent missing any changes. 747 + * Invalidate cache to prevent missing any changes. If 748 + * the file is mapped, clear the page cache as well so 749 + * those mappings will be loaded. 750 750 * 751 751 * This makes locking act as a cache coherency point. 752 752 */ 753 753 nfs_sync_mapping(filp->f_mapping); 754 - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) 754 + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { 755 755 nfs_zap_caches(inode); 756 + if (mapping_mapped(filp->f_mapping)) 757 + nfs_revalidate_mapping(inode, filp->f_mapping); 758 + } 756 759 out: 757 760 return status; 758 761 }
+5 -5
fs/nfs/inode.c
··· 1285 1285 1286 1286 static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 1287 1287 { 1288 - struct nfs_inode *nfsi = NFS_I(inode); 1289 1288 unsigned long ret = 0; 1290 1289 1291 1290 if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) ··· 1314 1315 if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) 1315 1316 && (fattr->valid & NFS_ATTR_FATTR_SIZE) 1316 1317 && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) 1317 - && nfsi->nrequests == 0) { 1318 + && !nfs_have_writebacks(inode)) { 1318 1319 i_size_write(inode, nfs_size_to_loff_t(fattr->size)); 1319 1320 ret |= NFS_INO_INVALID_ATTR; 1320 1321 } ··· 1822 1823 if (new_isize != cur_isize) { 1823 1824 /* Do we perhaps have any outstanding writes, or has 1824 1825 * the file grown beyond our last write? */ 1825 - if (nfsi->nrequests == 0 || new_isize > cur_isize) { 1826 + if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { 1826 1827 i_size_write(inode, new_isize); 1827 1828 if (!have_writers) 1828 1829 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; ··· 2011 2012 INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 2012 2013 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 2013 2014 INIT_LIST_HEAD(&nfsi->commit_info.list); 2014 - nfsi->nrequests = 0; 2015 - nfsi->commit_info.ncommit = 0; 2015 + atomic_long_set(&nfsi->nrequests, 0); 2016 + atomic_long_set(&nfsi->commit_info.ncommit, 0); 2016 2017 atomic_set(&nfsi->commit_info.rpcs_out, 0); 2017 2018 init_rwsem(&nfsi->rmdir_sem); 2019 + mutex_init(&nfsi->commit_mutex); 2018 2020 nfs4_init_once(nfsi); 2019 2021 } 2020 2022
-1
fs/nfs/internal.h
··· 251 251 extern const struct nfs_pageio_ops nfs_pgio_rw_ops; 252 252 struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); 253 253 void nfs_pgio_header_free(struct nfs_pgio_header *); 254 - void nfs_pgio_data_destroy(struct nfs_pgio_header *); 255 254 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); 256 255 int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, 257 256 struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+11
fs/nfs/nfs4_fs.h
··· 303 303 struct rpc_cred *newcred = NULL; 304 304 rpc_authflavor_t flavor; 305 305 306 + if (sp4_mode == NFS_SP4_MACH_CRED_CLEANUP || 307 + sp4_mode == NFS_SP4_MACH_CRED_PNFS_CLEANUP) { 308 + /* Using machine creds for cleanup operations 309 + * is only relevent if the client credentials 310 + * might expire. So don't bother for 311 + * RPC_AUTH_UNIX. If file was only exported to 312 + * sec=sys, the PUTFH would fail anyway. 313 + */ 314 + if ((*clntp)->cl_auth->au_flavor == RPC_AUTH_UNIX) 315 + return false; 316 + } 306 317 if (test_bit(sp4_mode, &clp->cl_sp4_flags)) { 307 318 spin_lock(&clp->cl_lock); 308 319 if (clp->cl_machine_cred != NULL)
+150 -124
fs/nfs/nfs4proc.c
··· 1659 1659 return state; 1660 1660 } 1661 1661 1662 + static struct inode * 1663 + nfs4_opendata_get_inode(struct nfs4_opendata *data) 1664 + { 1665 + struct inode *inode; 1666 + 1667 + switch (data->o_arg.claim) { 1668 + case NFS4_OPEN_CLAIM_NULL: 1669 + case NFS4_OPEN_CLAIM_DELEGATE_CUR: 1670 + case NFS4_OPEN_CLAIM_DELEGATE_PREV: 1671 + if (!(data->f_attr.valid & NFS_ATTR_FATTR)) 1672 + return ERR_PTR(-EAGAIN); 1673 + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, 1674 + &data->f_attr, data->f_label); 1675 + break; 1676 + default: 1677 + inode = d_inode(data->dentry); 1678 + ihold(inode); 1679 + nfs_refresh_inode(inode, &data->f_attr); 1680 + } 1681 + return inode; 1682 + } 1683 + 1684 + static struct nfs4_state * 1685 + nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data) 1686 + { 1687 + struct nfs4_state *state; 1688 + struct inode *inode; 1689 + 1690 + inode = nfs4_opendata_get_inode(data); 1691 + if (IS_ERR(inode)) 1692 + return ERR_CAST(inode); 1693 + if (data->state != NULL && data->state->inode == inode) { 1694 + state = data->state; 1695 + atomic_inc(&state->count); 1696 + } else 1697 + state = nfs4_get_open_state(inode, data->owner); 1698 + iput(inode); 1699 + if (state == NULL) 1700 + state = ERR_PTR(-ENOMEM); 1701 + return state; 1702 + } 1703 + 1662 1704 static struct nfs4_state * 1663 1705 _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) 1664 1706 { 1665 - struct inode *inode; 1666 - struct nfs4_state *state = NULL; 1667 - int ret; 1707 + struct nfs4_state *state; 1668 1708 1669 1709 if (!data->rpc_done) { 1670 1710 state = nfs4_try_open_cached(data); ··· 1712 1672 goto out; 1713 1673 } 1714 1674 1715 - ret = -EAGAIN; 1716 - if (!(data->f_attr.valid & NFS_ATTR_FATTR)) 1717 - goto err; 1718 - inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label); 1719 - ret = PTR_ERR(inode); 1720 - if (IS_ERR(inode)) 1721 - goto err; 1722 - ret = -ENOMEM; 1723 - state = nfs4_get_open_state(inode, data->owner); 1724 - if 
(state == NULL) 1725 - goto err_put_inode; 1675 + state = nfs4_opendata_find_nfs4_state(data); 1676 + if (IS_ERR(state)) 1677 + goto out; 1678 + 1726 1679 if (data->o_res.delegation_type != 0) 1727 1680 nfs4_opendata_check_deleg(data, state); 1728 1681 update_open_stateid(state, &data->o_res.stateid, NULL, 1729 1682 data->o_arg.fmode); 1730 - iput(inode); 1731 1683 out: 1732 1684 nfs_release_seqid(data->o_arg.seqid); 1733 1685 return state; 1734 - err_put_inode: 1735 - iput(inode); 1736 - err: 1737 - return ERR_PTR(ret); 1738 1686 } 1739 1687 1740 1688 static struct nfs4_state * ··· 2099 2071 data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; 2100 2072 case NFS4_OPEN_CLAIM_FH: 2101 2073 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; 2102 - nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); 2103 2074 } 2104 2075 data->timestamp = jiffies; 2105 2076 if (nfs4_setup_sequence(data->o_arg.server->nfs_client, ··· 2285 2258 mask = NFS4_ACCESS_READ; 2286 2259 2287 2260 cache.cred = cred; 2288 - cache.jiffies = jiffies; 2289 2261 nfs_access_set_mask(&cache, opendata->o_res.access_result); 2290 2262 nfs_access_add_cache(state->inode, &cache); 2291 2263 ··· 7344 7318 1 << (OP_DESTROY_SESSION - 32) | 7345 7319 1 << (OP_DESTROY_CLIENTID - 32) 7346 7320 }; 7321 + unsigned long flags = 0; 7347 7322 unsigned int i; 7323 + int ret = 0; 7348 7324 7349 7325 if (sp->how == SP4_MACH_CRED) { 7350 7326 /* Print state protect result */ ··· 7362 7334 for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) { 7363 7335 if (sp->enforce.u.words[i] & ~supported_enforce[i]) { 7364 7336 dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); 7365 - return -EINVAL; 7337 + ret = -EINVAL; 7338 + goto out; 7366 7339 } 7367 7340 } 7368 7341 ··· 7382 7353 test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) { 7383 7354 dfprintk(MOUNT, "sp4_mach_cred:\n"); 7384 7355 dfprintk(MOUNT, " minimal mode enabled\n"); 7385 - set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags); 7356 + 
__set_bit(NFS_SP4_MACH_CRED_MINIMAL, &flags); 7386 7357 } else { 7387 7358 dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); 7388 - return -EINVAL; 7359 + ret = -EINVAL; 7360 + goto out; 7389 7361 } 7390 7362 7391 7363 if (test_bit(OP_CLOSE, sp->allow.u.longs) && ··· 7394 7364 test_bit(OP_DELEGRETURN, sp->allow.u.longs) && 7395 7365 test_bit(OP_LOCKU, sp->allow.u.longs)) { 7396 7366 dfprintk(MOUNT, " cleanup mode enabled\n"); 7397 - set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); 7367 + __set_bit(NFS_SP4_MACH_CRED_CLEANUP, &flags); 7398 7368 } 7399 7369 7400 7370 if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) { 7401 7371 dfprintk(MOUNT, " pnfs cleanup mode enabled\n"); 7402 - set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, 7403 - &clp->cl_sp4_flags); 7372 + __set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, &flags); 7404 7373 } 7405 7374 7406 7375 if (test_bit(OP_SECINFO, sp->allow.u.longs) && 7407 7376 test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { 7408 7377 dfprintk(MOUNT, " secinfo mode enabled\n"); 7409 - set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags); 7378 + __set_bit(NFS_SP4_MACH_CRED_SECINFO, &flags); 7410 7379 } 7411 7380 7412 7381 if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) && 7413 7382 test_bit(OP_FREE_STATEID, sp->allow.u.longs)) { 7414 7383 dfprintk(MOUNT, " stateid mode enabled\n"); 7415 - set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags); 7384 + __set_bit(NFS_SP4_MACH_CRED_STATEID, &flags); 7416 7385 } 7417 7386 7418 7387 if (test_bit(OP_WRITE, sp->allow.u.longs)) { 7419 7388 dfprintk(MOUNT, " write mode enabled\n"); 7420 - set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags); 7389 + __set_bit(NFS_SP4_MACH_CRED_WRITE, &flags); 7421 7390 } 7422 7391 7423 7392 if (test_bit(OP_COMMIT, sp->allow.u.longs)) { 7424 7393 dfprintk(MOUNT, " commit mode enabled\n"); 7425 - set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags); 7394 + __set_bit(NFS_SP4_MACH_CRED_COMMIT, &flags); 7426 7395 } 7427 7396 } 7428 - 7397 + out: 7398 + clp->cl_sp4_flags = 
flags; 7429 7399 return 0; 7430 7400 } 7431 7401 7432 7402 struct nfs41_exchange_id_data { 7433 7403 struct nfs41_exchange_id_res res; 7434 7404 struct nfs41_exchange_id_args args; 7435 - struct rpc_xprt *xprt; 7436 - int rpc_status; 7437 7405 }; 7438 - 7439 - static void nfs4_exchange_id_done(struct rpc_task *task, void *data) 7440 - { 7441 - struct nfs41_exchange_id_data *cdata = 7442 - (struct nfs41_exchange_id_data *)data; 7443 - struct nfs_client *clp = cdata->args.client; 7444 - int status = task->tk_status; 7445 - 7446 - trace_nfs4_exchange_id(clp, status); 7447 - 7448 - if (status == 0) 7449 - status = nfs4_check_cl_exchange_flags(cdata->res.flags); 7450 - 7451 - if (cdata->xprt && status == 0) { 7452 - status = nfs4_detect_session_trunking(clp, &cdata->res, 7453 - cdata->xprt); 7454 - goto out; 7455 - } 7456 - 7457 - if (status == 0) 7458 - status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect); 7459 - 7460 - if (status == 0) { 7461 - clp->cl_clientid = cdata->res.clientid; 7462 - clp->cl_exchange_flags = cdata->res.flags; 7463 - clp->cl_seqid = cdata->res.seqid; 7464 - /* Client ID is not confirmed */ 7465 - if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) 7466 - clear_bit(NFS4_SESSION_ESTABLISHED, 7467 - &clp->cl_session->session_state); 7468 - 7469 - kfree(clp->cl_serverowner); 7470 - clp->cl_serverowner = cdata->res.server_owner; 7471 - cdata->res.server_owner = NULL; 7472 - 7473 - /* use the most recent implementation id */ 7474 - kfree(clp->cl_implid); 7475 - clp->cl_implid = cdata->res.impl_id; 7476 - cdata->res.impl_id = NULL; 7477 - 7478 - if (clp->cl_serverscope != NULL && 7479 - !nfs41_same_server_scope(clp->cl_serverscope, 7480 - cdata->res.server_scope)) { 7481 - dprintk("%s: server_scope mismatch detected\n", 7482 - __func__); 7483 - set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); 7484 - kfree(clp->cl_serverscope); 7485 - clp->cl_serverscope = NULL; 7486 - } 7487 - 7488 - if (clp->cl_serverscope == NULL) { 7489 - 
clp->cl_serverscope = cdata->res.server_scope; 7490 - cdata->res.server_scope = NULL; 7491 - } 7492 - /* Save the EXCHANGE_ID verifier session trunk tests */ 7493 - memcpy(clp->cl_confirm.data, cdata->args.verifier.data, 7494 - sizeof(clp->cl_confirm.data)); 7495 - } 7496 - out: 7497 - cdata->rpc_status = status; 7498 - return; 7499 - } 7500 7406 7501 7407 static void nfs4_exchange_id_release(void *data) 7502 7408 { ··· 7447 7481 } 7448 7482 7449 7483 static const struct rpc_call_ops nfs4_exchange_id_call_ops = { 7450 - .rpc_call_done = nfs4_exchange_id_done, 7451 7484 .rpc_release = nfs4_exchange_id_release, 7452 7485 }; 7453 7486 ··· 7455 7490 * 7456 7491 * Wrapper for EXCHANGE_ID operation. 7457 7492 */ 7458 - static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, 7493 + static struct rpc_task * 7494 + nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, 7459 7495 u32 sp4_how, struct rpc_xprt *xprt) 7460 7496 { 7461 7497 struct rpc_message msg = { ··· 7470 7504 .flags = RPC_TASK_TIMEOUT, 7471 7505 }; 7472 7506 struct nfs41_exchange_id_data *calldata; 7473 - struct rpc_task *task; 7474 7507 int status; 7475 7508 7476 7509 if (!atomic_inc_not_zero(&clp->cl_count)) 7477 - return -EIO; 7510 + return ERR_PTR(-EIO); 7478 7511 7512 + status = -ENOMEM; 7479 7513 calldata = kzalloc(sizeof(*calldata), GFP_NOFS); 7480 - if (!calldata) { 7481 - nfs_put_client(clp); 7482 - return -ENOMEM; 7483 - } 7514 + if (!calldata) 7515 + goto out; 7484 7516 7485 7517 nfs4_init_boot_verifier(clp, &calldata->args.verifier); 7486 7518 ··· 7517 7553 goto out_impl_id; 7518 7554 } 7519 7555 if (xprt) { 7520 - calldata->xprt = xprt; 7521 7556 task_setup_data.rpc_xprt = xprt; 7522 7557 task_setup_data.flags |= RPC_TASK_SOFTCONN; 7523 7558 memcpy(calldata->args.verifier.data, clp->cl_confirm.data, 7524 7559 sizeof(calldata->args.verifier.data)); 7525 7560 } 7526 7561 calldata->args.client = clp; 7562 + calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER 
| 7563 + EXCHGID4_FLAG_BIND_PRINC_STATEID; 7527 7564 #ifdef CONFIG_NFS_V4_1_MIGRATION 7528 - calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | 7529 - EXCHGID4_FLAG_BIND_PRINC_STATEID | 7530 - EXCHGID4_FLAG_SUPP_MOVED_MIGR, 7531 - #else 7532 - calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | 7533 - EXCHGID4_FLAG_BIND_PRINC_STATEID, 7565 + calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR; 7534 7566 #endif 7535 7567 msg.rpc_argp = &calldata->args; 7536 7568 msg.rpc_resp = &calldata->res; 7537 7569 task_setup_data.callback_data = calldata; 7538 7570 7539 - task = rpc_run_task(&task_setup_data); 7540 - if (IS_ERR(task)) 7541 - return PTR_ERR(task); 7542 - 7543 - status = calldata->rpc_status; 7544 - 7545 - rpc_put_task(task); 7546 - out: 7547 - return status; 7571 + return rpc_run_task(&task_setup_data); 7548 7572 7549 7573 out_impl_id: 7550 7574 kfree(calldata->res.impl_id); ··· 7542 7590 kfree(calldata->res.server_owner); 7543 7591 out_calldata: 7544 7592 kfree(calldata); 7593 + out: 7545 7594 nfs_put_client(clp); 7546 - goto out; 7595 + return ERR_PTR(status); 7596 + } 7597 + 7598 + /* 7599 + * _nfs4_proc_exchange_id() 7600 + * 7601 + * Wrapper for EXCHANGE_ID operation. 
7602 + */ 7603 + static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, 7604 + u32 sp4_how) 7605 + { 7606 + struct rpc_task *task; 7607 + struct nfs41_exchange_id_args *argp; 7608 + struct nfs41_exchange_id_res *resp; 7609 + int status; 7610 + 7611 + task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL); 7612 + if (IS_ERR(task)) 7613 + return PTR_ERR(task); 7614 + 7615 + argp = task->tk_msg.rpc_argp; 7616 + resp = task->tk_msg.rpc_resp; 7617 + status = task->tk_status; 7618 + if (status != 0) 7619 + goto out; 7620 + 7621 + status = nfs4_check_cl_exchange_flags(resp->flags); 7622 + if (status != 0) 7623 + goto out; 7624 + 7625 + status = nfs4_sp4_select_mode(clp, &resp->state_protect); 7626 + if (status != 0) 7627 + goto out; 7628 + 7629 + clp->cl_clientid = resp->clientid; 7630 + clp->cl_exchange_flags = resp->flags; 7631 + clp->cl_seqid = resp->seqid; 7632 + /* Client ID is not confirmed */ 7633 + if (!(resp->flags & EXCHGID4_FLAG_CONFIRMED_R)) 7634 + clear_bit(NFS4_SESSION_ESTABLISHED, 7635 + &clp->cl_session->session_state); 7636 + 7637 + if (clp->cl_serverscope != NULL && 7638 + !nfs41_same_server_scope(clp->cl_serverscope, 7639 + resp->server_scope)) { 7640 + dprintk("%s: server_scope mismatch detected\n", 7641 + __func__); 7642 + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); 7643 + } 7644 + 7645 + swap(clp->cl_serverowner, resp->server_owner); 7646 + swap(clp->cl_serverscope, resp->server_scope); 7647 + swap(clp->cl_implid, resp->impl_id); 7648 + 7649 + /* Save the EXCHANGE_ID verifier session trunk tests */ 7650 + memcpy(clp->cl_confirm.data, argp->verifier.data, 7651 + sizeof(clp->cl_confirm.data)); 7652 + out: 7653 + trace_nfs4_exchange_id(clp, status); 7654 + rpc_put_task(task); 7655 + return status; 7547 7656 } 7548 7657 7549 7658 /* ··· 7627 7614 /* try SP4_MACH_CRED if krb5i/p */ 7628 7615 if (authflavor == RPC_AUTH_GSS_KRB5I || 7629 7616 authflavor == RPC_AUTH_GSS_KRB5P) { 7630 - status = 
_nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL); 7617 + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); 7631 7618 if (!status) 7632 7619 return 0; 7633 7620 } 7634 7621 7635 7622 /* try SP4_NONE */ 7636 - return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL); 7623 + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); 7637 7624 } 7638 7625 7639 7626 /** ··· 7655 7642 void *data) 7656 7643 { 7657 7644 struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; 7645 + struct rpc_task *task; 7646 + int status; 7647 + 7658 7648 u32 sp4_how; 7659 7649 7660 7650 dprintk("--> %s try %s\n", __func__, ··· 7666 7650 sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); 7667 7651 7668 7652 /* Test connection for session trunking. Async exchange_id call */ 7669 - return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt); 7653 + task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt); 7654 + if (IS_ERR(task)) 7655 + return PTR_ERR(task); 7656 + 7657 + status = task->tk_status; 7658 + if (status == 0) 7659 + status = nfs4_detect_session_trunking(adata->clp, 7660 + task->tk_msg.rpc_resp, xprt); 7661 + 7662 + rpc_put_task(task); 7663 + return status; 7670 7664 } 7671 7665 EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); 7672 7666
+71 -99
fs/nfs/pagelist.c
··· 134 134 /* 135 135 * nfs_page_group_lock - lock the head of the page group 136 136 * @req - request in group that is to be locked 137 - * @nonblock - if true don't block waiting for lock 138 137 * 139 - * this lock must be held if modifying the page group list 138 + * this lock must be held when traversing or modifying the page 139 + * group list 140 140 * 141 - * return 0 on success, < 0 on error: -EDELAY if nonblocking or the 142 - * result from wait_on_bit_lock 143 - * 144 - * NOTE: calling with nonblock=false should always have set the 145 - * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock 146 - * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. 141 + * return 0 on success, < 0 on error 147 142 */ 148 143 int 149 - nfs_page_group_lock(struct nfs_page *req, bool nonblock) 144 + nfs_page_group_lock(struct nfs_page *req) 150 145 { 151 146 struct nfs_page *head = req->wb_head; 152 147 ··· 150 155 if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) 151 156 return 0; 152 157 153 - if (!nonblock) { 154 - set_bit(PG_CONTENDED1, &head->wb_flags); 155 - smp_mb__after_atomic(); 156 - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, 157 - TASK_UNINTERRUPTIBLE); 158 - } 159 - 160 - return -EAGAIN; 161 - } 162 - 163 - /* 164 - * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it 165 - * @req - a request in the group 166 - * 167 - * This is a blocking call to wait for the group lock to be cleared. 
168 - */ 169 - void 170 - nfs_page_group_lock_wait(struct nfs_page *req) 171 - { 172 - struct nfs_page *head = req->wb_head; 173 - 174 - WARN_ON_ONCE(head != head->wb_head); 175 - 176 - if (!test_bit(PG_HEADLOCK, &head->wb_flags)) 177 - return; 178 158 set_bit(PG_CONTENDED1, &head->wb_flags); 179 159 smp_mb__after_atomic(); 180 - wait_on_bit(&head->wb_flags, PG_HEADLOCK, 181 - TASK_UNINTERRUPTIBLE); 160 + return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, 161 + TASK_UNINTERRUPTIBLE); 182 162 } 183 163 184 164 /* ··· 216 246 { 217 247 bool ret; 218 248 219 - nfs_page_group_lock(req, false); 249 + nfs_page_group_lock(req); 220 250 ret = nfs_page_group_sync_on_bit_locked(req, bit); 221 251 nfs_page_group_unlock(req); 222 252 ··· 258 288 inode = page_file_mapping(req->wb_page)->host; 259 289 set_bit(PG_INODE_REF, &req->wb_flags); 260 290 kref_get(&req->wb_kref); 261 - spin_lock(&inode->i_lock); 262 - NFS_I(inode)->nrequests++; 263 - spin_unlock(&inode->i_lock); 291 + atomic_long_inc(&NFS_I(inode)->nrequests); 264 292 } 265 293 } 266 294 } ··· 274 306 nfs_page_group_destroy(struct kref *kref) 275 307 { 276 308 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); 309 + struct nfs_page *head = req->wb_head; 277 310 struct nfs_page *tmp, *next; 278 311 279 - /* subrequests must release the ref on the head request */ 280 - if (req->wb_head != req) 281 - nfs_release_request(req->wb_head); 282 - 283 312 if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) 284 - return; 313 + goto out; 285 314 286 315 tmp = req; 287 316 do { ··· 289 324 nfs_free_request(tmp); 290 325 tmp = next; 291 326 } while (tmp != req); 327 + out: 328 + /* subrequests must release the ref on the head request */ 329 + if (head != req) 330 + nfs_release_request(head); 292 331 } 293 332 294 333 /** ··· 434 465 { 435 466 kref_put(&req->wb_kref, nfs_page_group_destroy); 436 467 } 468 + EXPORT_SYMBOL_GPL(nfs_release_request); 437 469 438 470 /** 439 471 * nfs_wait_on_request - Wait for a 
request to complete. ··· 453 483 return wait_on_bit_io(&req->wb_flags, PG_BUSY, 454 484 TASK_UNINTERRUPTIBLE); 455 485 } 486 + EXPORT_SYMBOL_GPL(nfs_wait_on_request); 456 487 457 488 /* 458 489 * nfs_generic_pg_test - determine if requests can be coalesced ··· 501 530 } 502 531 EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc); 503 532 504 - /* 505 - * nfs_pgio_header_free - Free a read or write header 506 - * @hdr: The header to free 507 - */ 508 - void nfs_pgio_header_free(struct nfs_pgio_header *hdr) 509 - { 510 - hdr->rw_ops->rw_free_header(hdr); 511 - } 512 - EXPORT_SYMBOL_GPL(nfs_pgio_header_free); 513 - 514 533 /** 515 534 * nfs_pgio_data_destroy - make @hdr suitable for reuse 516 535 * ··· 509 548 * 510 549 * @hdr: A header that has had nfs_generic_pgio called 511 550 */ 512 - void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) 551 + static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) 513 552 { 514 553 if (hdr->args.context) 515 554 put_nfs_open_context(hdr->args.context); 516 555 if (hdr->page_array.pagevec != hdr->page_array.page_array) 517 556 kfree(hdr->page_array.pagevec); 518 557 } 519 - EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy); 558 + 559 + /* 560 + * nfs_pgio_header_free - Free a read or write header 561 + * @hdr: The header to free 562 + */ 563 + void nfs_pgio_header_free(struct nfs_pgio_header *hdr) 564 + { 565 + nfs_pgio_data_destroy(hdr); 566 + hdr->rw_ops->rw_free_header(hdr); 567 + } 568 + EXPORT_SYMBOL_GPL(nfs_pgio_header_free); 520 569 521 570 /** 522 571 * nfs_pgio_rpcsetup - Set up arguments for a pageio call ··· 640 669 static void nfs_pgio_error(struct nfs_pgio_header *hdr) 641 670 { 642 671 set_bit(NFS_IOHDR_REDO, &hdr->flags); 643 - nfs_pgio_data_destroy(hdr); 644 672 hdr->completion_ops->completion(hdr); 645 673 } 646 674 ··· 650 680 static void nfs_pgio_release(void *calldata) 651 681 { 652 682 struct nfs_pgio_header *hdr = calldata; 653 - nfs_pgio_data_destroy(hdr); 654 683 hdr->completion_ops->completion(hdr); 655 684 } 656 
685 ··· 680 711 const struct nfs_pgio_completion_ops *compl_ops, 681 712 const struct nfs_rw_ops *rw_ops, 682 713 size_t bsize, 683 - int io_flags, 684 - gfp_t gfp_flags) 714 + int io_flags) 685 715 { 686 - struct nfs_pgio_mirror *new; 687 - int i; 688 - 689 716 desc->pg_moreio = 0; 690 717 desc->pg_inode = inode; 691 718 desc->pg_ops = pg_ops; ··· 697 732 desc->pg_mirror_count = 1; 698 733 desc->pg_mirror_idx = 0; 699 734 700 - if (pg_ops->pg_get_mirror_count) { 701 - /* until we have a request, we don't have an lseg and no 702 - * idea how many mirrors there will be */ 703 - new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, 704 - sizeof(struct nfs_pgio_mirror), gfp_flags); 705 - desc->pg_mirrors_dynamic = new; 706 - desc->pg_mirrors = new; 707 - 708 - for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++) 709 - nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize); 710 - } else { 711 - desc->pg_mirrors_dynamic = NULL; 712 - desc->pg_mirrors = desc->pg_mirrors_static; 713 - nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); 714 - } 735 + desc->pg_mirrors_dynamic = NULL; 736 + desc->pg_mirrors = desc->pg_mirrors_static; 737 + nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); 715 738 } 716 - EXPORT_SYMBOL_GPL(nfs_pageio_init); 717 739 718 740 /** 719 741 * nfs_pgio_result - Basic pageio error handling ··· 817 865 return ret; 818 866 } 819 867 868 + static struct nfs_pgio_mirror * 869 + nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc, 870 + unsigned int mirror_count) 871 + { 872 + struct nfs_pgio_mirror *ret; 873 + unsigned int i; 874 + 875 + kfree(desc->pg_mirrors_dynamic); 876 + desc->pg_mirrors_dynamic = NULL; 877 + if (mirror_count == 1) 878 + return desc->pg_mirrors_static; 879 + ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_NOFS); 880 + if (ret != NULL) { 881 + for (i = 0; i < mirror_count; i++) 882 + nfs_pageio_mirror_init(&ret[i], desc->pg_bsize); 883 + desc->pg_mirrors_dynamic = ret; 884 + } 885 + return ret; 886 + } 887 + 820 888 /* 821 
889 * nfs_pageio_setup_mirroring - determine if mirroring is to be used 822 890 * by calling the pg_get_mirror_count op 823 891 */ 824 - static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, 892 + static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, 825 893 struct nfs_page *req) 826 894 { 827 - int mirror_count = 1; 895 + unsigned int mirror_count = 1; 828 896 829 - if (!pgio->pg_ops->pg_get_mirror_count) 830 - return 0; 897 + if (pgio->pg_ops->pg_get_mirror_count) 898 + mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); 899 + if (mirror_count == pgio->pg_mirror_count || pgio->pg_error < 0) 900 + return; 831 901 832 - mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); 902 + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) { 903 + pgio->pg_error = -EINVAL; 904 + return; 905 + } 833 906 834 - if (pgio->pg_error < 0) 835 - return pgio->pg_error; 836 - 837 - if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) 838 - return -EINVAL; 839 - 840 - if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic)) 841 - return -EINVAL; 842 - 907 + pgio->pg_mirrors = nfs_pageio_alloc_mirrors(pgio, mirror_count); 908 + if (pgio->pg_mirrors == NULL) { 909 + pgio->pg_error = -ENOMEM; 910 + pgio->pg_mirrors = pgio->pg_mirrors_static; 911 + mirror_count = 1; 912 + } 843 913 pgio->pg_mirror_count = mirror_count; 844 - 845 - return 0; 846 914 } 847 915 848 916 /* ··· 1008 1036 unsigned int bytes_left = 0; 1009 1037 unsigned int offset, pgbase; 1010 1038 1011 - nfs_page_group_lock(req, false); 1039 + nfs_page_group_lock(req); 1012 1040 1013 1041 subreq = req; 1014 1042 bytes_left = subreq->wb_bytes; ··· 1030 1058 if (mirror->pg_recoalesce) 1031 1059 return 0; 1032 1060 /* retry add_request for this subreq */ 1033 - nfs_page_group_lock(req, false); 1061 + nfs_page_group_lock(req); 1034 1062 continue; 1035 1063 } 1036 1064 ··· 1127 1155 1128 1156 for (midx = 0; midx < desc->pg_mirror_count; midx++) { 1129 
1157 if (midx) { 1130 - nfs_page_group_lock(req, false); 1158 + nfs_page_group_lock(req); 1131 1159 1132 1160 /* find the last request */ 1133 1161 for (lastreq = req->wb_head;
-43
fs/nfs/pnfs.c
··· 529 529 } 530 530 EXPORT_SYMBOL_GPL(pnfs_put_lseg); 531 531 532 - static void pnfs_free_lseg_async_work(struct work_struct *work) 533 - { 534 - struct pnfs_layout_segment *lseg; 535 - struct pnfs_layout_hdr *lo; 536 - 537 - lseg = container_of(work, struct pnfs_layout_segment, pls_work); 538 - lo = lseg->pls_layout; 539 - 540 - pnfs_free_lseg(lseg); 541 - pnfs_put_layout_hdr(lo); 542 - } 543 - 544 - static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg) 545 - { 546 - INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work); 547 - schedule_work(&lseg->pls_work); 548 - } 549 - 550 - void 551 - pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) 552 - { 553 - if (!lseg) 554 - return; 555 - 556 - assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock); 557 - 558 - dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 559 - atomic_read(&lseg->pls_refcount), 560 - test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 561 - if (atomic_dec_and_test(&lseg->pls_refcount)) { 562 - struct pnfs_layout_hdr *lo = lseg->pls_layout; 563 - if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) 564 - return; 565 - pnfs_layout_remove_lseg(lo, lseg); 566 - if (!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) { 567 - pnfs_get_layout_hdr(lo); 568 - pnfs_free_lseg_async(lseg); 569 - } 570 - } 571 - } 572 - 573 532 /* 574 533 * is l2 fully contained in l1? 575 534 * start1 end1 ··· 2233 2274 nfs_pageio_reset_write_mds(desc); 2234 2275 mirror->pg_recoalesce = 1; 2235 2276 } 2236 - nfs_pgio_data_destroy(hdr); 2237 2277 hdr->release(hdr); 2238 2278 } 2239 2279 ··· 2356 2398 nfs_pageio_reset_read_mds(desc); 2357 2399 mirror->pg_recoalesce = 1; 2358 2400 } 2359 - nfs_pgio_data_destroy(hdr); 2360 2401 hdr->release(hdr); 2361 2402 } 2362 2403
-2
fs/nfs/pnfs.h
··· 67 67 u32 pls_seq; 68 68 unsigned long pls_flags; 69 69 struct pnfs_layout_hdr *pls_layout; 70 - struct work_struct pls_work; 71 70 }; 72 71 73 72 enum pnfs_try_status { ··· 229 230 /* pnfs.c */ 230 231 void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); 231 232 void pnfs_put_lseg(struct pnfs_layout_segment *lseg); 232 - void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); 233 233 234 234 void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); 235 235 void unset_pnfs_layoutdriver(struct nfs_server *);
+10 -34
fs/nfs/pnfs_nfs.c
··· 83 83 } 84 84 out: 85 85 nfs_request_remove_commit_list(req, cinfo); 86 - pnfs_put_lseg_locked(freeme); 86 + pnfs_put_lseg(freeme); 87 87 } 88 88 EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); 89 - 90 - static int 91 - pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, 92 - struct nfs_commit_info *cinfo, int max) 93 - { 94 - struct nfs_page *req, *tmp; 95 - int ret = 0; 96 - 97 - list_for_each_entry_safe(req, tmp, src, wb_list) { 98 - if (!nfs_lock_request(req)) 99 - continue; 100 - kref_get(&req->wb_kref); 101 - if (cond_resched_lock(&cinfo->inode->i_lock)) 102 - list_safe_reset_next(req, tmp, wb_list); 103 - nfs_request_remove_commit_list(req, cinfo); 104 - clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); 105 - nfs_list_add_request(req, dst); 106 - ret++; 107 - if ((ret == max) && !cinfo->dreq) 108 - break; 109 - } 110 - return ret; 111 - } 112 89 113 90 static int 114 91 pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, ··· 96 119 struct list_head *dst = &bucket->committing; 97 120 int ret; 98 121 99 - lockdep_assert_held(&cinfo->inode->i_lock); 100 - ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); 122 + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); 123 + ret = nfs_scan_commit_list(src, dst, cinfo, max); 101 124 if (ret) { 102 125 cinfo->ds->nwritten -= ret; 103 126 cinfo->ds->ncommitting += ret; 104 127 if (bucket->clseg == NULL) 105 128 bucket->clseg = pnfs_get_lseg(bucket->wlseg); 106 129 if (list_empty(src)) { 107 - pnfs_put_lseg_locked(bucket->wlseg); 130 + pnfs_put_lseg(bucket->wlseg); 108 131 bucket->wlseg = NULL; 109 132 } 110 133 } ··· 119 142 { 120 143 int i, rv = 0, cnt; 121 144 122 - lockdep_assert_held(&cinfo->inode->i_lock); 145 + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); 123 146 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { 124 147 cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], 125 148 cinfo, max); ··· 139 162 int nwritten; 140 
163 int i; 141 164 142 - lockdep_assert_held(&cinfo->inode->i_lock); 165 + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); 143 166 restart: 144 167 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { 145 - nwritten = pnfs_generic_transfer_commit_list(&b->written, 146 - dst, cinfo, 0); 168 + nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0); 147 169 if (!nwritten) 148 170 continue; 149 171 cinfo->ds->nwritten -= nwritten; ··· 929 953 struct list_head *list; 930 954 struct pnfs_commit_bucket *buckets; 931 955 932 - spin_lock(&cinfo->inode->i_lock); 956 + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); 933 957 buckets = cinfo->ds->buckets; 934 958 list = &buckets[ds_commit_idx].written; 935 959 if (list_empty(list)) { 936 960 if (!pnfs_is_valid_lseg(lseg)) { 937 - spin_unlock(&cinfo->inode->i_lock); 961 + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); 938 962 cinfo->completion_ops->resched_write(cinfo, req); 939 963 return; 940 964 } ··· 951 975 cinfo->ds->nwritten++; 952 976 953 977 nfs_request_add_commit_list_locked(req, list, cinfo); 954 - spin_unlock(&cinfo->inode->i_lock); 978 + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); 955 979 nfs_mark_page_unstable(req->wb_page, cinfo); 956 980 } 957 981 EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
+1 -1
fs/nfs/read.c
··· 68 68 pg_ops = server->pnfs_curr_ld->pg_read_ops; 69 69 #endif 70 70 nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, 71 - server->rsize, 0, GFP_KERNEL); 71 + server->rsize, 0); 72 72 } 73 73 EXPORT_SYMBOL_GPL(nfs_pageio_init_read); 74 74
+8 -4
fs/nfs/super.c
··· 1691 1691 rpc_authflavor_t *server_authlist, unsigned int count) 1692 1692 { 1693 1693 rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; 1694 + bool found_auth_null = false; 1694 1695 unsigned int i; 1695 - int use_auth_null = false; 1696 1696 1697 1697 /* 1698 1698 * If the sec= mount option is used, the specified flavor or AUTH_NULL ··· 1701 1701 * AUTH_NULL has a special meaning when it's in the server list - it 1702 1702 * means that the server will ignore the rpc creds, so any flavor 1703 1703 * can be used but still use the sec= that was specified. 1704 + * 1705 + * Note also that the MNT procedure in MNTv1 does not return a list 1706 + * of supported security flavors. In this case, nfs_mount() fabricates 1707 + * a security flavor list containing just AUTH_NULL. 1704 1708 */ 1705 1709 for (i = 0; i < count; i++) { 1706 1710 flavor = server_authlist[i]; ··· 1713 1709 goto out; 1714 1710 1715 1711 if (flavor == RPC_AUTH_NULL) 1716 - use_auth_null = true; 1712 + found_auth_null = true; 1717 1713 } 1718 1714 1719 - if (use_auth_null) { 1720 - flavor = RPC_AUTH_NULL; 1715 + if (found_auth_null) { 1716 + flavor = args->auth_info.flavors[0]; 1721 1717 goto out; 1722 1718 } 1723 1719
+222 -239
fs/nfs/write.c
··· 102 102 { 103 103 struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); 104 104 105 - if (p) { 106 - memset(p, 0, sizeof(*p)); 107 - p->rw_mode = FMODE_WRITE; 108 - } 105 + memset(p, 0, sizeof(*p)); 106 + p->rw_mode = FMODE_WRITE; 109 107 return p; 110 108 } 111 109 ··· 152 154 set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 153 155 } 154 156 157 + static struct nfs_page * 158 + nfs_page_private_request(struct page *page) 159 + { 160 + if (!PagePrivate(page)) 161 + return NULL; 162 + return (struct nfs_page *)page_private(page); 163 + } 164 + 155 165 /* 156 166 * nfs_page_find_head_request_locked - find head request associated with @page 157 167 * ··· 168 162 * returns matching head request with reference held, or NULL if not found. 169 163 */ 170 164 static struct nfs_page * 171 - nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) 165 + nfs_page_find_private_request(struct page *page) 172 166 { 173 - struct nfs_page *req = NULL; 167 + struct address_space *mapping = page_file_mapping(page); 168 + struct nfs_page *req; 174 169 175 - if (PagePrivate(page)) 176 - req = (struct nfs_page *)page_private(page); 177 - else if (unlikely(PageSwapCache(page))) 178 - req = nfs_page_search_commits_for_head_request_locked(nfsi, 179 - page); 180 - 170 + if (!PagePrivate(page)) 171 + return NULL; 172 + spin_lock(&mapping->private_lock); 173 + req = nfs_page_private_request(page); 181 174 if (req) { 182 175 WARN_ON_ONCE(req->wb_head != req); 183 176 kref_get(&req->wb_kref); 184 177 } 178 + spin_unlock(&mapping->private_lock); 179 + return req; 180 + } 185 181 182 + static struct nfs_page * 183 + nfs_page_find_swap_request(struct page *page) 184 + { 185 + struct inode *inode = page_file_mapping(page)->host; 186 + struct nfs_inode *nfsi = NFS_I(inode); 187 + struct nfs_page *req = NULL; 188 + if (!PageSwapCache(page)) 189 + return NULL; 190 + mutex_lock(&nfsi->commit_mutex); 191 + if (PageSwapCache(page)) { 192 + req = 
nfs_page_search_commits_for_head_request_locked(nfsi, 193 + page); 194 + if (req) { 195 + WARN_ON_ONCE(req->wb_head != req); 196 + kref_get(&req->wb_kref); 197 + } 198 + } 199 + mutex_unlock(&nfsi->commit_mutex); 186 200 return req; 187 201 } 188 202 ··· 213 187 */ 214 188 static struct nfs_page *nfs_page_find_head_request(struct page *page) 215 189 { 216 - struct inode *inode = page_file_mapping(page)->host; 217 - struct nfs_page *req = NULL; 190 + struct nfs_page *req; 218 191 219 - spin_lock(&inode->i_lock); 220 - req = nfs_page_find_head_request_locked(NFS_I(inode), page); 221 - spin_unlock(&inode->i_lock); 192 + req = nfs_page_find_private_request(page); 193 + if (!req) 194 + req = nfs_page_find_swap_request(page); 222 195 return req; 223 196 } 224 197 ··· 266 241 { 267 242 struct nfs_page *req; 268 243 269 - WARN_ON_ONCE(head != head->wb_head); 270 - WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); 271 - 272 244 req = head; 273 245 do { 274 246 if (page_offset >= req->wb_pgbase && ··· 291 269 unsigned int pos = 0; 292 270 unsigned int len = nfs_page_length(req->wb_page); 293 271 294 - nfs_page_group_lock(req, false); 272 + nfs_page_group_lock(req); 295 273 296 - do { 274 + for (;;) { 297 275 tmp = nfs_page_group_search_locked(req->wb_head, pos); 298 - if (tmp) { 299 - /* no way this should happen */ 300 - WARN_ON_ONCE(tmp->wb_pgbase != pos); 301 - pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); 302 - } 303 - } while (tmp && pos < len); 276 + if (!tmp) 277 + break; 278 + pos = tmp->wb_pgbase + tmp->wb_bytes; 279 + } 304 280 305 281 nfs_page_group_unlock(req); 306 - WARN_ON_ONCE(pos > len); 307 - return pos == len; 282 + return pos >= len; 308 283 } 309 284 310 285 /* We can set the PG_uptodate flag if we see that a write request ··· 352 333 { 353 334 struct inode *inode = page_file_mapping(req->wb_page)->host; 354 335 struct nfs_server *nfss = NFS_SERVER(inode); 336 + bool is_done; 355 337 356 - if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) 
338 + is_done = nfs_page_group_sync_on_bit(req, PG_WB_END); 339 + nfs_unlock_request(req); 340 + if (!is_done) 357 341 return; 358 342 359 343 end_page_writeback(req->wb_page); 360 344 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) 361 345 clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); 362 346 } 363 - 364 - 365 - /* nfs_page_group_clear_bits 366 - * @req - an nfs request 367 - * clears all page group related bits from @req 368 - */ 369 - static void 370 - nfs_page_group_clear_bits(struct nfs_page *req) 371 - { 372 - clear_bit(PG_TEARDOWN, &req->wb_flags); 373 - clear_bit(PG_UNLOCKPAGE, &req->wb_flags); 374 - clear_bit(PG_UPTODATE, &req->wb_flags); 375 - clear_bit(PG_WB_END, &req->wb_flags); 376 - clear_bit(PG_REMOVE, &req->wb_flags); 377 - } 378 - 379 347 380 348 /* 381 349 * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req ··· 372 366 * @inode - inode associated with request page group, must be holding inode lock 373 367 * @head - head request of page group, must be holding head lock 374 368 * @req - request that couldn't lock and needs to wait on the req bit lock 375 - * @nonblock - if true, don't actually wait 376 369 * 377 - * NOTE: this must be called holding page_group bit lock and inode spin lock 378 - * and BOTH will be released before returning. 370 + * NOTE: this must be called holding page_group bit lock 371 + * which will be released before returning. 379 372 * 380 373 * returns 0 on success, < 0 on error. 
381 374 */ 382 - static int 383 - nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, 384 - struct nfs_page *req, bool nonblock) 385 - __releases(&inode->i_lock) 375 + static void 376 + nfs_unroll_locks(struct inode *inode, struct nfs_page *head, 377 + struct nfs_page *req) 386 378 { 387 379 struct nfs_page *tmp; 388 - int ret; 389 380 390 381 /* relinquish all the locks successfully grabbed this run */ 391 - for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) 392 - nfs_unlock_request(tmp); 393 - 394 - WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); 395 - 396 - /* grab a ref on the request that will be waited on */ 397 - kref_get(&req->wb_kref); 398 - 399 - nfs_page_group_unlock(head); 400 - spin_unlock(&inode->i_lock); 401 - 402 - /* release ref from nfs_page_find_head_request_locked */ 403 - nfs_release_request(head); 404 - 405 - if (!nonblock) 406 - ret = nfs_wait_on_request(req); 407 - else 408 - ret = -EAGAIN; 409 - nfs_release_request(req); 410 - 411 - return ret; 382 + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { 383 + if (!kref_read(&tmp->wb_kref)) 384 + continue; 385 + nfs_unlock_and_release_request(tmp); 386 + } 412 387 } 413 388 414 389 /* ··· 404 417 */ 405 418 static void 406 419 nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, 407 - struct nfs_page *old_head) 420 + struct nfs_page *old_head, 421 + struct inode *inode) 408 422 { 409 423 while (destroy_list) { 410 424 struct nfs_page *subreq = destroy_list; ··· 416 428 WARN_ON_ONCE(old_head != subreq->wb_head); 417 429 418 430 /* make sure old group is not used */ 419 - subreq->wb_head = subreq; 420 431 subreq->wb_this_page = subreq; 432 + 433 + clear_bit(PG_REMOVE, &subreq->wb_flags); 434 + 435 + /* Note: races with nfs_page_group_destroy() */ 436 + if (!kref_read(&subreq->wb_kref)) { 437 + /* Check if we raced with nfs_page_group_destroy() */ 438 + if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) 439 + nfs_free_request(subreq); 
440 + continue; 441 + } 442 + 443 + subreq->wb_head = subreq; 444 + 445 + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { 446 + nfs_release_request(subreq); 447 + atomic_long_dec(&NFS_I(inode)->nrequests); 448 + } 421 449 422 450 /* subreq is now totally disconnected from page group or any 423 451 * write / commit lists. last chance to wake any waiters */ 424 - nfs_unlock_request(subreq); 425 - 426 - if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { 427 - /* release ref on old head request */ 428 - nfs_release_request(old_head); 429 - 430 - nfs_page_group_clear_bits(subreq); 431 - 432 - /* release the PG_INODE_REF reference */ 433 - if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) 434 - nfs_release_request(subreq); 435 - else 436 - WARN_ON_ONCE(1); 437 - } else { 438 - WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); 439 - /* zombie requests have already released the last 440 - * reference and were waiting on the rest of the 441 - * group to complete. Since it's no longer part of a 442 - * group, simply free the request */ 443 - nfs_page_group_clear_bits(subreq); 444 - nfs_free_request(subreq); 445 - } 452 + nfs_unlock_and_release_request(subreq); 446 453 } 447 454 } 448 455 ··· 447 464 * operations for this page. 448 465 * 449 466 * @page - the page used to lookup the "page group" of nfs_page structures 450 - * @nonblock - if true, don't block waiting for request locks 451 467 * 452 468 * This function joins all sub requests to the head request by first 453 469 * locking all requests in the group, cancelling any pending operations ··· 460 478 * error was encountered. 
461 479 */ 462 480 static struct nfs_page * 463 - nfs_lock_and_join_requests(struct page *page, bool nonblock) 481 + nfs_lock_and_join_requests(struct page *page) 464 482 { 465 483 struct inode *inode = page_file_mapping(page)->host; 466 484 struct nfs_page *head, *subreq; ··· 469 487 int ret; 470 488 471 489 try_again: 472 - total_bytes = 0; 473 - 474 - WARN_ON_ONCE(destroy_list); 475 - 476 - spin_lock(&inode->i_lock); 477 - 478 490 /* 479 491 * A reference is taken only on the head request which acts as a 480 492 * reference to the whole page group - the group will not be destroyed 481 493 * until the head reference is released. 482 494 */ 483 - head = nfs_page_find_head_request_locked(NFS_I(inode), page); 484 - 485 - if (!head) { 486 - spin_unlock(&inode->i_lock); 495 + head = nfs_page_find_head_request(page); 496 + if (!head) 487 497 return NULL; 498 + 499 + /* lock the page head first in order to avoid an ABBA inefficiency */ 500 + if (!nfs_lock_request(head)) { 501 + ret = nfs_wait_on_request(head); 502 + nfs_release_request(head); 503 + if (ret < 0) 504 + return ERR_PTR(ret); 505 + goto try_again; 488 506 } 489 507 490 - /* holding inode lock, so always make a non-blocking call to try the 491 - * page group lock */ 492 - ret = nfs_page_group_lock(head, true); 508 + /* Ensure that nobody removed the request before we locked it */ 509 + if (head != nfs_page_private_request(page) && !PageSwapCache(page)) { 510 + nfs_unlock_and_release_request(head); 511 + goto try_again; 512 + } 513 + 514 + ret = nfs_page_group_lock(head); 493 515 if (ret < 0) { 494 - spin_unlock(&inode->i_lock); 495 - 496 - if (!nonblock && ret == -EAGAIN) { 497 - nfs_page_group_lock_wait(head); 498 - nfs_release_request(head); 499 - goto try_again; 500 - } 501 - 502 - nfs_release_request(head); 516 + nfs_unlock_and_release_request(head); 503 517 return ERR_PTR(ret); 504 518 } 505 519 506 520 /* lock each request in the page group */ 507 - subreq = head; 508 - do { 521 + total_bytes = 
head->wb_bytes; 522 + for (subreq = head->wb_this_page; subreq != head; 523 + subreq = subreq->wb_this_page) { 524 + 525 + if (!kref_get_unless_zero(&subreq->wb_kref)) { 526 + if (subreq->wb_offset == head->wb_offset + total_bytes) 527 + total_bytes += subreq->wb_bytes; 528 + continue; 529 + } 530 + 531 + while (!nfs_lock_request(subreq)) { 532 + /* 533 + * Unlock page to allow nfs_page_group_sync_on_bit() 534 + * to succeed 535 + */ 536 + nfs_page_group_unlock(head); 537 + ret = nfs_wait_on_request(subreq); 538 + if (!ret) 539 + ret = nfs_page_group_lock(head); 540 + if (ret < 0) { 541 + nfs_unroll_locks(inode, head, subreq); 542 + nfs_release_request(subreq); 543 + nfs_unlock_and_release_request(head); 544 + return ERR_PTR(ret); 545 + } 546 + } 509 547 /* 510 548 * Subrequests are always contiguous, non overlapping 511 549 * and in order - but may be repeated (mirrored writes). ··· 537 535 ((subreq->wb_offset + subreq->wb_bytes) > 538 536 (head->wb_offset + total_bytes)))) { 539 537 nfs_page_group_unlock(head); 540 - spin_unlock(&inode->i_lock); 538 + nfs_unroll_locks(inode, head, subreq); 539 + nfs_unlock_and_release_request(subreq); 540 + nfs_unlock_and_release_request(head); 541 541 return ERR_PTR(-EIO); 542 542 } 543 - 544 - if (!nfs_lock_request(subreq)) { 545 - /* releases page group bit lock and 546 - * inode spin lock and all references */ 547 - ret = nfs_unroll_locks_and_wait(inode, head, 548 - subreq, nonblock); 549 - 550 - if (ret == 0) 551 - goto try_again; 552 - 553 - return ERR_PTR(ret); 554 - } 555 - 556 - subreq = subreq->wb_this_page; 557 - } while (subreq != head); 543 + } 558 544 559 545 /* Now that all requests are locked, make sure they aren't on any list. 
560 546 * Commit list removal accounting is done after locks are dropped */ ··· 563 573 head->wb_bytes = total_bytes; 564 574 } 565 575 566 - /* 567 - * prepare head request to be added to new pgio descriptor 568 - */ 569 - nfs_page_group_clear_bits(head); 570 - 571 - /* 572 - * some part of the group was still on the inode list - otherwise 573 - * the group wouldn't be involved in async write. 574 - * grab a reference for the head request, iff it needs one. 575 - */ 576 - if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) 576 + /* Postpone destruction of this request */ 577 + if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { 578 + set_bit(PG_INODE_REF, &head->wb_flags); 577 579 kref_get(&head->wb_kref); 580 + atomic_long_inc(&NFS_I(inode)->nrequests); 581 + } 578 582 579 583 nfs_page_group_unlock(head); 580 584 581 - /* drop lock to clean uprequests on destroy list */ 582 - spin_unlock(&inode->i_lock); 585 + nfs_destroy_unlinked_subrequests(destroy_list, head, inode); 583 586 584 - nfs_destroy_unlinked_subrequests(destroy_list, head); 587 + /* Did we lose a race with nfs_inode_remove_request()? */ 588 + if (!(PagePrivate(page) || PageSwapCache(page))) { 589 + nfs_unlock_and_release_request(head); 590 + return NULL; 591 + } 585 592 586 - /* still holds ref on head from nfs_page_find_head_request_locked 593 + /* still holds ref on head from nfs_page_find_head_request 587 594 * and still has lock on head from lock loop */ 588 595 return head; 589 596 } 590 597 591 598 static void nfs_write_error_remove_page(struct nfs_page *req) 592 599 { 593 - nfs_unlock_request(req); 594 600 nfs_end_page_writeback(req); 595 601 generic_error_remove_page(page_file_mapping(req->wb_page), 596 602 req->wb_page); ··· 610 624 * May return an error if the user signalled nfs_wait_on_request(). 
611 625 */ 612 626 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, 613 - struct page *page, bool nonblock) 627 + struct page *page) 614 628 { 615 629 struct nfs_page *req; 616 630 int ret = 0; 617 631 618 - req = nfs_lock_and_join_requests(page, nonblock); 632 + req = nfs_lock_and_join_requests(page); 619 633 if (!req) 620 634 goto out; 621 635 ret = PTR_ERR(req); ··· 658 672 int ret; 659 673 660 674 nfs_pageio_cond_complete(pgio, page_index(page)); 661 - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); 675 + ret = nfs_page_async_flush(pgio, page); 662 676 if (ret == -EAGAIN) { 663 677 redirty_page_for_writepage(wbc, page); 664 678 ret = 0; ··· 745 759 */ 746 760 static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) 747 761 { 762 + struct address_space *mapping = page_file_mapping(req->wb_page); 748 763 struct nfs_inode *nfsi = NFS_I(inode); 749 764 750 765 WARN_ON_ONCE(req->wb_this_page != req); ··· 753 766 /* Lock the request! */ 754 767 nfs_lock_request(req); 755 768 756 - spin_lock(&inode->i_lock); 757 - if (!nfsi->nrequests && 758 - NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) 759 - inode->i_version++; 760 769 /* 761 770 * Swap-space should not get truncated. Hence no need to plug the race 762 771 * with invalidate/truncate. 
763 772 */ 773 + spin_lock(&mapping->private_lock); 774 + if (!nfs_have_writebacks(inode) && 775 + NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) { 776 + spin_lock(&inode->i_lock); 777 + inode->i_version++; 778 + spin_unlock(&inode->i_lock); 779 + } 764 780 if (likely(!PageSwapCache(req->wb_page))) { 765 781 set_bit(PG_MAPPED, &req->wb_flags); 766 782 SetPagePrivate(req->wb_page); 767 783 set_page_private(req->wb_page, (unsigned long)req); 768 784 } 769 - nfsi->nrequests++; 785 + spin_unlock(&mapping->private_lock); 786 + atomic_long_inc(&nfsi->nrequests); 770 787 /* this a head request for a page group - mark it as having an 771 788 * extra reference so sub groups can follow suit. 772 789 * This flag also informs pgio layer when to bump nrequests when 773 790 * adding subrequests. */ 774 791 WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); 775 792 kref_get(&req->wb_kref); 776 - spin_unlock(&inode->i_lock); 777 793 } 778 794 779 795 /* ··· 784 794 */ 785 795 static void nfs_inode_remove_request(struct nfs_page *req) 786 796 { 787 - struct inode *inode = d_inode(req->wb_context->dentry); 797 + struct address_space *mapping = page_file_mapping(req->wb_page); 798 + struct inode *inode = mapping->host; 788 799 struct nfs_inode *nfsi = NFS_I(inode); 789 800 struct nfs_page *head; 790 801 802 + atomic_long_dec(&nfsi->nrequests); 791 803 if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { 792 804 head = req->wb_head; 793 805 794 - spin_lock(&inode->i_lock); 806 + spin_lock(&mapping->private_lock); 795 807 if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { 796 808 set_page_private(head->wb_page, 0); 797 809 ClearPagePrivate(head->wb_page); 798 810 clear_bit(PG_MAPPED, &head->wb_flags); 799 811 } 800 - nfsi->nrequests--; 801 - spin_unlock(&inode->i_lock); 802 - } else { 803 - spin_lock(&inode->i_lock); 804 - nfsi->nrequests--; 805 - spin_unlock(&inode->i_lock); 812 + spin_unlock(&mapping->private_lock); 806 813 } 807 814 808 815 if 
(test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) ··· 855 868 * number of outstanding requests requiring a commit as well as 856 869 * the MM page stats. 857 870 * 858 - * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. 871 + * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the 872 + * nfs_page lock. 859 873 */ 860 874 void 861 875 nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, ··· 864 876 { 865 877 set_bit(PG_CLEAN, &req->wb_flags); 866 878 nfs_list_add_request(req, dst); 867 - cinfo->mds->ncommit++; 879 + atomic_long_inc(&cinfo->mds->ncommit); 868 880 } 869 881 EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); 870 882 ··· 884 896 void 885 897 nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) 886 898 { 887 - spin_lock(&cinfo->inode->i_lock); 899 + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); 888 900 nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); 889 - spin_unlock(&cinfo->inode->i_lock); 901 + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); 890 902 if (req->wb_page) 891 903 nfs_mark_page_unstable(req->wb_page, cinfo); 892 904 } ··· 910 922 if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) 911 923 return; 912 924 nfs_list_remove_request(req); 913 - cinfo->mds->ncommit--; 925 + atomic_long_dec(&cinfo->mds->ncommit); 914 926 } 915 927 EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); 916 928 ··· 955 967 WB_RECLAIMABLE); 956 968 } 957 969 958 - /* Called holding inode (/cinfo) lock */ 970 + /* Called holding the request lock on @req */ 959 971 static void 960 972 nfs_clear_request_commit(struct nfs_page *req) 961 973 { ··· 964 976 struct nfs_commit_info cinfo; 965 977 966 978 nfs_init_cinfo_from_inode(&cinfo, inode); 979 + mutex_lock(&NFS_I(inode)->commit_mutex); 967 980 if (!pnfs_clear_request_commit(req, &cinfo)) { 968 981 nfs_request_remove_commit_list(req, &cinfo); 969 982 } 983 + mutex_unlock(&NFS_I(inode)->commit_mutex); 970 984 
nfs_clear_page_commit(req->wb_page); 971 985 } 972 986 } ··· 1013 1023 remove_req: 1014 1024 nfs_inode_remove_request(req); 1015 1025 next: 1016 - nfs_unlock_request(req); 1017 1026 nfs_end_page_writeback(req); 1018 1027 nfs_release_request(req); 1019 1028 } ··· 1024 1035 unsigned long 1025 1036 nfs_reqs_to_commit(struct nfs_commit_info *cinfo) 1026 1037 { 1027 - return cinfo->mds->ncommit; 1038 + return atomic_long_read(&cinfo->mds->ncommit); 1028 1039 } 1029 1040 1030 - /* cinfo->inode->i_lock held by caller */ 1041 + /* NFS_I(cinfo->inode)->commit_mutex held by caller */ 1031 1042 int 1032 1043 nfs_scan_commit_list(struct list_head *src, struct list_head *dst, 1033 1044 struct nfs_commit_info *cinfo, int max) ··· 1035 1046 struct nfs_page *req, *tmp; 1036 1047 int ret = 0; 1037 1048 1049 + restart: 1038 1050 list_for_each_entry_safe(req, tmp, src, wb_list) { 1039 - if (!nfs_lock_request(req)) 1040 - continue; 1041 1051 kref_get(&req->wb_kref); 1042 - if (cond_resched_lock(&cinfo->inode->i_lock)) 1043 - list_safe_reset_next(req, tmp, wb_list); 1052 + if (!nfs_lock_request(req)) { 1053 + int status; 1054 + 1055 + /* Prevent deadlock with nfs_lock_and_join_requests */ 1056 + if (!list_empty(dst)) { 1057 + nfs_release_request(req); 1058 + continue; 1059 + } 1060 + /* Ensure we make progress to prevent livelock */ 1061 + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); 1062 + status = nfs_wait_on_request(req); 1063 + nfs_release_request(req); 1064 + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); 1065 + if (status < 0) 1066 + break; 1067 + goto restart; 1068 + } 1044 1069 nfs_request_remove_commit_list(req, cinfo); 1070 + clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); 1045 1071 nfs_list_add_request(req, dst); 1046 1072 ret++; 1047 1073 if ((ret == max) && !cinfo->dreq) 1048 1074 break; 1075 + cond_resched(); 1049 1076 } 1050 1077 return ret; 1051 1078 } 1079 + EXPORT_SYMBOL_GPL(nfs_scan_commit_list); 1052 1080 1053 1081 /* 1054 1082 * nfs_scan_commit - Scan an inode 
for commit requests ··· 1082 1076 { 1083 1077 int ret = 0; 1084 1078 1085 - spin_lock(&cinfo->inode->i_lock); 1086 - if (cinfo->mds->ncommit > 0) { 1079 + if (!atomic_long_read(&cinfo->mds->ncommit)) 1080 + return 0; 1081 + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); 1082 + if (atomic_long_read(&cinfo->mds->ncommit) > 0) { 1087 1083 const int max = INT_MAX; 1088 1084 1089 1085 ret = nfs_scan_commit_list(&cinfo->mds->list, dst, 1090 1086 cinfo, max); 1091 1087 ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); 1092 1088 } 1093 - spin_unlock(&cinfo->inode->i_lock); 1089 + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); 1094 1090 return ret; 1095 1091 } 1096 1092 ··· 1113 1105 unsigned int end; 1114 1106 int error; 1115 1107 1116 - if (!PagePrivate(page)) 1117 - return NULL; 1118 - 1119 1108 end = offset + bytes; 1120 - spin_lock(&inode->i_lock); 1121 1109 1122 - for (;;) { 1123 - req = nfs_page_find_head_request_locked(NFS_I(inode), page); 1124 - if (req == NULL) 1125 - goto out_unlock; 1110 + req = nfs_lock_and_join_requests(page); 1111 + if (IS_ERR_OR_NULL(req)) 1112 + return req; 1126 1113 1127 - /* should be handled by nfs_flush_incompatible */ 1128 - WARN_ON_ONCE(req->wb_head != req); 1129 - WARN_ON_ONCE(req->wb_this_page != req); 1130 - 1131 - rqend = req->wb_offset + req->wb_bytes; 1132 - /* 1133 - * Tell the caller to flush out the request if 1134 - * the offsets are non-contiguous. 1135 - * Note: nfs_flush_incompatible() will already 1136 - * have flushed out requests having wrong owners. 
1137 - */ 1138 - if (offset > rqend 1139 - || end < req->wb_offset) 1140 - goto out_flushme; 1141 - 1142 - if (nfs_lock_request(req)) 1143 - break; 1144 - 1145 - /* The request is locked, so wait and then retry */ 1146 - spin_unlock(&inode->i_lock); 1147 - error = nfs_wait_on_request(req); 1148 - nfs_release_request(req); 1149 - if (error != 0) 1150 - goto out_err; 1151 - spin_lock(&inode->i_lock); 1152 - } 1114 + rqend = req->wb_offset + req->wb_bytes; 1115 + /* 1116 + * Tell the caller to flush out the request if 1117 + * the offsets are non-contiguous. 1118 + * Note: nfs_flush_incompatible() will already 1119 + * have flushed out requests having wrong owners. 1120 + */ 1121 + if (offset > rqend || end < req->wb_offset) 1122 + goto out_flushme; 1153 1123 1154 1124 /* Okay, the request matches. Update the region */ 1155 1125 if (offset < req->wb_offset) { ··· 1138 1152 req->wb_bytes = end - req->wb_offset; 1139 1153 else 1140 1154 req->wb_bytes = rqend - req->wb_offset; 1141 - out_unlock: 1142 - if (req) 1143 - nfs_clear_request_commit(req); 1144 - spin_unlock(&inode->i_lock); 1145 1155 return req; 1146 1156 out_flushme: 1147 - spin_unlock(&inode->i_lock); 1148 - nfs_release_request(req); 1157 + /* 1158 + * Note: we mark the request dirty here because 1159 + * nfs_lock_and_join_requests() cannot preserve 1160 + * commit flags, so we have to replay the write. 1161 + */ 1162 + nfs_mark_request_dirty(req); 1163 + nfs_unlock_and_release_request(req); 1149 1164 error = nfs_wb_page(inode, page); 1150 - out_err: 1151 - return ERR_PTR(error); 1165 + return (error < 0) ? 
ERR_PTR(error) : NULL; 1152 1166 } 1153 1167 1154 1168 /* ··· 1213 1227 l_ctx = req->wb_lock_context; 1214 1228 do_flush = req->wb_page != page || 1215 1229 !nfs_match_open_context(req->wb_context, ctx); 1216 - /* for now, flush if more than 1 request in page_group */ 1217 - do_flush |= req->wb_this_page != req; 1218 1230 if (l_ctx && flctx && 1219 1231 !(list_empty_careful(&flctx->flc_posix) && 1220 1232 list_empty_careful(&flctx->flc_flock))) { ··· 1396 1412 { 1397 1413 nfs_mark_request_dirty(req); 1398 1414 set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); 1399 - nfs_unlock_request(req); 1400 1415 nfs_end_page_writeback(req); 1401 1416 nfs_release_request(req); 1402 1417 } ··· 1435 1452 pg_ops = server->pnfs_curr_ld->pg_write_ops; 1436 1453 #endif 1437 1454 nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, 1438 - server->wsize, ioflags, GFP_NOIO); 1455 + server->wsize, ioflags); 1439 1456 } 1440 1457 EXPORT_SYMBOL_GPL(nfs_pageio_init_write); 1441 1458 ··· 1917 1934 int ret = 0; 1918 1935 1919 1936 /* no commits means nothing needs to be done */ 1920 - if (!nfsi->commit_info.ncommit) 1937 + if (!atomic_long_read(&nfsi->commit_info.ncommit)) 1921 1938 return ret; 1922 1939 1923 1940 if (wbc->sync_mode == WB_SYNC_NONE) { ··· 1998 2015 1999 2016 /* blocking call to cancel all requests and join to a single (head) 2000 2017 * request */ 2001 - req = nfs_lock_and_join_requests(page, false); 2018 + req = nfs_lock_and_join_requests(page); 2002 2019 2003 2020 if (IS_ERR(req)) { 2004 2021 ret = PTR_ERR(req);
+3 -3
include/linux/nfs_fs.h
··· 49 49 struct nfs_access_entry { 50 50 struct rb_node rb_node; 51 51 struct list_head lru; 52 - unsigned long jiffies; 53 52 struct rpc_cred * cred; 54 53 __u32 mask; 55 54 struct rcu_head rcu_head; ··· 153 154 */ 154 155 __be32 cookieverf[2]; 155 156 156 - unsigned long nrequests; 157 + atomic_long_t nrequests; 157 158 struct nfs_mds_commit_info commit_info; 158 159 159 160 /* Open contexts for shared mmap writes */ ··· 162 163 /* Readers: in-flight sillydelete RPC calls */ 163 164 /* Writers: rmdir */ 164 165 struct rw_semaphore rmdir_sem; 166 + struct mutex commit_mutex; 165 167 166 168 #if IS_ENABLED(CONFIG_NFS_V4) 167 169 struct nfs4_cached_acl *nfs4_acl; ··· 510 510 static inline int 511 511 nfs_have_writebacks(struct inode *inode) 512 512 { 513 - return NFS_I(inode)->nrequests != 0; 513 + return atomic_long_read(&NFS_I(inode)->nrequests) != 0; 514 514 } 515 515 516 516 /*
+2 -4
include/linux/nfs_page.h
··· 125 125 const struct nfs_pgio_completion_ops *compl_ops, 126 126 const struct nfs_rw_ops *rw_ops, 127 127 size_t bsize, 128 - int how, 129 - gfp_t gfp_flags); 128 + int how); 130 129 extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, 131 130 struct nfs_page *); 132 131 extern int nfs_pageio_resend(struct nfs_pageio_descriptor *, ··· 138 139 extern int nfs_wait_on_request(struct nfs_page *); 139 140 extern void nfs_unlock_request(struct nfs_page *req); 140 141 extern void nfs_unlock_and_release_request(struct nfs_page *); 141 - extern int nfs_page_group_lock(struct nfs_page *, bool); 142 - extern void nfs_page_group_lock_wait(struct nfs_page *); 142 + extern int nfs_page_group_lock(struct nfs_page *); 143 143 extern void nfs_page_group_unlock(struct nfs_page *); 144 144 extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); 145 145 extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
+1 -1
include/linux/nfs_xdr.h
··· 1476 1476 1477 1477 struct nfs_mds_commit_info { 1478 1478 atomic_t rpcs_out; 1479 - unsigned long ncommit; 1479 + atomic_long_t ncommit; 1480 1480 struct list_head list; 1481 1481 }; 1482 1482
+2
include/linux/sunrpc/sched.h
··· 139 139 #define RPC_TASK_RUNNING 0 140 140 #define RPC_TASK_QUEUED 1 141 141 #define RPC_TASK_ACTIVE 2 142 + #define RPC_TASK_MSG_RECV 3 143 + #define RPC_TASK_MSG_RECV_WAIT 4 142 144 143 145 #define RPC_IS_RUNNING(t) test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) 144 146 #define rpc_set_running(t) set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
+13
include/linux/sunrpc/xdr.h
··· 239 239 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); 240 240 extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); 241 241 242 + /** 243 + * xdr_stream_remaining - Return the number of bytes remaining in the stream 244 + * @xdr: pointer to struct xdr_stream 245 + * 246 + * Return value: 247 + * Number of bytes remaining in @xdr before xdr->end 248 + */ 249 + static inline size_t 250 + xdr_stream_remaining(const struct xdr_stream *xdr) 251 + { 252 + return xdr->nwords << 2; 253 + } 254 + 242 255 ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, 243 256 size_t maxlen, gfp_t gfp_flags); 244 257 /**
+4 -1
include/linux/sunrpc/xprt.h
··· 174 174 175 175 struct rpc_xprt { 176 176 struct kref kref; /* Reference count */ 177 - struct rpc_xprt_ops * ops; /* transport methods */ 177 + const struct rpc_xprt_ops *ops; /* transport methods */ 178 178 179 179 const struct rpc_timeout *timeout; /* timeout parms */ 180 180 struct sockaddr_storage addr; /* server address */ ··· 232 232 */ 233 233 spinlock_t transport_lock; /* lock transport info */ 234 234 spinlock_t reserve_lock; /* lock slot table */ 235 + spinlock_t recv_lock; /* lock receive list */ 235 236 u32 xid; /* Next XID value to use */ 236 237 struct rpc_task * snd_task; /* Task blocked in send */ 237 238 struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ ··· 373 372 void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result); 374 373 struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); 375 374 void xprt_complete_rqst(struct rpc_task *task, int copied); 375 + void xprt_pin_rqst(struct rpc_rqst *req); 376 + void xprt_unpin_rqst(struct rpc_rqst *req); 376 377 void xprt_release_rqst_cong(struct rpc_task *task); 377 378 void xprt_disconnect_done(struct rpc_xprt *xprt); 378 379 void xprt_force_disconnect(struct rpc_xprt *xprt);
+2 -2
net/sunrpc/backchannel_rqst.c
··· 171 171 /* 172 172 * Add the temporary list to the backchannel preallocation list 173 173 */ 174 - spin_lock_bh(&xprt->bc_pa_lock); 174 + spin_lock(&xprt->bc_pa_lock); 175 175 list_splice(&tmp_list, &xprt->bc_pa_list); 176 176 xprt_inc_alloc_count(xprt, min_reqs); 177 - spin_unlock_bh(&xprt->bc_pa_lock); 177 + spin_unlock(&xprt->bc_pa_lock); 178 178 179 179 dprintk("RPC: setup backchannel transport done\n"); 180 180 return 0;
+8 -4
net/sunrpc/clnt.c
··· 1903 1903 task->tk_status = 0; 1904 1904 switch (status) { 1905 1905 case -ECONNREFUSED: 1906 + /* A positive refusal suggests a rebind is needed. */ 1907 + if (RPC_IS_SOFTCONN(task)) 1908 + break; 1909 + if (clnt->cl_autobind) { 1910 + rpc_force_rebind(clnt); 1911 + task->tk_action = call_bind; 1912 + return; 1913 + } 1906 1914 case -ECONNRESET: 1907 1915 case -ECONNABORTED: 1908 1916 case -ENETUNREACH: ··· 2147 2139 rpc_delay(task, 3*HZ); 2148 2140 case -ETIMEDOUT: 2149 2141 task->tk_action = call_timeout; 2150 - if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) 2151 - && task->tk_client->cl_discrtry) 2152 - xprt_conditional_disconnect(req->rq_xprt, 2153 - req->rq_connect_cookie); 2154 2142 break; 2155 2143 case -ECONNREFUSED: 2156 2144 case -ECONNRESET:
+3 -3
net/sunrpc/svcsock.c
··· 1013 1013 1014 1014 if (!bc_xprt) 1015 1015 return -EAGAIN; 1016 - spin_lock_bh(&bc_xprt->transport_lock); 1016 + spin_lock(&bc_xprt->recv_lock); 1017 1017 req = xprt_lookup_rqst(bc_xprt, xid); 1018 1018 if (!req) 1019 1019 goto unlock_notfound; ··· 1031 1031 memcpy(dst->iov_base, src->iov_base, src->iov_len); 1032 1032 xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); 1033 1033 rqstp->rq_arg.len = 0; 1034 - spin_unlock_bh(&bc_xprt->transport_lock); 1034 + spin_unlock(&bc_xprt->recv_lock); 1035 1035 return 0; 1036 1036 unlock_notfound: 1037 1037 printk(KERN_NOTICE ··· 1040 1040 __func__, ntohl(calldir), 1041 1041 bc_xprt, ntohl(xid)); 1042 1042 unlock_eagain: 1043 - spin_unlock_bh(&bc_xprt->transport_lock); 1043 + spin_unlock(&bc_xprt->recv_lock); 1044 1044 return -EAGAIN; 1045 1045 } 1046 1046
+53 -4
net/sunrpc/xprt.c
··· 844 844 } 845 845 EXPORT_SYMBOL_GPL(xprt_lookup_rqst); 846 846 847 + /** 848 + * xprt_pin_rqst - Pin a request on the transport receive list 849 + * @req: Request to pin 850 + * 851 + * Caller must ensure this is atomic with the call to xprt_lookup_rqst() 852 + * so should be holding the xprt transport lock. 853 + */ 854 + void xprt_pin_rqst(struct rpc_rqst *req) 855 + { 856 + set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate); 857 + } 858 + EXPORT_SYMBOL_GPL(xprt_pin_rqst); 859 + 860 + /** 861 + * xprt_unpin_rqst - Unpin a request on the transport receive list 862 + * @req: Request to pin 863 + * 864 + * Caller should be holding the xprt transport lock. 865 + */ 866 + void xprt_unpin_rqst(struct rpc_rqst *req) 867 + { 868 + struct rpc_task *task = req->rq_task; 869 + 870 + clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); 871 + if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) 872 + wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); 873 + } 874 + EXPORT_SYMBOL_GPL(xprt_unpin_rqst); 875 + 876 + static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) 877 + __must_hold(&req->rq_xprt->recv_lock) 878 + { 879 + struct rpc_task *task = req->rq_task; 880 + 881 + if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { 882 + spin_unlock(&req->rq_xprt->recv_lock); 883 + set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); 884 + wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, 885 + TASK_UNINTERRUPTIBLE); 886 + clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); 887 + spin_lock(&req->rq_xprt->recv_lock); 888 + } 889 + } 890 + 847 891 static void xprt_update_rtt(struct rpc_task *task) 848 892 { 849 893 struct rpc_rqst *req = task->tk_rqstp; ··· 1010 966 /* 1011 967 * Add to the list only if we're expecting a reply 1012 968 */ 1013 - spin_lock_bh(&xprt->transport_lock); 1014 969 /* Update the softirq receive buffer */ 1015 970 memcpy(&req->rq_private_buf, &req->rq_rcv_buf, 1016 971 sizeof(req->rq_private_buf)); 1017 972 /* Add request to the 
receive list */ 973 + spin_lock(&xprt->recv_lock); 1018 974 list_add_tail(&req->rq_list, &xprt->recv); 1019 - spin_unlock_bh(&xprt->transport_lock); 975 + spin_unlock(&xprt->recv_lock); 1020 976 xprt_reset_majortimeo(req); 1021 977 /* Turn off autodisconnect */ 1022 978 del_singleshot_timer_sync(&xprt->timer); ··· 1331 1287 task->tk_ops->rpc_count_stats(task, task->tk_calldata); 1332 1288 else if (task->tk_client) 1333 1289 rpc_count_iostats(task, task->tk_client->cl_metrics); 1290 + spin_lock(&xprt->recv_lock); 1291 + if (!list_empty(&req->rq_list)) { 1292 + list_del(&req->rq_list); 1293 + xprt_wait_on_pinned_rqst(req); 1294 + } 1295 + spin_unlock(&xprt->recv_lock); 1334 1296 spin_lock_bh(&xprt->transport_lock); 1335 1297 xprt->ops->release_xprt(xprt, task); 1336 1298 if (xprt->ops->release_request) 1337 1299 xprt->ops->release_request(task); 1338 - if (!list_empty(&req->rq_list)) 1339 - list_del(&req->rq_list); 1340 1300 xprt->last_used = jiffies; 1341 1301 xprt_schedule_autodisconnect(xprt); 1342 1302 spin_unlock_bh(&xprt->transport_lock); ··· 1366 1318 1367 1319 spin_lock_init(&xprt->transport_lock); 1368 1320 spin_lock_init(&xprt->reserve_lock); 1321 + spin_lock_init(&xprt->recv_lock); 1369 1322 1370 1323 INIT_LIST_HEAD(&xprt->free); 1371 1324 INIT_LIST_HEAD(&xprt->recv);
+26 -43
net/sunrpc/xprtrdma/backchannel.c
··· 49 49 if (IS_ERR(rb)) 50 50 goto out_fail; 51 51 req->rl_rdmabuf = rb; 52 + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); 52 53 53 54 size = r_xprt->rx_data.inline_rsize; 54 55 rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); ··· 203 202 */ 204 203 int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) 205 204 { 206 - struct rpc_xprt *xprt = rqst->rq_xprt; 207 - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 205 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 208 206 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 209 - struct rpcrdma_msg *headerp; 207 + __be32 *p; 210 208 211 - headerp = rdmab_to_msg(req->rl_rdmabuf); 212 - headerp->rm_xid = rqst->rq_xid; 213 - headerp->rm_vers = rpcrdma_version; 214 - headerp->rm_credit = 215 - cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); 216 - headerp->rm_type = rdma_msg; 217 - headerp->rm_body.rm_chunks[0] = xdr_zero; 218 - headerp->rm_body.rm_chunks[1] = xdr_zero; 219 - headerp->rm_body.rm_chunks[2] = xdr_zero; 209 + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); 210 + xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, 211 + req->rl_rdmabuf->rg_base); 212 + 213 + p = xdr_reserve_space(&req->rl_stream, 28); 214 + if (unlikely(!p)) 215 + return -EIO; 216 + *p++ = rqst->rq_xid; 217 + *p++ = rpcrdma_version; 218 + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); 219 + *p++ = rdma_msg; 220 + *p++ = xdr_zero; 221 + *p++ = xdr_zero; 222 + *p = xdr_zero; 220 223 221 224 if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, 222 225 &rqst->rq_snd_buf, rpcrdma_noch)) ··· 276 271 * @xprt: transport receiving the call 277 272 * @rep: receive buffer containing the call 278 273 * 279 - * Called in the RPC reply handler, which runs in a tasklet. 280 - * Be quick about it. 
281 - * 282 274 * Operational assumptions: 283 275 * o Backchannel credits are ignored, just as the NFS server 284 276 * forechannel currently does ··· 286 284 struct rpcrdma_rep *rep) 287 285 { 288 286 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 289 - struct rpcrdma_msg *headerp; 290 287 struct svc_serv *bc_serv; 291 288 struct rpcrdma_req *req; 292 289 struct rpc_rqst *rqst; ··· 293 292 size_t size; 294 293 __be32 *p; 295 294 296 - headerp = rdmab_to_msg(rep->rr_rdmabuf); 295 + p = xdr_inline_decode(&rep->rr_stream, 0); 296 + size = xdr_stream_remaining(&rep->rr_stream); 297 + 297 298 #ifdef RPCRDMA_BACKCHANNEL_DEBUG 298 299 pr_info("RPC: %s: callback XID %08x, length=%u\n", 299 - __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); 300 - pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp); 300 + __func__, be32_to_cpup(p), size); 301 + pr_info("RPC: %s: %*ph\n", __func__, size, p); 301 302 #endif 302 - 303 - /* Sanity check: 304 - * Need at least enough bytes for RPC/RDMA header, as code 305 - * here references the header fields by array offset. Also, 306 - * backward calls are always inline, so ensure there 307 - * are some bytes beyond the RPC/RDMA header. 308 - */ 309 - if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24) 310 - goto out_short; 311 - p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); 312 - size = rep->rr_len - RPCRDMA_HDRLEN_MIN; 313 303 314 304 /* Grab a free bc rqst */ 315 305 spin_lock(&xprt->bc_pa_lock); ··· 317 325 /* Prepare rqst */ 318 326 rqst->rq_reply_bytes_recvd = 0; 319 327 rqst->rq_bytes_sent = 0; 320 - rqst->rq_xid = headerp->rm_xid; 328 + rqst->rq_xid = *p; 321 329 322 330 rqst->rq_private_buf.len = size; 323 331 set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); ··· 329 337 buf->len = size; 330 338 331 339 /* The receive buffer has to be hooked to the rpcrdma_req 332 - * so that it can be reposted after the server is done 333 - * parsing it but just before sending the backward 334 - * direction reply. 
340 + * so that it is not released while the req is pointing 341 + * to its buffer, and so that it can be reposted after 342 + * the Upper Layer is done decoding it. 335 343 */ 336 344 req = rpcr_to_rdmar(rqst); 337 345 dprintk("RPC: %s: attaching rep %p to req %p\n", ··· 359 367 * when the connection is re-established. 360 368 */ 361 369 return; 362 - 363 - out_short: 364 - pr_warn("RPC/RDMA short backward direction call\n"); 365 - 366 - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) 367 - xprt_disconnect_done(xprt); 368 - else 369 - pr_warn("RPC: %s: reposting rep %p\n", 370 - __func__, rep); 371 370 }
+5 -5
net/sunrpc/xprtrdma/fmr_ops.c
··· 177 177 /* Use the ib_map_phys_fmr() verb to register a memory region 178 178 * for remote access via RDMA READ or RDMA WRITE. 179 179 */ 180 - static int 180 + static struct rpcrdma_mr_seg * 181 181 fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 182 182 int nsegs, bool writing, struct rpcrdma_mw **out) 183 183 { ··· 188 188 189 189 mw = rpcrdma_get_mw(r_xprt); 190 190 if (!mw) 191 - return -ENOBUFS; 191 + return ERR_PTR(-ENOBUFS); 192 192 193 193 pageoff = offset_in_page(seg1->mr_offset); 194 194 seg1->mr_offset -= pageoff; /* start of page */ ··· 232 232 mw->mw_offset = dma_pages[0] + pageoff; 233 233 234 234 *out = mw; 235 - return mw->mw_nents; 235 + return seg; 236 236 237 237 out_dmamap_err: 238 238 pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", 239 239 mw->mw_sg, i); 240 240 rpcrdma_put_mw(r_xprt, mw); 241 - return -EIO; 241 + return ERR_PTR(-EIO); 242 242 243 243 out_maperr: 244 244 pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", ··· 247 247 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, 248 248 mw->mw_sg, mw->mw_nents, mw->mw_dir); 249 249 rpcrdma_put_mw(r_xprt, mw); 250 - return -EIO; 250 + return ERR_PTR(-EIO); 251 251 } 252 252 253 253 /* Invalidate all memory regions that were registered for "req".
+6 -6
net/sunrpc/xprtrdma/frwr_ops.c
··· 344 344 /* Post a REG_MR Work Request to register a memory region 345 345 * for remote access via RDMA READ or RDMA WRITE. 346 346 */ 347 - static int 347 + static struct rpcrdma_mr_seg * 348 348 frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 349 349 int nsegs, bool writing, struct rpcrdma_mw **out) 350 350 { ··· 364 364 rpcrdma_defer_mr_recovery(mw); 365 365 mw = rpcrdma_get_mw(r_xprt); 366 366 if (!mw) 367 - return -ENOBUFS; 367 + return ERR_PTR(-ENOBUFS); 368 368 } while (mw->frmr.fr_state != FRMR_IS_INVALID); 369 369 frmr = &mw->frmr; 370 370 frmr->fr_state = FRMR_IS_VALID; ··· 429 429 mw->mw_offset = mr->iova; 430 430 431 431 *out = mw; 432 - return mw->mw_nents; 432 + return seg; 433 433 434 434 out_dmamap_err: 435 435 pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", 436 436 mw->mw_sg, i); 437 437 frmr->fr_state = FRMR_IS_INVALID; 438 438 rpcrdma_put_mw(r_xprt, mw); 439 - return -EIO; 439 + return ERR_PTR(-EIO); 440 440 441 441 out_mapmr_err: 442 442 pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", 443 443 frmr->fr_mr, n, mw->mw_nents); 444 444 rpcrdma_defer_mr_recovery(mw); 445 - return -EIO; 445 + return ERR_PTR(-EIO); 446 446 447 447 out_senderr: 448 448 pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); 449 449 rpcrdma_defer_mr_recovery(mw); 450 - return -ENOTCONN; 450 + return ERR_PTR(-ENOTCONN); 451 451 } 452 452 453 453 /* Invalidate all memory regions that were registered for "req".
+538 -400
net/sunrpc/xprtrdma/rpc_rdma.c
··· 169 169 return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; 170 170 } 171 171 172 - /* Split "vec" on page boundaries into segments. FMR registers pages, 173 - * not a byte range. Other modes coalesce these segments into a single 174 - * MR when they can. 172 + /* Split @vec on page boundaries into SGEs. FMR registers pages, not 173 + * a byte range. Other modes coalesce these SGEs into a single MR 174 + * when they can. 175 + * 176 + * Returns pointer to next available SGE, and bumps the total number 177 + * of SGEs consumed. 175 178 */ 176 - static int 177 - rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) 179 + static struct rpcrdma_mr_seg * 180 + rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, 181 + unsigned int *n) 178 182 { 179 - size_t page_offset; 180 - u32 remaining; 183 + u32 remaining, page_offset; 181 184 char *base; 182 185 183 186 base = vec->iov_base; 184 187 page_offset = offset_in_page(base); 185 188 remaining = vec->iov_len; 186 - while (remaining && n < RPCRDMA_MAX_SEGS) { 187 - seg[n].mr_page = NULL; 188 - seg[n].mr_offset = base; 189 - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); 190 - remaining -= seg[n].mr_len; 191 - base += seg[n].mr_len; 192 - ++n; 189 + while (remaining) { 190 + seg->mr_page = NULL; 191 + seg->mr_offset = base; 192 + seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); 193 + remaining -= seg->mr_len; 194 + base += seg->mr_len; 195 + ++seg; 196 + ++(*n); 193 197 page_offset = 0; 194 198 } 195 - return n; 199 + return seg; 196 200 } 197 201 198 - /* 199 - * Chunk assembly from upper layer xdr_buf. 202 + /* Convert @xdrbuf into SGEs no larger than a page each. As they 203 + * are registered, these SGEs are then coalesced into RDMA segments 204 + * when the selected memreg mode supports it. 200 205 * 201 - * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk 202 - * elements. 
Segments are then coalesced when registered, if possible 203 - * within the selected memreg mode. 204 - * 205 - * Returns positive number of segments converted, or a negative errno. 206 + * Returns positive number of SGEs consumed, or a negative errno. 206 207 */ 207 208 208 209 static int ··· 211 210 unsigned int pos, enum rpcrdma_chunktype type, 212 211 struct rpcrdma_mr_seg *seg) 213 212 { 214 - int len, n, p, page_base; 213 + unsigned long page_base; 214 + unsigned int len, n; 215 215 struct page **ppages; 216 216 217 217 n = 0; 218 - if (pos == 0) { 219 - n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); 220 - if (n == RPCRDMA_MAX_SEGS) 221 - goto out_overflow; 222 - } 218 + if (pos == 0) 219 + seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); 223 220 224 221 len = xdrbuf->page_len; 225 222 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 226 223 page_base = offset_in_page(xdrbuf->page_base); 227 - p = 0; 228 - while (len && n < RPCRDMA_MAX_SEGS) { 229 - if (!ppages[p]) { 230 - /* alloc the pagelist for receiving buffer */ 231 - ppages[p] = alloc_page(GFP_ATOMIC); 232 - if (!ppages[p]) 224 + while (len) { 225 + if (unlikely(!*ppages)) { 226 + /* XXX: Certain upper layer operations do 227 + * not provide receive buffer pages. 
228 + */ 229 + *ppages = alloc_page(GFP_ATOMIC); 230 + if (!*ppages) 233 231 return -EAGAIN; 234 232 } 235 - seg[n].mr_page = ppages[p]; 236 - seg[n].mr_offset = (void *)(unsigned long) page_base; 237 - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); 238 - if (seg[n].mr_len > PAGE_SIZE) 239 - goto out_overflow; 240 - len -= seg[n].mr_len; 233 + seg->mr_page = *ppages; 234 + seg->mr_offset = (char *)page_base; 235 + seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); 236 + len -= seg->mr_len; 237 + ++ppages; 238 + ++seg; 241 239 ++n; 242 - ++p; 243 - page_base = 0; /* page offset only applies to first page */ 240 + page_base = 0; 244 241 } 245 - 246 - /* Message overflows the seg array */ 247 - if (len && n == RPCRDMA_MAX_SEGS) 248 - goto out_overflow; 249 242 250 243 /* When encoding a Read chunk, the tail iovec contains an 251 244 * XDR pad and may be omitted. 252 245 */ 253 246 if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) 254 - return n; 247 + goto out; 255 248 256 249 /* When encoding a Write chunk, some servers need to see an 257 250 * extra segment for non-XDR-aligned Write chunks. The upper ··· 253 258 * for this purpose. 
254 259 */ 255 260 if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) 256 - return n; 261 + goto out; 257 262 258 - if (xdrbuf->tail[0].iov_len) { 259 - n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); 260 - if (n == RPCRDMA_MAX_SEGS) 261 - goto out_overflow; 262 - } 263 + if (xdrbuf->tail[0].iov_len) 264 + seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); 263 265 266 + out: 267 + if (unlikely(n > RPCRDMA_MAX_SEGS)) 268 + return -EIO; 264 269 return n; 265 - 266 - out_overflow: 267 - pr_err("rpcrdma: segment array overflow\n"); 268 - return -EIO; 269 270 } 270 271 271 - static inline __be32 * 272 + static inline int 273 + encode_item_present(struct xdr_stream *xdr) 274 + { 275 + __be32 *p; 276 + 277 + p = xdr_reserve_space(xdr, sizeof(*p)); 278 + if (unlikely(!p)) 279 + return -EMSGSIZE; 280 + 281 + *p = xdr_one; 282 + return 0; 283 + } 284 + 285 + static inline int 286 + encode_item_not_present(struct xdr_stream *xdr) 287 + { 288 + __be32 *p; 289 + 290 + p = xdr_reserve_space(xdr, sizeof(*p)); 291 + if (unlikely(!p)) 292 + return -EMSGSIZE; 293 + 294 + *p = xdr_zero; 295 + return 0; 296 + } 297 + 298 + static void 272 299 xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) 273 300 { 274 301 *iptr++ = cpu_to_be32(mw->mw_handle); 275 302 *iptr++ = cpu_to_be32(mw->mw_length); 276 - return xdr_encode_hyper(iptr, mw->mw_offset); 303 + xdr_encode_hyper(iptr, mw->mw_offset); 277 304 } 278 305 279 - /* XDR-encode the Read list. 
Supports encoding a list of read 306 + static int 307 + encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) 308 + { 309 + __be32 *p; 310 + 311 + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 312 + if (unlikely(!p)) 313 + return -EMSGSIZE; 314 + 315 + xdr_encode_rdma_segment(p, mw); 316 + return 0; 317 + } 318 + 319 + static int 320 + encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, 321 + u32 position) 322 + { 323 + __be32 *p; 324 + 325 + p = xdr_reserve_space(xdr, 6 * sizeof(*p)); 326 + if (unlikely(!p)) 327 + return -EMSGSIZE; 328 + 329 + *p++ = xdr_one; /* Item present */ 330 + *p++ = cpu_to_be32(position); 331 + xdr_encode_rdma_segment(p, mw); 332 + return 0; 333 + } 334 + 335 + /* Register and XDR encode the Read list. Supports encoding a list of read 280 336 * segments that belong to a single read chunk. 281 337 * 282 338 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): ··· 336 290 * N elements, position P (same P for all chunks of same arg!): 337 291 * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 338 292 * 339 - * Returns a pointer to the XDR word in the RDMA header following 340 - * the end of the Read list, or an error pointer. 293 + * Returns zero on success, or a negative errno if a failure occurred. 294 + * @xdr is advanced to the next position in the stream. 295 + * 296 + * Only a single @pos value is currently supported. 
341 297 */ 342 - static __be32 * 343 - rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, 344 - struct rpcrdma_req *req, struct rpc_rqst *rqst, 345 - __be32 *iptr, enum rpcrdma_chunktype rtype) 298 + static noinline int 299 + rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 300 + struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) 346 301 { 302 + struct xdr_stream *xdr = &req->rl_stream; 347 303 struct rpcrdma_mr_seg *seg; 348 304 struct rpcrdma_mw *mw; 349 305 unsigned int pos; 350 - int n, nsegs; 351 - 352 - if (rtype == rpcrdma_noch) { 353 - *iptr++ = xdr_zero; /* item not present */ 354 - return iptr; 355 - } 306 + int nsegs; 356 307 357 308 pos = rqst->rq_snd_buf.head[0].iov_len; 358 309 if (rtype == rpcrdma_areadch) ··· 358 315 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, 359 316 rtype, seg); 360 317 if (nsegs < 0) 361 - return ERR_PTR(nsegs); 318 + return nsegs; 362 319 363 320 do { 364 - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 365 - false, &mw); 366 - if (n < 0) 367 - return ERR_PTR(n); 321 + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 322 + false, &mw); 323 + if (IS_ERR(seg)) 324 + return PTR_ERR(seg); 368 325 rpcrdma_push_mw(mw, &req->rl_registered); 369 326 370 - *iptr++ = xdr_one; /* item present */ 371 - 372 - /* All read segments in this chunk 373 - * have the same "position". 374 - */ 375 - *iptr++ = cpu_to_be32(pos); 376 - iptr = xdr_encode_rdma_segment(iptr, mw); 327 + if (encode_read_segment(xdr, mw, pos) < 0) 328 + return -EMSGSIZE; 377 329 378 330 dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", 379 331 rqst->rq_task->tk_pid, __func__, pos, 380 332 mw->mw_length, (unsigned long long)mw->mw_offset, 381 - mw->mw_handle, n < nsegs ? "more" : "last"); 333 + mw->mw_handle, mw->mw_nents < nsegs ? 
"more" : "last"); 382 334 383 335 r_xprt->rx_stats.read_chunk_count++; 384 - seg += n; 385 - nsegs -= n; 336 + nsegs -= mw->mw_nents; 386 337 } while (nsegs); 387 338 388 - /* Finish Read list */ 389 - *iptr++ = xdr_zero; /* Next item not present */ 390 - return iptr; 339 + return 0; 391 340 } 392 341 393 - /* XDR-encode the Write list. Supports encoding a list containing 394 - * one array of plain segments that belong to a single write chunk. 342 + /* Register and XDR encode the Write list. Supports encoding a list 343 + * containing one array of plain segments that belong to a single 344 + * write chunk. 395 345 * 396 346 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 397 347 * ··· 392 356 * N elements: 393 357 * 1 - N - HLOO - HLOO - ... - HLOO - 0 394 358 * 395 - * Returns a pointer to the XDR word in the RDMA header following 396 - * the end of the Write list, or an error pointer. 359 + * Returns zero on success, or a negative errno if a failure occurred. 360 + * @xdr is advanced to the next position in the stream. 361 + * 362 + * Only a single Write chunk is currently supported. 
397 363 */ 398 - static __be32 * 364 + static noinline int 399 365 rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 400 - struct rpc_rqst *rqst, __be32 *iptr, 401 - enum rpcrdma_chunktype wtype) 366 + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) 402 367 { 368 + struct xdr_stream *xdr = &req->rl_stream; 403 369 struct rpcrdma_mr_seg *seg; 404 370 struct rpcrdma_mw *mw; 405 - int n, nsegs, nchunks; 371 + int nsegs, nchunks; 406 372 __be32 *segcount; 407 - 408 - if (wtype != rpcrdma_writech) { 409 - *iptr++ = xdr_zero; /* no Write list present */ 410 - return iptr; 411 - } 412 373 413 374 seg = req->rl_segments; 414 375 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 415 376 rqst->rq_rcv_buf.head[0].iov_len, 416 377 wtype, seg); 417 378 if (nsegs < 0) 418 - return ERR_PTR(nsegs); 379 + return nsegs; 419 380 420 - *iptr++ = xdr_one; /* Write list present */ 421 - segcount = iptr++; /* save location of segment count */ 381 + if (encode_item_present(xdr) < 0) 382 + return -EMSGSIZE; 383 + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 384 + if (unlikely(!segcount)) 385 + return -EMSGSIZE; 386 + /* Actual value encoded below */ 422 387 423 388 nchunks = 0; 424 389 do { 425 - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 426 - true, &mw); 427 - if (n < 0) 428 - return ERR_PTR(n); 390 + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 391 + true, &mw); 392 + if (IS_ERR(seg)) 393 + return PTR_ERR(seg); 429 394 rpcrdma_push_mw(mw, &req->rl_registered); 430 395 431 - iptr = xdr_encode_rdma_segment(iptr, mw); 396 + if (encode_rdma_segment(xdr, mw) < 0) 397 + return -EMSGSIZE; 432 398 433 399 dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", 434 400 rqst->rq_task->tk_pid, __func__, 435 401 mw->mw_length, (unsigned long long)mw->mw_offset, 436 - mw->mw_handle, n < nsegs ? "more" : "last"); 402 + mw->mw_handle, mw->mw_nents < nsegs ? 
"more" : "last"); 437 403 438 404 r_xprt->rx_stats.write_chunk_count++; 439 405 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 440 406 nchunks++; 441 - seg += n; 442 - nsegs -= n; 407 + nsegs -= mw->mw_nents; 443 408 } while (nsegs); 444 409 445 410 /* Update count of segments in this Write chunk */ 446 411 *segcount = cpu_to_be32(nchunks); 447 412 448 - /* Finish Write list */ 449 - *iptr++ = xdr_zero; /* Next item not present */ 450 - return iptr; 413 + return 0; 451 414 } 452 415 453 - /* XDR-encode the Reply chunk. Supports encoding an array of plain 454 - * segments that belong to a single write (reply) chunk. 416 + /* Register and XDR encode the Reply chunk. Supports encoding an array 417 + * of plain segments that belong to a single write (reply) chunk. 455 418 * 456 419 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 457 420 * ··· 458 423 * N elements: 459 424 * 1 - N - HLOO - HLOO - ... - HLOO 460 425 * 461 - * Returns a pointer to the XDR word in the RDMA header following 462 - * the end of the Reply chunk, or an error pointer. 426 + * Returns zero on success, or a negative errno if a failure occurred. 427 + * @xdr is advanced to the next position in the stream. 
463 428 */ 464 - static __be32 * 465 - rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, 466 - struct rpcrdma_req *req, struct rpc_rqst *rqst, 467 - __be32 *iptr, enum rpcrdma_chunktype wtype) 429 + static noinline int 430 + rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 431 + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) 468 432 { 433 + struct xdr_stream *xdr = &req->rl_stream; 469 434 struct rpcrdma_mr_seg *seg; 470 435 struct rpcrdma_mw *mw; 471 - int n, nsegs, nchunks; 436 + int nsegs, nchunks; 472 437 __be32 *segcount; 473 - 474 - if (wtype != rpcrdma_replych) { 475 - *iptr++ = xdr_zero; /* no Reply chunk present */ 476 - return iptr; 477 - } 478 438 479 439 seg = req->rl_segments; 480 440 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); 481 441 if (nsegs < 0) 482 - return ERR_PTR(nsegs); 442 + return nsegs; 483 443 484 - *iptr++ = xdr_one; /* Reply chunk present */ 485 - segcount = iptr++; /* save location of segment count */ 444 + if (encode_item_present(xdr) < 0) 445 + return -EMSGSIZE; 446 + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 447 + if (unlikely(!segcount)) 448 + return -EMSGSIZE; 449 + /* Actual value encoded below */ 486 450 487 451 nchunks = 0; 488 452 do { 489 - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 490 - true, &mw); 491 - if (n < 0) 492 - return ERR_PTR(n); 453 + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 454 + true, &mw); 455 + if (IS_ERR(seg)) 456 + return PTR_ERR(seg); 493 457 rpcrdma_push_mw(mw, &req->rl_registered); 494 458 495 - iptr = xdr_encode_rdma_segment(iptr, mw); 459 + if (encode_rdma_segment(xdr, mw) < 0) 460 + return -EMSGSIZE; 496 461 497 462 dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", 498 463 rqst->rq_task->tk_pid, __func__, 499 464 mw->mw_length, (unsigned long long)mw->mw_offset, 500 - mw->mw_handle, n < nsegs ? "more" : "last"); 465 + mw->mw_handle, mw->mw_nents < nsegs ? 
"more" : "last"); 501 466 502 467 r_xprt->rx_stats.reply_chunk_count++; 503 468 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 504 469 nchunks++; 505 - seg += n; 506 - nsegs -= n; 470 + nsegs -= mw->mw_nents; 507 471 } while (nsegs); 508 472 509 473 /* Update count of segments in the Reply chunk */ 510 474 *segcount = cpu_to_be32(nchunks); 511 475 512 - return iptr; 476 + return 0; 513 477 } 514 478 515 479 /* Prepare the RPC-over-RDMA header SGE. ··· 685 651 req->rl_mapped_sges = 0; 686 652 } 687 653 688 - /* 689 - * Marshal a request: the primary job of this routine is to choose 690 - * the transfer modes. See comments below. 654 + /** 655 + * rpcrdma_marshal_req - Marshal and send one RPC request 656 + * @r_xprt: controlling transport 657 + * @rqst: RPC request to be marshaled 691 658 * 692 - * Returns zero on success, otherwise a negative errno. 659 + * For the RPC in "rqst", this function: 660 + * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) 661 + * - Registers Read, Write, and Reply chunks 662 + * - Constructs the transport header 663 + * - Posts a Send WR to send the transport header and request 664 + * 665 + * Returns: 666 + * %0 if the RPC was sent successfully, 667 + * %-ENOTCONN if the connection was lost, 668 + * %-EAGAIN if not enough pages are available for on-demand reply buffer, 669 + * %-ENOBUFS if no MRs are available to register chunks, 670 + * %-EMSGSIZE if the transport header is too small, 671 + * %-EIO if a permanent problem occurred while marshaling. 
693 672 */ 694 - 695 673 int 696 - rpcrdma_marshal_req(struct rpc_rqst *rqst) 674 + rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) 697 675 { 698 - struct rpc_xprt *xprt = rqst->rq_xprt; 699 - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 700 676 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 677 + struct xdr_stream *xdr = &req->rl_stream; 701 678 enum rpcrdma_chunktype rtype, wtype; 702 - struct rpcrdma_msg *headerp; 703 679 bool ddp_allowed; 704 - ssize_t hdrlen; 705 - size_t rpclen; 706 - __be32 *iptr; 680 + __be32 *p; 681 + int ret; 707 682 708 683 #if defined(CONFIG_SUNRPC_BACKCHANNEL) 709 684 if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) 710 685 return rpcrdma_bc_marshal_reply(rqst); 711 686 #endif 712 687 713 - headerp = rdmab_to_msg(req->rl_rdmabuf); 714 - /* don't byte-swap XID, it's already done in request */ 715 - headerp->rm_xid = rqst->rq_xid; 716 - headerp->rm_vers = rpcrdma_version; 717 - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 718 - headerp->rm_type = rdma_msg; 688 + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); 689 + xdr_init_encode(xdr, &req->rl_hdrbuf, 690 + req->rl_rdmabuf->rg_base); 691 + 692 + /* Fixed header fields */ 693 + ret = -EMSGSIZE; 694 + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 695 + if (!p) 696 + goto out_err; 697 + *p++ = rqst->rq_xid; 698 + *p++ = rpcrdma_version; 699 + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 719 700 720 701 /* When the ULP employs a GSS flavor that guarantees integrity 721 702 * or privacy, direct data placement of individual data items ··· 770 721 * by themselves are larger than the inline threshold. 
771 722 */ 772 723 if (rpcrdma_args_inline(r_xprt, rqst)) { 724 + *p++ = rdma_msg; 773 725 rtype = rpcrdma_noch; 774 - rpclen = rqst->rq_snd_buf.len; 775 726 } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 727 + *p++ = rdma_msg; 776 728 rtype = rpcrdma_readch; 777 - rpclen = rqst->rq_snd_buf.head[0].iov_len + 778 - rqst->rq_snd_buf.tail[0].iov_len; 779 729 } else { 780 730 r_xprt->rx_stats.nomsg_call_count++; 781 - headerp->rm_type = htonl(RDMA_NOMSG); 731 + *p++ = rdma_nomsg; 782 732 rtype = rpcrdma_areadch; 783 - rpclen = 0; 784 733 } 785 - 786 - req->rl_xid = rqst->rq_xid; 787 - rpcrdma_insert_req(&r_xprt->rx_buf, req); 788 734 789 735 /* This implementation supports the following combinations 790 736 * of chunk lists in one RPC-over-RDMA Call message: ··· 803 759 * send a Call message with a Position Zero Read chunk and a 804 760 * regular Read chunk at the same time. 805 761 */ 806 - iptr = headerp->rm_body.rm_chunks; 807 - iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); 808 - if (IS_ERR(iptr)) 762 + if (rtype != rpcrdma_noch) { 763 + ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); 764 + if (ret) 765 + goto out_err; 766 + } 767 + ret = encode_item_not_present(xdr); 768 + if (ret) 809 769 goto out_err; 810 - iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); 811 - if (IS_ERR(iptr)) 812 - goto out_err; 813 - iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); 814 - if (IS_ERR(iptr)) 815 - goto out_err; 816 - hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; 817 770 818 - dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", 771 + if (wtype == rpcrdma_writech) { 772 + ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); 773 + if (ret) 774 + goto out_err; 775 + } 776 + ret = encode_item_not_present(xdr); 777 + if (ret) 778 + goto out_err; 779 + 780 + if (wtype != rpcrdma_replych) 781 + ret = encode_item_not_present(xdr); 782 + else 783 + ret = 
rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); 784 + if (ret) 785 + goto out_err; 786 + 787 + dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n", 819 788 rqst->rq_task->tk_pid, __func__, 820 789 transfertypes[rtype], transfertypes[wtype], 821 - hdrlen, rpclen); 790 + xdr_stream_pos(xdr)); 822 791 823 - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, 792 + if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, 793 + xdr_stream_pos(xdr), 824 794 &rqst->rq_snd_buf, rtype)) { 825 - iptr = ERR_PTR(-EIO); 795 + ret = -EIO; 826 796 goto out_err; 827 797 } 828 798 return 0; 829 799 830 800 out_err: 831 - if (PTR_ERR(iptr) != -ENOBUFS) { 832 - pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", 833 - PTR_ERR(iptr)); 801 + if (ret != -ENOBUFS) { 802 + pr_err("rpcrdma: header marshaling failed (%d)\n", ret); 834 803 r_xprt->rx_stats.failed_marshal_count++; 835 804 } 836 - return PTR_ERR(iptr); 837 - } 838 - 839 - /* 840 - * Chase down a received write or reply chunklist to get length 841 - * RDMA'd by server. See map at rpcrdma_create_chunks()! 
:-) 842 - */ 843 - static int 844 - rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) 845 - { 846 - unsigned int i, total_len; 847 - struct rpcrdma_write_chunk *cur_wchunk; 848 - char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); 849 - 850 - i = be32_to_cpu(**iptrp); 851 - cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); 852 - total_len = 0; 853 - while (i--) { 854 - struct rpcrdma_segment *seg = &cur_wchunk->wc_target; 855 - ifdebug(FACILITY) { 856 - u64 off; 857 - xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); 858 - dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n", 859 - __func__, 860 - be32_to_cpu(seg->rs_length), 861 - (unsigned long long)off, 862 - be32_to_cpu(seg->rs_handle)); 863 - } 864 - total_len += be32_to_cpu(seg->rs_length); 865 - ++cur_wchunk; 866 - } 867 - /* check and adjust for properly terminated write chunk */ 868 - if (wrchunk) { 869 - __be32 *w = (__be32 *) cur_wchunk; 870 - if (*w++ != xdr_zero) 871 - return -1; 872 - cur_wchunk = (struct rpcrdma_write_chunk *) w; 873 - } 874 - if ((char *)cur_wchunk > base + rep->rr_len) 875 - return -1; 876 - 877 - *iptrp = (__be32 *) cur_wchunk; 878 - return total_len; 805 + return ret; 879 806 } 880 807 881 808 /** ··· 964 949 } 965 950 } 966 951 967 - #if defined(CONFIG_SUNRPC_BACKCHANNEL) 968 952 /* By convention, backchannel calls arrive via rdma_msg type 969 953 * messages, and never populate the chunk lists. This makes 970 954 * the RPC/RDMA header small and fixed in size, so it is 971 955 * straightforward to check the RPC header's direction field. 
972 956 */ 973 957 static bool 974 - rpcrdma_is_bcall(struct rpcrdma_msg *headerp) 958 + rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 959 + __be32 xid, __be32 proc) 960 + #if defined(CONFIG_SUNRPC_BACKCHANNEL) 975 961 { 976 - __be32 *p = (__be32 *)headerp; 962 + struct xdr_stream *xdr = &rep->rr_stream; 963 + __be32 *p; 977 964 978 - if (headerp->rm_type != rdma_msg) 979 - return false; 980 - if (headerp->rm_body.rm_chunks[0] != xdr_zero) 981 - return false; 982 - if (headerp->rm_body.rm_chunks[1] != xdr_zero) 983 - return false; 984 - if (headerp->rm_body.rm_chunks[2] != xdr_zero) 965 + if (proc != rdma_msg) 985 966 return false; 986 967 987 - /* sanity */ 988 - if (p[7] != headerp->rm_xid) 968 + /* Peek at stream contents without advancing. */ 969 + p = xdr_inline_decode(xdr, 0); 970 + 971 + /* Chunk lists */ 972 + if (*p++ != xdr_zero) 989 973 return false; 990 - /* call direction */ 991 - if (p[8] != cpu_to_be32(RPC_CALL)) 974 + if (*p++ != xdr_zero) 975 + return false; 976 + if (*p++ != xdr_zero) 992 977 return false; 993 978 979 + /* RPC header */ 980 + if (*p++ != xid) 981 + return false; 982 + if (*p != cpu_to_be32(RPC_CALL)) 983 + return false; 984 + 985 + /* Now that we are sure this is a backchannel call, 986 + * advance to the RPC header. 
987 + */ 988 + p = xdr_inline_decode(xdr, 3 * sizeof(*p)); 989 + if (unlikely(!p)) 990 + goto out_short; 991 + 992 + rpcrdma_bc_receive_call(r_xprt, rep); 993 + return true; 994 + 995 + out_short: 996 + pr_warn("RPC/RDMA short backward direction call\n"); 997 + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) 998 + xprt_disconnect_done(&r_xprt->rx_xprt); 994 999 return true; 995 1000 } 1001 + #else /* CONFIG_SUNRPC_BACKCHANNEL */ 1002 + { 1003 + return false; 1004 + } 996 1005 #endif /* CONFIG_SUNRPC_BACKCHANNEL */ 1006 + 1007 + static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) 1008 + { 1009 + __be32 *p; 1010 + 1011 + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); 1012 + if (unlikely(!p)) 1013 + return -EIO; 1014 + 1015 + ifdebug(FACILITY) { 1016 + u64 offset; 1017 + u32 handle; 1018 + 1019 + handle = be32_to_cpup(p++); 1020 + *length = be32_to_cpup(p++); 1021 + xdr_decode_hyper(p, &offset); 1022 + dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n", 1023 + __func__, *length, (unsigned long long)offset, 1024 + handle); 1025 + } else { 1026 + *length = be32_to_cpup(p + 1); 1027 + } 1028 + 1029 + return 0; 1030 + } 1031 + 1032 + static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) 1033 + { 1034 + u32 segcount, seglength; 1035 + __be32 *p; 1036 + 1037 + p = xdr_inline_decode(xdr, sizeof(*p)); 1038 + if (unlikely(!p)) 1039 + return -EIO; 1040 + 1041 + *length = 0; 1042 + segcount = be32_to_cpup(p); 1043 + while (segcount--) { 1044 + if (decode_rdma_segment(xdr, &seglength)) 1045 + return -EIO; 1046 + *length += seglength; 1047 + } 1048 + 1049 + dprintk("RPC: %s: segcount=%u, %u bytes\n", 1050 + __func__, be32_to_cpup(p), *length); 1051 + return 0; 1052 + } 1053 + 1054 + /* In RPC-over-RDMA Version One replies, a Read list is never 1055 + * expected. This decoder is a stub that returns an error if 1056 + * a Read list is present. 
1057 + */ 1058 + static int decode_read_list(struct xdr_stream *xdr) 1059 + { 1060 + __be32 *p; 1061 + 1062 + p = xdr_inline_decode(xdr, sizeof(*p)); 1063 + if (unlikely(!p)) 1064 + return -EIO; 1065 + if (unlikely(*p != xdr_zero)) 1066 + return -EIO; 1067 + return 0; 1068 + } 1069 + 1070 + /* Supports only one Write chunk in the Write list 1071 + */ 1072 + static int decode_write_list(struct xdr_stream *xdr, u32 *length) 1073 + { 1074 + u32 chunklen; 1075 + bool first; 1076 + __be32 *p; 1077 + 1078 + *length = 0; 1079 + first = true; 1080 + do { 1081 + p = xdr_inline_decode(xdr, sizeof(*p)); 1082 + if (unlikely(!p)) 1083 + return -EIO; 1084 + if (*p == xdr_zero) 1085 + break; 1086 + if (!first) 1087 + return -EIO; 1088 + 1089 + if (decode_write_chunk(xdr, &chunklen)) 1090 + return -EIO; 1091 + *length += chunklen; 1092 + first = false; 1093 + } while (true); 1094 + return 0; 1095 + } 1096 + 1097 + static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) 1098 + { 1099 + __be32 *p; 1100 + 1101 + p = xdr_inline_decode(xdr, sizeof(*p)); 1102 + if (unlikely(!p)) 1103 + return -EIO; 1104 + 1105 + *length = 0; 1106 + if (*p != xdr_zero) 1107 + if (decode_write_chunk(xdr, length)) 1108 + return -EIO; 1109 + return 0; 1110 + } 1111 + 1112 + static int 1113 + rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1114 + struct rpc_rqst *rqst) 1115 + { 1116 + struct xdr_stream *xdr = &rep->rr_stream; 1117 + u32 writelist, replychunk, rpclen; 1118 + char *base; 1119 + 1120 + /* Decode the chunk lists */ 1121 + if (decode_read_list(xdr)) 1122 + return -EIO; 1123 + if (decode_write_list(xdr, &writelist)) 1124 + return -EIO; 1125 + if (decode_reply_chunk(xdr, &replychunk)) 1126 + return -EIO; 1127 + 1128 + /* RDMA_MSG sanity checks */ 1129 + if (unlikely(replychunk)) 1130 + return -EIO; 1131 + 1132 + /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ 1133 + base = (char *)xdr_inline_decode(xdr, 0); 1134 + rpclen = 
xdr_stream_remaining(xdr); 1135 + r_xprt->rx_stats.fixup_copy_count += 1136 + rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); 1137 + 1138 + r_xprt->rx_stats.total_rdma_reply += writelist; 1139 + return rpclen + xdr_align_size(writelist); 1140 + } 1141 + 1142 + static noinline int 1143 + rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) 1144 + { 1145 + struct xdr_stream *xdr = &rep->rr_stream; 1146 + u32 writelist, replychunk; 1147 + 1148 + /* Decode the chunk lists */ 1149 + if (decode_read_list(xdr)) 1150 + return -EIO; 1151 + if (decode_write_list(xdr, &writelist)) 1152 + return -EIO; 1153 + if (decode_reply_chunk(xdr, &replychunk)) 1154 + return -EIO; 1155 + 1156 + /* RDMA_NOMSG sanity checks */ 1157 + if (unlikely(writelist)) 1158 + return -EIO; 1159 + if (unlikely(!replychunk)) 1160 + return -EIO; 1161 + 1162 + /* Reply chunk buffer already is the reply vector */ 1163 + r_xprt->rx_stats.total_rdma_reply += replychunk; 1164 + return replychunk; 1165 + } 1166 + 1167 + static noinline int 1168 + rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1169 + struct rpc_rqst *rqst) 1170 + { 1171 + struct xdr_stream *xdr = &rep->rr_stream; 1172 + __be32 *p; 1173 + 1174 + p = xdr_inline_decode(xdr, sizeof(*p)); 1175 + if (unlikely(!p)) 1176 + return -EIO; 1177 + 1178 + switch (*p) { 1179 + case err_vers: 1180 + p = xdr_inline_decode(xdr, 2 * sizeof(*p)); 1181 + if (!p) 1182 + break; 1183 + dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", 1184 + rqst->rq_task->tk_pid, __func__, 1185 + be32_to_cpup(p), be32_to_cpu(*(p + 1))); 1186 + break; 1187 + case err_chunk: 1188 + dprintk("RPC: %5u: %s: server reports header decoding error\n", 1189 + rqst->rq_task->tk_pid, __func__); 1190 + break; 1191 + default: 1192 + dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", 1193 + rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); 1194 + } 1195 + 1196 + r_xprt->rx_stats.bad_reply_count++; 1197 + return 
-EREMOTEIO; 1198 + } 997 1199 998 1200 /* Process received RPC/RDMA messages. 999 1201 * ··· 1223 991 struct rpcrdma_rep *rep = 1224 992 container_of(work, struct rpcrdma_rep, rr_work); 1225 993 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; 1226 - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1227 994 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1228 - struct rpcrdma_msg *headerp; 995 + struct xdr_stream *xdr = &rep->rr_stream; 1229 996 struct rpcrdma_req *req; 1230 997 struct rpc_rqst *rqst; 1231 - __be32 *iptr; 1232 - int rdmalen, status, rmerr; 998 + __be32 *p, xid, vers, proc; 1233 999 unsigned long cwnd; 1234 - struct list_head mws; 1000 + int status; 1235 1001 1236 1002 dprintk("RPC: %s: incoming rep %p\n", __func__, rep); 1237 1003 1238 - if (rep->rr_len == RPCRDMA_BAD_LEN) 1004 + if (rep->rr_hdrbuf.head[0].iov_len == 0) 1239 1005 goto out_badstatus; 1240 - if (rep->rr_len < RPCRDMA_HDRLEN_ERR) 1241 - goto out_shortreply; 1242 1006 1243 - headerp = rdmab_to_msg(rep->rr_rdmabuf); 1244 - #if defined(CONFIG_SUNRPC_BACKCHANNEL) 1245 - if (rpcrdma_is_bcall(headerp)) 1246 - goto out_bcall; 1247 - #endif 1007 + xdr_init_decode(xdr, &rep->rr_hdrbuf, 1008 + rep->rr_hdrbuf.head[0].iov_base); 1009 + 1010 + /* Fixed transport header fields */ 1011 + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); 1012 + if (unlikely(!p)) 1013 + goto out_shortreply; 1014 + xid = *p++; 1015 + vers = *p++; 1016 + p++; /* credits */ 1017 + proc = *p++; 1018 + 1019 + if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) 1020 + return; 1248 1021 1249 1022 /* Match incoming rpcrdma_rep to an rpcrdma_req to 1250 1023 * get context for handling any incoming chunks. 
1251 1024 */ 1252 - spin_lock(&buf->rb_lock); 1253 - req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, 1254 - headerp->rm_xid); 1255 - if (!req) 1256 - goto out_nomatch; 1257 - if (req->rl_reply) 1258 - goto out_duplicate; 1259 - 1260 - list_replace_init(&req->rl_registered, &mws); 1261 - rpcrdma_mark_remote_invalidation(&mws, rep); 1262 - 1263 - /* Avoid races with signals and duplicate replies 1264 - * by marking this req as matched. 1265 - */ 1025 + spin_lock(&xprt->recv_lock); 1026 + rqst = xprt_lookup_rqst(xprt, xid); 1027 + if (!rqst) 1028 + goto out_norqst; 1029 + xprt_pin_rqst(rqst); 1030 + spin_unlock(&xprt->recv_lock); 1031 + req = rpcr_to_rdmar(rqst); 1266 1032 req->rl_reply = rep; 1267 - spin_unlock(&buf->rb_lock); 1268 1033 1269 1034 dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", 1270 - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); 1035 + __func__, rep, req, be32_to_cpu(xid)); 1271 1036 1272 1037 /* Invalidate and unmap the data payloads before waking the 1273 1038 * waiting application. This guarantees the memory regions ··· 1273 1044 * waking the next RPC waits until this RPC has relinquished 1274 1045 * all its Send Queue entries. 1275 1046 */ 1276 - if (!list_empty(&mws)) 1277 - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &mws); 1278 - 1279 - /* Perform XID lookup, reconstruction of the RPC reply, and 1280 - * RPC completion while holding the transport lock to ensure 1281 - * the rep, rqst, and rq_task pointers remain stable. 1282 - */ 1283 - spin_lock_bh(&xprt->transport_lock); 1284 - rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); 1285 - if (!rqst) 1286 - goto out_norqst; 1287 - xprt->reestablish_timeout = 0; 1288 - if (headerp->rm_vers != rpcrdma_version) 1289 - goto out_badversion; 1290 - 1291 - /* check for expected message types */ 1292 - /* The order of some of these tests is important. 
*/ 1293 - switch (headerp->rm_type) { 1294 - case rdma_msg: 1295 - /* never expect read chunks */ 1296 - /* never expect reply chunks (two ways to check) */ 1297 - if (headerp->rm_body.rm_chunks[0] != xdr_zero || 1298 - (headerp->rm_body.rm_chunks[1] == xdr_zero && 1299 - headerp->rm_body.rm_chunks[2] != xdr_zero)) 1300 - goto badheader; 1301 - if (headerp->rm_body.rm_chunks[1] != xdr_zero) { 1302 - /* count any expected write chunks in read reply */ 1303 - /* start at write chunk array count */ 1304 - iptr = &headerp->rm_body.rm_chunks[2]; 1305 - rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); 1306 - /* check for validity, and no reply chunk after */ 1307 - if (rdmalen < 0 || *iptr++ != xdr_zero) 1308 - goto badheader; 1309 - rep->rr_len -= 1310 - ((unsigned char *)iptr - (unsigned char *)headerp); 1311 - status = rep->rr_len + rdmalen; 1312 - r_xprt->rx_stats.total_rdma_reply += rdmalen; 1313 - /* special case - last chunk may omit padding */ 1314 - if (rdmalen &= 3) { 1315 - rdmalen = 4 - rdmalen; 1316 - status += rdmalen; 1317 - } 1318 - } else { 1319 - /* else ordinary inline */ 1320 - rdmalen = 0; 1321 - iptr = (__be32 *)((unsigned char *)headerp + 1322 - RPCRDMA_HDRLEN_MIN); 1323 - rep->rr_len -= RPCRDMA_HDRLEN_MIN; 1324 - status = rep->rr_len; 1325 - } 1326 - 1327 - r_xprt->rx_stats.fixup_copy_count += 1328 - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, 1329 - rdmalen); 1330 - break; 1331 - 1332 - case rdma_nomsg: 1333 - /* never expect read or write chunks, always reply chunks */ 1334 - if (headerp->rm_body.rm_chunks[0] != xdr_zero || 1335 - headerp->rm_body.rm_chunks[1] != xdr_zero || 1336 - headerp->rm_body.rm_chunks[2] != xdr_one) 1337 - goto badheader; 1338 - iptr = (__be32 *)((unsigned char *)headerp + 1339 - RPCRDMA_HDRLEN_MIN); 1340 - rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); 1341 - if (rdmalen < 0) 1342 - goto badheader; 1343 - r_xprt->rx_stats.total_rdma_reply += rdmalen; 1344 - /* Reply chunk buffer already is the reply vector - no 
fixup. */ 1345 - status = rdmalen; 1346 - break; 1347 - 1348 - case rdma_error: 1349 - goto out_rdmaerr; 1350 - 1351 - badheader: 1352 - default: 1353 - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", 1354 - rqst->rq_task->tk_pid, __func__, 1355 - be32_to_cpu(headerp->rm_type)); 1356 - status = -EIO; 1357 - r_xprt->rx_stats.bad_reply_count++; 1358 - break; 1047 + if (!list_empty(&req->rl_registered)) { 1048 + rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); 1049 + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, 1050 + &req->rl_registered); 1359 1051 } 1360 1052 1053 + xprt->reestablish_timeout = 0; 1054 + if (vers != rpcrdma_version) 1055 + goto out_badversion; 1056 + 1057 + switch (proc) { 1058 + case rdma_msg: 1059 + status = rpcrdma_decode_msg(r_xprt, rep, rqst); 1060 + break; 1061 + case rdma_nomsg: 1062 + status = rpcrdma_decode_nomsg(r_xprt, rep); 1063 + break; 1064 + case rdma_error: 1065 + status = rpcrdma_decode_error(r_xprt, rep, rqst); 1066 + break; 1067 + default: 1068 + status = -EIO; 1069 + } 1070 + if (status < 0) 1071 + goto out_badheader; 1072 + 1361 1073 out: 1074 + spin_lock(&xprt->recv_lock); 1362 1075 cwnd = xprt->cwnd; 1363 1076 xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; 1364 1077 if (xprt->cwnd > cwnd) 1365 1078 xprt_release_rqst_cong(rqst->rq_task); 1366 1079 1367 1080 xprt_complete_rqst(rqst->rq_task, status); 1368 - spin_unlock_bh(&xprt->transport_lock); 1081 + xprt_unpin_rqst(rqst); 1082 + spin_unlock(&xprt->recv_lock); 1369 1083 dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", 1370 1084 __func__, xprt, rqst, status); 1371 1085 return; ··· 1321 1149 } 1322 1150 return; 1323 1151 1324 - #if defined(CONFIG_SUNRPC_BACKCHANNEL) 1325 - out_bcall: 1326 - rpcrdma_bc_receive_call(r_xprt, rep); 1327 - return; 1328 - #endif 1329 - 1330 1152 /* If the incoming reply terminated a pending RPC, the next 1331 1153 * RPC call will post a replacement receive buffer as it is 1332 1154 * being marshaled. 
1333 1155 */ 1334 1156 out_badversion: 1335 1157 dprintk("RPC: %s: invalid version %d\n", 1336 - __func__, be32_to_cpu(headerp->rm_vers)); 1158 + __func__, be32_to_cpu(vers)); 1337 1159 status = -EIO; 1338 1160 r_xprt->rx_stats.bad_reply_count++; 1339 1161 goto out; 1340 1162 1341 - out_rdmaerr: 1342 - rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err); 1343 - switch (rmerr) { 1344 - case ERR_VERS: 1345 - pr_err("%s: server reports header version error (%u-%u)\n", 1346 - __func__, 1347 - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low), 1348 - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high)); 1349 - break; 1350 - case ERR_CHUNK: 1351 - pr_err("%s: server reports header decoding error\n", 1352 - __func__); 1353 - break; 1354 - default: 1355 - pr_err("%s: server reports unknown error %d\n", 1356 - __func__, rmerr); 1357 - } 1358 - status = -EREMOTEIO; 1163 + out_badheader: 1164 + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", 1165 + rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc)); 1359 1166 r_xprt->rx_stats.bad_reply_count++; 1167 + status = -EIO; 1360 1168 goto out; 1361 1169 1362 - /* The req was still available, but by the time the transport_lock 1170 + /* The req was still available, but by the time the recv_lock 1363 1171 * was acquired, the rqst and task had been released. Thus the RPC 1364 1172 * has already been terminated. 
1365 1173 */ 1366 1174 out_norqst: 1367 - spin_unlock_bh(&xprt->transport_lock); 1368 - rpcrdma_buffer_put(req); 1369 - dprintk("RPC: %s: race, no rqst left for req %p\n", 1370 - __func__, req); 1371 - return; 1175 + spin_unlock(&xprt->recv_lock); 1176 + dprintk("RPC: %s: no match for incoming xid 0x%08x\n", 1177 + __func__, be32_to_cpu(xid)); 1178 + goto repost; 1372 1179 1373 1180 out_shortreply: 1374 1181 dprintk("RPC: %s: short/invalid reply\n", __func__); 1375 1182 goto repost; 1376 - 1377 - out_nomatch: 1378 - spin_unlock(&buf->rb_lock); 1379 - dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n", 1380 - __func__, be32_to_cpu(headerp->rm_xid), 1381 - rep->rr_len); 1382 - goto repost; 1383 - 1384 - out_duplicate: 1385 - spin_unlock(&buf->rb_lock); 1386 - dprintk("RPC: %s: " 1387 - "duplicate reply %p to RPC request %p: xid 0x%08x\n", 1388 - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); 1389 1183 1390 1184 /* If no pending RPC transaction was matched, post a replacement 1391 1185 * receive buffer before returning.
+6 -3
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
··· 52 52 if (src->iov_len < 24) 53 53 goto out_shortreply; 54 54 55 - spin_lock_bh(&xprt->transport_lock); 55 + spin_lock(&xprt->recv_lock); 56 56 req = xprt_lookup_rqst(xprt, xid); 57 57 if (!req) 58 58 goto out_notfound; ··· 69 69 else if (credits > r_xprt->rx_buf.rb_bc_max_requests) 70 70 credits = r_xprt->rx_buf.rb_bc_max_requests; 71 71 72 + spin_lock_bh(&xprt->transport_lock); 72 73 cwnd = xprt->cwnd; 73 74 xprt->cwnd = credits << RPC_CWNDSHIFT; 74 75 if (xprt->cwnd > cwnd) 75 76 xprt_release_rqst_cong(req->rq_task); 77 + spin_unlock_bh(&xprt->transport_lock); 78 + 76 79 77 80 ret = 0; 78 81 xprt_complete_rqst(req->rq_task, rcvbuf->len); 79 82 rcvbuf->len = 0; 80 83 81 84 out_unlock: 82 - spin_unlock_bh(&xprt->transport_lock); 85 + spin_unlock(&xprt->recv_lock); 83 86 out: 84 87 return ret; 85 88 ··· 269 266 module_put(THIS_MODULE); 270 267 } 271 268 272 - static struct rpc_xprt_ops xprt_rdma_bc_procs = { 269 + static const struct rpc_xprt_ops xprt_rdma_bc_procs = { 273 270 .reserve_xprt = xprt_reserve_xprt_cong, 274 271 .release_xprt = xprt_release_xprt_cong, 275 272 .alloc_slot = xprt_alloc_slot,
+4 -4
net/sunrpc/xprtrdma/transport.c
··· 149 149 150 150 #endif 151 151 152 - static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ 152 + static const struct rpc_xprt_ops xprt_rdma_procs; 153 153 154 154 static void 155 155 xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) ··· 559 559 560 560 r_xprt->rx_stats.hardway_register_count += size; 561 561 req->rl_rdmabuf = rb; 562 + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); 562 563 return true; 563 564 } 564 565 ··· 685 684 686 685 dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); 687 686 688 - rpcrdma_remove_req(&r_xprt->rx_buf, req); 689 687 if (!list_empty(&req->rl_registered)) 690 688 ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); 691 689 rpcrdma_unmap_sges(ia, req); ··· 730 730 if (unlikely(!list_empty(&req->rl_registered))) 731 731 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); 732 732 733 - rc = rpcrdma_marshal_req(rqst); 733 + rc = rpcrdma_marshal_req(r_xprt, rqst); 734 734 if (rc < 0) 735 735 goto failed_marshal; 736 736 ··· 811 811 * Plumbing for rpc transport switch and kernel module 812 812 */ 813 813 814 - static struct rpc_xprt_ops xprt_rdma_procs = { 814 + static const struct rpc_xprt_ops xprt_rdma_procs = { 815 815 .reserve_xprt = xprt_reserve_xprt_cong, 816 816 .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ 817 817 .alloc_slot = xprt_alloc_slot,
+9 -13
net/sunrpc/xprtrdma/verbs.c
··· 139 139 static void 140 140 rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) 141 141 { 142 - struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf); 143 142 struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf; 143 + __be32 *p = rep->rr_rdmabuf->rg_base; 144 144 u32 credits; 145 145 146 - if (rep->rr_len < RPCRDMA_HDRLEN_ERR) 147 - return; 148 - 149 - credits = be32_to_cpu(rmsgp->rm_credit); 146 + credits = be32_to_cpup(p + 2); 150 147 if (credits == 0) 151 148 credits = 1; /* don't deadlock */ 152 149 else if (credits > buffer->rb_max_requests) ··· 170 173 goto out_fail; 171 174 172 175 /* status == SUCCESS means all fields in wc are trustworthy */ 173 - if (wc->opcode != IB_WC_RECV) 174 - return; 175 - 176 176 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", 177 177 __func__, rep, wc->byte_len); 178 178 179 - rep->rr_len = wc->byte_len; 179 + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); 180 180 rep->rr_wc_flags = wc->wc_flags; 181 181 rep->rr_inv_rkey = wc->ex.invalidate_rkey; 182 182 183 183 ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), 184 184 rdmab_addr(rep->rr_rdmabuf), 185 - rep->rr_len, DMA_FROM_DEVICE); 185 + wc->byte_len, DMA_FROM_DEVICE); 186 186 187 - rpcrdma_update_granted_credits(rep); 187 + if (wc->byte_len >= RPCRDMA_HDRLEN_ERR) 188 + rpcrdma_update_granted_credits(rep); 188 189 189 190 out_schedule: 190 191 queue_work(rpcrdma_receive_wq, &rep->rr_work); ··· 193 198 pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", 194 199 ib_wc_status_msg(wc->status), 195 200 wc->status, wc->vendor_err); 196 - rep->rr_len = RPCRDMA_BAD_LEN; 201 + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0); 197 202 goto out_schedule; 198 203 } 199 204 ··· 969 974 rc = PTR_ERR(rep->rr_rdmabuf); 970 975 goto out_free; 971 976 } 977 + xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, 978 + rdmab_length(rep->rr_rdmabuf)); 972 979 973 980 rep->rr_cqe.done = rpcrdma_wc_receive; 974 981 rep->rr_rxprt = r_xprt; ··· 1001 1004 
spin_lock_init(&buf->rb_recovery_lock); 1002 1005 INIT_LIST_HEAD(&buf->rb_mws); 1003 1006 INIT_LIST_HEAD(&buf->rb_all); 1004 - INIT_LIST_HEAD(&buf->rb_pending); 1005 1007 INIT_LIST_HEAD(&buf->rb_stale_mrs); 1006 1008 INIT_DELAYED_WORK(&buf->rb_refresh_worker, 1007 1009 rpcrdma_mr_refresh_worker);
+22 -41
net/sunrpc/xprtrdma/xprt_rdma.h
··· 218 218 219 219 struct rpcrdma_rep { 220 220 struct ib_cqe rr_cqe; 221 - unsigned int rr_len; 222 221 int rr_wc_flags; 223 222 u32 rr_inv_rkey; 223 + struct rpcrdma_regbuf *rr_rdmabuf; 224 224 struct rpcrdma_xprt *rr_rxprt; 225 225 struct work_struct rr_work; 226 + struct xdr_buf rr_hdrbuf; 227 + struct xdr_stream rr_stream; 226 228 struct list_head rr_list; 227 229 struct ib_recv_wr rr_recv_wr; 228 - struct rpcrdma_regbuf *rr_rdmabuf; 229 230 }; 230 - 231 - #define RPCRDMA_BAD_LEN (~0U) 232 231 233 232 /* 234 233 * struct rpcrdma_mw - external memory region metadata ··· 340 341 struct rpcrdma_buffer; 341 342 struct rpcrdma_req { 342 343 struct list_head rl_list; 343 - __be32 rl_xid; 344 344 unsigned int rl_mapped_sges; 345 345 unsigned int rl_connect_cookie; 346 346 struct rpcrdma_buffer *rl_buffer; 347 347 struct rpcrdma_rep *rl_reply; 348 + struct xdr_stream rl_stream; 349 + struct xdr_buf rl_hdrbuf; 348 350 struct ib_send_wr rl_send_wr; 349 351 struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; 350 352 struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ ··· 403 403 int rb_send_count, rb_recv_count; 404 404 struct list_head rb_send_bufs; 405 405 struct list_head rb_recv_bufs; 406 - struct list_head rb_pending; 407 406 u32 rb_max_requests; 408 407 atomic_t rb_credits; /* most recent credit grant */ 409 408 ··· 439 440 * Statistics for RPCRDMA 440 441 */ 441 442 struct rpcrdma_stats { 443 + /* accessed when sending a call */ 442 444 unsigned long read_chunk_count; 443 445 unsigned long write_chunk_count; 444 446 unsigned long reply_chunk_count; 445 - 446 447 unsigned long long total_rdma_request; 447 - unsigned long long total_rdma_reply; 448 448 449 + /* rarely accessed error counters */ 449 450 unsigned long long pullup_copy_count; 450 - unsigned long long fixup_copy_count; 451 451 unsigned long hardway_register_count; 452 452 unsigned long failed_marshal_count; 453 453 unsigned long bad_reply_count; 454 - unsigned long nomsg_call_count; 455 - unsigned long 
bcall_count; 456 454 unsigned long mrs_recovered; 457 455 unsigned long mrs_orphaned; 458 456 unsigned long mrs_allocated; 457 + 458 + /* accessed when receiving a reply */ 459 + unsigned long long total_rdma_reply; 460 + unsigned long long fixup_copy_count; 459 461 unsigned long local_inv_needed; 462 + unsigned long nomsg_call_count; 463 + unsigned long bcall_count; 460 464 }; 461 465 462 466 /* ··· 467 465 */ 468 466 struct rpcrdma_xprt; 469 467 struct rpcrdma_memreg_ops { 470 - int (*ro_map)(struct rpcrdma_xprt *, 468 + struct rpcrdma_mr_seg * 469 + (*ro_map)(struct rpcrdma_xprt *, 471 470 struct rpcrdma_mr_seg *, int, bool, 472 471 struct rpcrdma_mw **); 473 472 void (*ro_unmap_sync)(struct rpcrdma_xprt *, ··· 555 552 int rpcrdma_buffer_create(struct rpcrdma_xprt *); 556 553 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); 557 554 558 - static inline void 559 - rpcrdma_insert_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) 560 - { 561 - spin_lock(&buffers->rb_lock); 562 - if (list_empty(&req->rl_list)) 563 - list_add_tail(&req->rl_list, &buffers->rb_pending); 564 - spin_unlock(&buffers->rb_lock); 565 - } 566 - 567 - static inline struct rpcrdma_req * 568 - rpcrdma_lookup_req_locked(struct rpcrdma_buffer *buffers, __be32 xid) 569 - { 570 - struct rpcrdma_req *pos; 571 - 572 - list_for_each_entry(pos, &buffers->rb_pending, rl_list) 573 - if (pos->rl_xid == xid) 574 - return pos; 575 - return NULL; 576 - } 577 - 578 - static inline void 579 - rpcrdma_remove_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) 580 - { 581 - spin_lock(&buffers->rb_lock); 582 - list_del(&req->rl_list); 583 - spin_unlock(&buffers->rb_lock); 584 - } 585 - 586 555 struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); 587 556 void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); 588 557 struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); ··· 613 638 bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, 614 639 u32, 
struct xdr_buf *, enum rpcrdma_chunktype); 615 640 void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); 616 - int rpcrdma_marshal_req(struct rpc_rqst *); 641 + int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); 617 642 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); 618 643 void rpcrdma_reply_handler(struct work_struct *work); 644 + 645 + static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) 646 + { 647 + xdr->head[0].iov_len = len; 648 + xdr->len = len; 649 + } 619 650 620 651 /* RPC/RDMA module init - xprtrdma/transport.c 621 652 */
+48 -44
net/sunrpc/xprtsock.c
··· 969 969 return; 970 970 971 971 /* Look up and lock the request corresponding to the given XID */ 972 - spin_lock_bh(&xprt->transport_lock); 972 + spin_lock(&xprt->recv_lock); 973 973 rovr = xprt_lookup_rqst(xprt, *xp); 974 974 if (!rovr) 975 975 goto out_unlock; 976 + xprt_pin_rqst(rovr); 977 + spin_unlock(&xprt->recv_lock); 976 978 task = rovr->rq_task; 977 979 978 980 copied = rovr->rq_private_buf.buflen; ··· 983 981 984 982 if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { 985 983 dprintk("RPC: sk_buff copy failed\n"); 986 - goto out_unlock; 984 + spin_lock(&xprt->recv_lock); 985 + goto out_unpin; 987 986 } 988 987 988 + spin_lock(&xprt->recv_lock); 989 989 xprt_complete_rqst(task, copied); 990 - 990 + out_unpin: 991 + xprt_unpin_rqst(rovr); 991 992 out_unlock: 992 - spin_unlock_bh(&xprt->transport_lock); 993 + spin_unlock(&xprt->recv_lock); 993 994 } 994 995 995 996 static void xs_local_data_receive(struct sock_xprt *transport) ··· 1055 1050 return; 1056 1051 1057 1052 /* Look up and lock the request corresponding to the given XID */ 1058 - spin_lock_bh(&xprt->transport_lock); 1053 + spin_lock(&xprt->recv_lock); 1059 1054 rovr = xprt_lookup_rqst(xprt, *xp); 1060 1055 if (!rovr) 1061 1056 goto out_unlock; 1057 + xprt_pin_rqst(rovr); 1058 + spin_unlock(&xprt->recv_lock); 1062 1059 task = rovr->rq_task; 1063 1060 1064 1061 if ((copied = rovr->rq_private_buf.buflen) > repsize) ··· 1069 1062 /* Suck it into the iovec, verify checksum if not done by hw. 
*/ 1070 1063 if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { 1071 1064 __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); 1072 - goto out_unlock; 1065 + spin_lock(&xprt->recv_lock); 1066 + goto out_unpin; 1073 1067 } 1074 1068 1075 1069 __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); 1076 1070 1071 + spin_lock_bh(&xprt->transport_lock); 1077 1072 xprt_adjust_cwnd(xprt, task, copied); 1078 - xprt_complete_rqst(task, copied); 1079 - 1080 - out_unlock: 1081 1073 spin_unlock_bh(&xprt->transport_lock); 1074 + spin_lock(&xprt->recv_lock); 1075 + xprt_complete_rqst(task, copied); 1076 + out_unpin: 1077 + xprt_unpin_rqst(rovr); 1078 + out_unlock: 1079 + spin_unlock(&xprt->recv_lock); 1082 1080 } 1083 1081 1084 1082 static void xs_udp_data_receive(struct sock_xprt *transport) ··· 1289 1277 } 1290 1278 1291 1279 len = desc->count; 1292 - if (len > transport->tcp_reclen - transport->tcp_offset) { 1293 - struct xdr_skb_reader my_desc; 1294 - 1295 - len = transport->tcp_reclen - transport->tcp_offset; 1296 - memcpy(&my_desc, desc, sizeof(my_desc)); 1297 - my_desc.count = len; 1298 - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, 1299 - &my_desc, xdr_skb_read_bits); 1300 - desc->count -= r; 1301 - desc->offset += r; 1302 - } else 1303 - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, 1280 + if (len > transport->tcp_reclen - transport->tcp_offset) 1281 + desc->count = transport->tcp_reclen - transport->tcp_offset; 1282 + r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, 1304 1283 desc, xdr_skb_read_bits); 1305 1284 1306 - if (r > 0) { 1307 - transport->tcp_copied += r; 1308 - transport->tcp_offset += r; 1309 - } 1310 - if (r != len) { 1285 + if (desc->count) { 1311 1286 /* Error when copying to the receive buffer, 1312 1287 * usually because we weren't able to allocate 1313 1288 * additional buffer pages. 
All we can do now ··· 1313 1314 transport->tcp_offset, transport->tcp_reclen); 1314 1315 return; 1315 1316 } 1317 + 1318 + transport->tcp_copied += r; 1319 + transport->tcp_offset += r; 1320 + desc->count = len - r; 1316 1321 1317 1322 dprintk("RPC: XID %08x read %zd bytes\n", 1318 1323 ntohl(transport->tcp_xid), r); ··· 1346 1343 dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); 1347 1344 1348 1345 /* Find and lock the request corresponding to this xid */ 1349 - spin_lock_bh(&xprt->transport_lock); 1346 + spin_lock(&xprt->recv_lock); 1350 1347 req = xprt_lookup_rqst(xprt, transport->tcp_xid); 1351 1348 if (!req) { 1352 1349 dprintk("RPC: XID %08x request not found!\n", 1353 1350 ntohl(transport->tcp_xid)); 1354 - spin_unlock_bh(&xprt->transport_lock); 1351 + spin_unlock(&xprt->recv_lock); 1355 1352 return -1; 1356 1353 } 1354 + xprt_pin_rqst(req); 1355 + spin_unlock(&xprt->recv_lock); 1357 1356 1358 1357 xs_tcp_read_common(xprt, desc, req); 1359 1358 1359 + spin_lock(&xprt->recv_lock); 1360 1360 if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) 1361 1361 xprt_complete_rqst(req->rq_task, transport->tcp_copied); 1362 - 1363 - spin_unlock_bh(&xprt->transport_lock); 1362 + xprt_unpin_rqst(req); 1363 + spin_unlock(&xprt->recv_lock); 1364 1364 return 0; 1365 1365 } 1366 1366 ··· 1382 1376 container_of(xprt, struct sock_xprt, xprt); 1383 1377 struct rpc_rqst *req; 1384 1378 1385 - /* Look up and lock the request corresponding to the given XID */ 1386 - spin_lock_bh(&xprt->transport_lock); 1379 + /* Look up the request corresponding to the given XID */ 1387 1380 req = xprt_lookup_bc_request(xprt, transport->tcp_xid); 1388 1381 if (req == NULL) { 1389 - spin_unlock_bh(&xprt->transport_lock); 1390 1382 printk(KERN_WARNING "Callback slot table overflowed\n"); 1391 1383 xprt_force_disconnect(xprt); 1392 1384 return -1; ··· 1395 1391 1396 1392 if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) 1397 1393 xprt_complete_bc_request(req, transport->tcp_copied); 1398 - 
spin_unlock_bh(&xprt->transport_lock); 1399 1394 1400 1395 return 0; 1401 1396 } ··· 1519 1516 .arg.data = xprt, 1520 1517 }; 1521 1518 unsigned long total = 0; 1519 + int loop; 1522 1520 int read = 0; 1523 1521 1524 1522 mutex_lock(&transport->recv_mutex); ··· 1528 1524 goto out; 1529 1525 1530 1526 /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ 1531 - for (;;) { 1527 + for (loop = 0; loop < 64; loop++) { 1532 1528 lock_sock(sk); 1533 1529 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); 1534 1530 if (read <= 0) { 1535 1531 clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); 1536 1532 release_sock(sk); 1537 - if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 1538 - break; 1539 - } else { 1540 - release_sock(sk); 1541 - total += read; 1533 + break; 1542 1534 } 1535 + release_sock(sk); 1536 + total += read; 1543 1537 rd_desc.count = 65536; 1544 1538 } 1539 + if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 1540 + queue_work(xprtiod_workqueue, &transport->recv_worker); 1545 1541 out: 1546 1542 mutex_unlock(&transport->recv_mutex); 1547 1543 trace_xs_tcp_data_ready(xprt, read, total); ··· 2728 2724 module_put(THIS_MODULE); 2729 2725 } 2730 2726 2731 - static struct rpc_xprt_ops xs_local_ops = { 2727 + static const struct rpc_xprt_ops xs_local_ops = { 2732 2728 .reserve_xprt = xprt_reserve_xprt, 2733 2729 .release_xprt = xs_tcp_release_xprt, 2734 2730 .alloc_slot = xprt_alloc_slot, ··· 2746 2742 .disable_swap = xs_disable_swap, 2747 2743 }; 2748 2744 2749 - static struct rpc_xprt_ops xs_udp_ops = { 2745 + static const struct rpc_xprt_ops xs_udp_ops = { 2750 2746 .set_buffer_size = xs_udp_set_buffer_size, 2751 2747 .reserve_xprt = xprt_reserve_xprt_cong, 2752 2748 .release_xprt = xprt_release_xprt_cong, ··· 2768 2764 .inject_disconnect = xs_inject_disconnect, 2769 2765 }; 2770 2766 2771 - static struct rpc_xprt_ops xs_tcp_ops = { 2767 + static const struct rpc_xprt_ops xs_tcp_ops = { 2772 2768 .reserve_xprt = 
xprt_reserve_xprt, 2773 2769 .release_xprt = xs_tcp_release_xprt, 2774 2770 .alloc_slot = xprt_lock_and_alloc_slot, ··· 2799 2795 * The rpc_xprt_ops for the server backchannel 2800 2796 */ 2801 2797 2802 - static struct rpc_xprt_ops bc_tcp_ops = { 2798 + static const struct rpc_xprt_ops bc_tcp_ops = { 2803 2799 .reserve_xprt = xprt_reserve_xprt, 2804 2800 .release_xprt = xprt_release_xprt, 2805 2801 .alloc_slot = xprt_alloc_slot,