Merge branch 'for-linus' of git://linux-nfs.org/~bfields/linux

* 'for-linus' of git://linux-nfs.org/~bfields/linux: (100 commits)
SUNRPC: RPC program information is stored in unsigned integers
SUNRPC: Move exported symbol definitions after function declaration part 2
NLM: tear down RPC clients in nlm_shutdown_hosts
SUNRPC: spin svc_rqst initialization to its own function
nfsd: more careful input validation in nfsctl write methods
lockd: minor log message fix
knfsd: don't bother mapping putrootfh enoent to eperm
rdma: makefile
rdma: ONCRPC RDMA protocol marshalling
rdma: SVCRDMA sendto
rdma: SVCRDMA recvfrom
rdma: SVCRDMA Core Transport Services
rdma: SVCRDMA Transport Module
rdma: SVCRDMA Header File
svc: Add svc_xprt_names service to replace svc_sock_names
knfsd: Support adding transports by writing portlist file
svc: Add svc API that queries for a transport instance
svc: Add /proc/sys/sunrpc/transport files
svc: Add transport hdr size for defer/revisit
svc: Move the xprt independent code to the svc_xprt.c file
...

+5449 -1663
+1 -1
MAINTAINERS
··· 2247 M: bfields@fieldses.org 2248 P: Neil Brown 2249 M: neilb@suse.de 2250 - L: nfs@lists.sourceforge.net 2251 W: http://nfs.sourceforge.net/ 2252 S: Supported 2253
··· 2247 M: bfields@fieldses.org 2248 P: Neil Brown 2249 M: neilb@suse.de 2250 + L: linux-nfs@vger.kernel.org 2251 W: http://nfs.sourceforge.net/ 2252 S: Supported 2253
+2
fs/Kconfig
··· 1674 select CRYPTO_MD5 if NFSD_V4 1675 select CRYPTO if NFSD_V4 1676 select FS_POSIX_ACL if NFSD_V4 1677 help 1678 If you want your Linux box to act as an NFS *server*, so that other 1679 computers on your local network which support NFS can access certain
··· 1674 select CRYPTO_MD5 if NFSD_V4 1675 select CRYPTO if NFSD_V4 1676 select FS_POSIX_ACL if NFSD_V4 1677 + select PROC_FS if NFSD_V4 1678 + select PROC_FS if SUNRPC_GSS 1679 help 1680 If you want your Linux box to act as an NFS *server*, so that other 1681 computers on your local network which support NFS can access certain
+17 -9
fs/lockd/host.c
··· 34 35 static void nlm_gc_hosts(void); 36 static struct nsm_handle * __nsm_find(const struct sockaddr_in *, 37 - const char *, int, int); 38 static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, 39 const char *hostname, 40 - int hostname_len); 41 42 /* 43 * Common host lookup routine for server & client ··· 45 static struct nlm_host * 46 nlm_lookup_host(int server, const struct sockaddr_in *sin, 47 int proto, int version, const char *hostname, 48 - int hostname_len, const struct sockaddr_in *ssin) 49 { 50 struct hlist_head *chain; 51 struct hlist_node *pos; ··· 177 */ 178 struct nlm_host * 179 nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, 180 - const char *hostname, int hostname_len) 181 { 182 struct sockaddr_in ssin = {0}; 183 ··· 190 */ 191 struct nlm_host * 192 nlmsvc_lookup_host(struct svc_rqst *rqstp, 193 - const char *hostname, int hostname_len) 194 { 195 struct sockaddr_in ssin = {0}; 196 ··· 308 * Release all resources held by that peer. 309 */ 310 void nlm_host_rebooted(const struct sockaddr_in *sin, 311 - const char *hostname, int hostname_len, 312 u32 new_state) 313 { 314 struct hlist_head *chain; ··· 379 /* First, make all hosts eligible for gc */ 380 dprintk("lockd: nuking all hosts...\n"); 381 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 382 - hlist_for_each_entry(host, pos, chain, h_hash) 383 host->h_expires = jiffies - 1; 384 } 385 386 /* Then, perform a garbage collection pass */ ··· 456 457 static struct nsm_handle * 458 __nsm_find(const struct sockaddr_in *sin, 459 - const char *hostname, int hostname_len, 460 int create) 461 { 462 struct nsm_handle *nsm = NULL; ··· 510 } 511 512 static struct nsm_handle * 513 - nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len) 514 { 515 return __nsm_find(sin, hostname, hostname_len, 1); 516 }
··· 34 35 static void nlm_gc_hosts(void); 36 static struct nsm_handle * __nsm_find(const struct sockaddr_in *, 37 + const char *, unsigned int, int); 38 static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, 39 const char *hostname, 40 + unsigned int hostname_len); 41 42 /* 43 * Common host lookup routine for server & client ··· 45 static struct nlm_host * 46 nlm_lookup_host(int server, const struct sockaddr_in *sin, 47 int proto, int version, const char *hostname, 48 + unsigned int hostname_len, 49 + const struct sockaddr_in *ssin) 50 { 51 struct hlist_head *chain; 52 struct hlist_node *pos; ··· 176 */ 177 struct nlm_host * 178 nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, 179 + const char *hostname, unsigned int hostname_len) 180 { 181 struct sockaddr_in ssin = {0}; 182 ··· 189 */ 190 struct nlm_host * 191 nlmsvc_lookup_host(struct svc_rqst *rqstp, 192 + const char *hostname, unsigned int hostname_len) 193 { 194 struct sockaddr_in ssin = {0}; 195 ··· 307 * Release all resources held by that peer. 308 */ 309 void nlm_host_rebooted(const struct sockaddr_in *sin, 310 + const char *hostname, 311 + unsigned int hostname_len, 312 u32 new_state) 313 { 314 struct hlist_head *chain; ··· 377 /* First, make all hosts eligible for gc */ 378 dprintk("lockd: nuking all hosts...\n"); 379 for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { 380 + hlist_for_each_entry(host, pos, chain, h_hash) { 381 host->h_expires = jiffies - 1; 382 + if (host->h_rpcclnt) { 383 + rpc_shutdown_client(host->h_rpcclnt); 384 + host->h_rpcclnt = NULL; 385 + } 386 + } 387 } 388 389 /* Then, perform a garbage collection pass */ ··· 449 450 static struct nsm_handle * 451 __nsm_find(const struct sockaddr_in *sin, 452 + const char *hostname, unsigned int hostname_len, 453 int create) 454 { 455 struct nsm_handle *nsm = NULL; ··· 503 } 504 505 static struct nsm_handle * 506 + nsm_find(const struct sockaddr_in *sin, const char *hostname, 507 + unsigned int hostname_len) 508 { 509 return __nsm_find(sin, hostname, hostname_len, 1); 510 }
+17 -22
fs/lockd/svc.c
··· 219 module_put_and_exit(0); 220 } 221 222 - 223 - static int find_socket(struct svc_serv *serv, int proto) 224 - { 225 - struct svc_sock *svsk; 226 - int found = 0; 227 - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) 228 - if (svsk->sk_sk->sk_protocol == proto) { 229 - found = 1; 230 - break; 231 - } 232 - return found; 233 - } 234 - 235 /* 236 * Make any sockets that are needed but not present. 237 * If nlm_udpport or nlm_tcpport were set as module ··· 227 static int make_socks(struct svc_serv *serv, int proto) 228 { 229 static int warned; 230 int err = 0; 231 232 - if (proto == IPPROTO_UDP || nlm_udpport) 233 - if (!find_socket(serv, IPPROTO_UDP)) 234 - err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport, 235 - SVC_SOCK_DEFAULTS); 236 - if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) 237 - if (!find_socket(serv, IPPROTO_TCP)) 238 - err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport, 239 - SVC_SOCK_DEFAULTS); 240 - 241 if (err >= 0) { 242 warned = 0; 243 err = 0;
··· 219 module_put_and_exit(0); 220 } 221 222 /* 223 * Make any sockets that are needed but not present. 224 * If nlm_udpport or nlm_tcpport were set as module ··· 240 static int make_socks(struct svc_serv *serv, int proto) 241 { 242 static int warned; 243 + struct svc_xprt *xprt; 244 int err = 0; 245 246 + if (proto == IPPROTO_UDP || nlm_udpport) { 247 + xprt = svc_find_xprt(serv, "udp", 0, 0); 248 + if (!xprt) 249 + err = svc_create_xprt(serv, "udp", nlm_udpport, 250 + SVC_SOCK_DEFAULTS); 251 + else 252 + svc_xprt_put(xprt); 253 + } 254 + if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) { 255 + xprt = svc_find_xprt(serv, "tcp", 0, 0); 256 + if (!xprt) 257 + err = svc_create_xprt(serv, "tcp", nlm_tcpport, 258 + SVC_SOCK_DEFAULTS); 259 + else 260 + svc_xprt_put(xprt); 261 + } 262 if (err >= 0) { 263 warned = 0; 264 err = 0;
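The rewritten make_socks() above drops the private find_socket() helper in favour of the generic svc_xprt API. A minimal sketch of the same lookup-or-create idiom, assuming only the svc_find_xprt()/svc_create_xprt()/svc_xprt_put() calls visible in this hunk (the helper name is illustrative, not part of the series):

        /* Create a listener for the named transport class only if the
         * service does not already have one; otherwise drop the
         * reference returned by the lookup. */
        static int ensure_listener(struct svc_serv *serv, const char *name,
                                   unsigned short port)
        {
                struct svc_xprt *xprt;
                int err = 0;

                xprt = svc_find_xprt(serv, name, 0, 0);
                if (!xprt)
                        err = svc_create_xprt(serv, name, port,
                                              SVC_SOCK_DEFAULTS);
                else
                        svc_xprt_put(xprt);
                return err;
        }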
+12 -8
fs/lockd/svc4proc.c
··· 84 { 85 struct nlm_host *host; 86 struct nlm_file *file; 87 88 dprintk("lockd: TEST4 called\n"); 89 resp->cookie = argp->cookie; ··· 92 /* Don't accept test requests during grace period */ 93 if (nlmsvc_grace_period) { 94 resp->status = nlm_lck_denied_grace_period; 95 - return rpc_success; 96 } 97 98 /* Obtain client and file */ ··· 102 /* Now check for conflicting locks */ 103 resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie); 104 if (resp->status == nlm_drop_reply) 105 - return rpc_drop_reply; 106 107 - dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); 108 nlm_release_host(host); 109 nlm_release_file(file); 110 - return rpc_success; 111 } 112 113 static __be32 ··· 117 { 118 struct nlm_host *host; 119 struct nlm_file *file; 120 121 dprintk("lockd: LOCK called\n"); 122 ··· 126 /* Don't accept new lock requests during grace period */ 127 if (nlmsvc_grace_period && !argp->reclaim) { 128 resp->status = nlm_lck_denied_grace_period; 129 - return rpc_success; 130 } 131 132 /* Obtain client and file */ ··· 149 resp->status = nlmsvc_lock(rqstp, file, &argp->lock, 150 argp->block, &argp->cookie); 151 if (resp->status == nlm_drop_reply) 152 - return rpc_drop_reply; 153 154 - dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 155 nlm_release_host(host); 156 nlm_release_file(file); 157 - return rpc_success; 158 } 159 160 static __be32
··· 84 { 85 struct nlm_host *host; 86 struct nlm_file *file; 87 + int rc = rpc_success; 88 89 dprintk("lockd: TEST4 called\n"); 90 resp->cookie = argp->cookie; ··· 91 /* Don't accept test requests during grace period */ 92 if (nlmsvc_grace_period) { 93 resp->status = nlm_lck_denied_grace_period; 94 + return rc; 95 } 96 97 /* Obtain client and file */ ··· 101 /* Now check for conflicting locks */ 102 resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie); 103 if (resp->status == nlm_drop_reply) 104 + rc = rpc_drop_reply; 105 + else 106 + dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); 107 108 nlm_release_host(host); 109 nlm_release_file(file); 110 + return rc; 111 } 112 113 static __be32 ··· 115 { 116 struct nlm_host *host; 117 struct nlm_file *file; 118 + int rc = rpc_success; 119 120 dprintk("lockd: LOCK called\n"); 121 ··· 123 /* Don't accept new lock requests during grace period */ 124 if (nlmsvc_grace_period && !argp->reclaim) { 125 resp->status = nlm_lck_denied_grace_period; 126 + return rc; 127 } 128 129 /* Obtain client and file */ ··· 146 resp->status = nlmsvc_lock(rqstp, file, &argp->lock, 147 argp->block, &argp->cookie); 148 if (resp->status == nlm_drop_reply) 149 + rc = rpc_drop_reply; 150 + else 151 + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 152 153 nlm_release_host(host); 154 nlm_release_file(file); 155 + return rc; 156 } 157 158 static __be32
+12 -8
fs/lockd/svclock.c
··· 501 block, block->b_flags, block->b_fl); 502 if (block->b_flags & B_TIMED_OUT) { 503 nlmsvc_unlink_block(block); 504 - return nlm_lck_denied; 505 } 506 if (block->b_flags & B_GOT_CALLBACK) { 507 if (block->b_fl != NULL 508 && block->b_fl->fl_type != F_UNLCK) { 509 lock->fl = *block->b_fl; 510 goto conf_lock; 511 - } 512 - else { 513 - nlmsvc_unlink_block(block); 514 - return nlm_granted; 515 } 516 } 517 - return nlm_drop_reply; 518 } 519 520 error = vfs_test_lock(file->f_file, &lock->fl); 521 - if (error == -EINPROGRESS) 522 - return nlmsvc_defer_lock_rqst(rqstp, block); 523 if (error) { 524 ret = nlm_lck_denied_nolocks; 525 goto out;
··· 501 block, block->b_flags, block->b_fl); 502 if (block->b_flags & B_TIMED_OUT) { 503 nlmsvc_unlink_block(block); 504 + ret = nlm_lck_denied; 505 + goto out; 506 } 507 if (block->b_flags & B_GOT_CALLBACK) { 508 + nlmsvc_unlink_block(block); 509 if (block->b_fl != NULL 510 && block->b_fl->fl_type != F_UNLCK) { 511 lock->fl = *block->b_fl; 512 goto conf_lock; 513 + } else { 514 + ret = nlm_granted; 515 + goto out; 516 } 517 } 518 + ret = nlm_drop_reply; 519 + goto out; 520 } 521 522 error = vfs_test_lock(file->f_file, &lock->fl); 523 + if (error == -EINPROGRESS) { 524 + ret = nlmsvc_defer_lock_rqst(rqstp, block); 525 + goto out; 526 + } 527 if (error) { 528 ret = nlm_lck_denied_nolocks; 529 goto out;
+13 -9
fs/lockd/svcproc.c
··· 113 { 114 struct nlm_host *host; 115 struct nlm_file *file; 116 117 dprintk("lockd: TEST called\n"); 118 resp->cookie = argp->cookie; ··· 121 /* Don't accept test requests during grace period */ 122 if (nlmsvc_grace_period) { 123 resp->status = nlm_lck_denied_grace_period; 124 - return rpc_success; 125 } 126 127 /* Obtain client and file */ ··· 131 /* Now check for conflicting locks */ 132 resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie)); 133 if (resp->status == nlm_drop_reply) 134 - return rpc_drop_reply; 135 136 - dprintk("lockd: TEST status %d vers %d\n", 137 - ntohl(resp->status), rqstp->rq_vers); 138 nlm_release_host(host); 139 nlm_release_file(file); 140 - return rpc_success; 141 } 142 143 static __be32 ··· 147 { 148 struct nlm_host *host; 149 struct nlm_file *file; 150 151 dprintk("lockd: LOCK called\n"); 152 ··· 156 /* Don't accept new lock requests during grace period */ 157 if (nlmsvc_grace_period && !argp->reclaim) { 158 resp->status = nlm_lck_denied_grace_period; 159 - return rpc_success; 160 } 161 162 /* Obtain client and file */ ··· 179 resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock, 180 argp->block, &argp->cookie)); 181 if (resp->status == nlm_drop_reply) 182 - return rpc_drop_reply; 183 184 - dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 185 nlm_release_host(host); 186 nlm_release_file(file); 187 - return rpc_success; 188 } 189 190 static __be32
··· 113 { 114 struct nlm_host *host; 115 struct nlm_file *file; 116 + int rc = rpc_success; 117 118 dprintk("lockd: TEST called\n"); 119 resp->cookie = argp->cookie; ··· 120 /* Don't accept test requests during grace period */ 121 if (nlmsvc_grace_period) { 122 resp->status = nlm_lck_denied_grace_period; 123 + return rc; 124 } 125 126 /* Obtain client and file */ ··· 130 /* Now check for conflicting locks */ 131 resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie)); 132 if (resp->status == nlm_drop_reply) 133 + rc = rpc_drop_reply; 134 + else 135 + dprintk("lockd: TEST status %d vers %d\n", 136 + ntohl(resp->status), rqstp->rq_vers); 137 138 nlm_release_host(host); 139 nlm_release_file(file); 140 + return rc; 141 } 142 143 static __be32 ··· 145 { 146 struct nlm_host *host; 147 struct nlm_file *file; 148 + int rc = rpc_success; 149 150 dprintk("lockd: LOCK called\n"); 151 ··· 153 /* Don't accept new lock requests during grace period */ 154 if (nlmsvc_grace_period && !argp->reclaim) { 155 resp->status = nlm_lck_denied_grace_period; 156 + return rc; 157 } 158 159 /* Obtain client and file */ ··· 176 resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock, 177 argp->block, &argp->cookie)); 178 if (resp->status == nlm_drop_reply) 179 + rc = rpc_drop_reply; 180 + else 181 + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); 182 183 nlm_release_host(host); 184 nlm_release_file(file); 185 + return rc; 186 } 187 188 static __be32
+1 -1
fs/lockd/svcsubs.c
··· 87 unsigned int hash; 88 __be32 nfserr; 89 90 - nlm_debug_print_fh("nlm_file_lookup", f); 91 92 hash = file_hash(f); 93
··· 87 unsigned int hash; 88 __be32 nfserr; 89 90 + nlm_debug_print_fh("nlm_lookup_file", f); 91 92 hash = file_hash(f); 93
+2 -2
fs/nfs/callback.c
··· 119 if (!serv) 120 goto out_err; 121 122 - ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport, 123 - SVC_SOCK_ANONYMOUS); 124 if (ret <= 0) 125 goto out_destroy; 126 nfs_callback_tcpport = ret;
··· 119 if (!serv) 120 goto out_err; 121 122 + ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, 123 + SVC_SOCK_ANONYMOUS); 124 if (ret <= 0) 125 goto out_destroy; 126 nfs_callback_tcpport = ret;
+11 -9
fs/nfsd/export.c
··· 1357 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); 1358 1359 exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); 1360 - if (PTR_ERR(exp) == -ENOENT) 1361 - return nfserr_perm; 1362 if (IS_ERR(exp)) 1363 return nfserrno(PTR_ERR(exp)); 1364 rv = fh_compose(fhp, exp, exp->ex_dentry, NULL); ··· 1635 /* 1636 * Initialize the exports module. 1637 */ 1638 - void 1639 nfsd_export_init(void) 1640 { 1641 dprintk("nfsd: initializing export module.\n"); 1642 1643 - cache_register(&svc_export_cache); 1644 - cache_register(&svc_expkey_cache); 1645 1646 } 1647 ··· 1674 1675 exp_writelock(); 1676 1677 - if (cache_unregister(&svc_expkey_cache)) 1678 - printk(KERN_ERR "nfsd: failed to unregister expkey cache\n"); 1679 - if (cache_unregister(&svc_export_cache)) 1680 - printk(KERN_ERR "nfsd: failed to unregister export cache\n"); 1681 svcauth_unix_purge(); 1682 1683 exp_writeunlock();
··· 1357 mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); 1358 1359 exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); 1360 if (IS_ERR(exp)) 1361 return nfserrno(PTR_ERR(exp)); 1362 rv = fh_compose(fhp, exp, exp->ex_dentry, NULL); ··· 1637 /* 1638 * Initialize the exports module. 1639 */ 1640 + int 1641 nfsd_export_init(void) 1642 { 1643 + int rv; 1644 dprintk("nfsd: initializing export module.\n"); 1645 1646 + rv = cache_register(&svc_export_cache); 1647 + if (rv) 1648 + return rv; 1649 + rv = cache_register(&svc_expkey_cache); 1650 + if (rv) 1651 + cache_unregister(&svc_export_cache); 1652 + return rv; 1653 1654 } 1655 ··· 1670 1671 exp_writelock(); 1672 1673 + cache_unregister(&svc_expkey_cache); 1674 + cache_unregister(&svc_export_cache); 1675 svcauth_unix_purge(); 1676 1677 exp_writeunlock();
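nfsd_export_init() now propagates cache_register() failures and unregisters the export cache if registering the expkey cache fails. A minimal sketch of that register-or-roll-back pattern, assuming cache_register()/cache_unregister() operate on struct cache_detail as in this hunk (the helper name and prototype are illustrative):

        /* Register two caches; if the second registration fails, undo
         * the first so the caller sees an all-or-nothing result. */
        static int register_cache_pair(struct cache_detail *a,
                                       struct cache_detail *b)
        {
                int rv;

                rv = cache_register(a);
                if (rv)
                        return rv;
                rv = cache_register(b);
                if (rv)
                        cache_unregister(a);
                return rv;
        }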
+6 -1
fs/nfsd/nfs2acl.c
··· 221 struct nfsd3_getaclres *resp) 222 { 223 struct dentry *dentry = resp->fh.fh_dentry; 224 - struct inode *inode = dentry->d_inode; 225 struct kvec *head = rqstp->rq_res.head; 226 unsigned int base; 227 int n; 228 int w; 229 230 if (dentry == NULL || dentry->d_inode == NULL) 231 return 0; 232 inode = dentry->d_inode;
··· 221 struct nfsd3_getaclres *resp) 222 { 223 struct dentry *dentry = resp->fh.fh_dentry; 224 + struct inode *inode; 225 struct kvec *head = rqstp->rq_res.head; 226 unsigned int base; 227 int n; 228 int w; 229 230 + /* 231 + * Since this is version 2, the check for nfserr in 232 + * nfsd_dispatch actually ensures the following cannot happen. 233 + * However, it seems fragile to depend on that. 234 + */ 235 if (dentry == NULL || dentry->d_inode == NULL) 236 return 0; 237 inode = dentry->d_inode;
+11 -10
fs/nfsd/nfs3xdr.c
··· 21 #include <linux/sunrpc/svc.h> 22 #include <linux/nfsd/nfsd.h> 23 #include <linux/nfsd/xdr3.h> 24 25 #define NFSDDBG_FACILITY NFSDDBG_XDR 26 ··· 89 * no slashes or null bytes. 90 */ 91 static __be32 * 92 - decode_filename(__be32 *p, char **namp, int *lenp) 93 { 94 char *name; 95 - int i; 96 97 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { 98 for (i = 0, name = *namp; i < *lenp; i++, name++) { ··· 453 nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, 454 struct nfsd3_symlinkargs *args) 455 { 456 - unsigned int len; 457 - int avail; 458 char *old, *new; 459 struct kvec *vec; 460 ··· 486 /* now copy next page if there is one */ 487 if (len && !avail && rqstp->rq_arg.page_len) { 488 avail = rqstp->rq_arg.page_len; 489 - if (avail > PAGE_SIZE) avail = PAGE_SIZE; 490 old = page_address(rqstp->rq_arg.pages[0]); 491 } 492 while (len && avail && *old) { ··· 817 encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, 818 struct svc_fh *fhp) 819 { 820 - p = encode_post_op_attr(cd->rqstp, p, fhp); 821 - *p++ = xdr_one; /* yes, a file handle follows */ 822 - p = encode_fh(p, fhp); 823 - fh_put(fhp); 824 - return p; 825 } 826 827 static int
··· 21 #include <linux/sunrpc/svc.h> 22 #include <linux/nfsd/nfsd.h> 23 #include <linux/nfsd/xdr3.h> 24 + #include "auth.h" 25 26 #define NFSDDBG_FACILITY NFSDDBG_XDR 27 ··· 88 * no slashes or null bytes. 89 */ 90 static __be32 * 91 + decode_filename(__be32 *p, char **namp, unsigned int *lenp) 92 { 93 char *name; 94 + unsigned int i; 95 96 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { 97 for (i = 0, name = *namp; i < *lenp; i++, name++) { ··· 452 nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, 453 struct nfsd3_symlinkargs *args) 454 { 455 + unsigned int len, avail; 456 char *old, *new; 457 struct kvec *vec; 458 ··· 486 /* now copy next page if there is one */ 487 if (len && !avail && rqstp->rq_arg.page_len) { 488 avail = rqstp->rq_arg.page_len; 489 + if (avail > PAGE_SIZE) 490 + avail = PAGE_SIZE; 491 old = page_address(rqstp->rq_arg.pages[0]); 492 } 493 while (len && avail && *old) { ··· 816 encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, 817 struct svc_fh *fhp) 818 { 819 + p = encode_post_op_attr(cd->rqstp, p, fhp); 820 + *p++ = xdr_one; /* yes, a file handle follows */ 821 + p = encode_fh(p, fhp); 822 + fh_put(fhp); 823 + return p; 824 } 825 826 static int
+44 -48
fs/nfsd/nfs4callback.c
··· 350 static int do_probe_callback(void *data) 351 { 352 struct nfs4_client *clp = data; 353 - struct nfs4_callback *cb = &clp->cl_callback; 354 - struct rpc_message msg = { 355 - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 356 - .rpc_argp = clp, 357 - }; 358 - int status; 359 - 360 - status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT); 361 - 362 - if (status) { 363 - rpc_shutdown_client(cb->cb_client); 364 - cb->cb_client = NULL; 365 - } else 366 - atomic_set(&cb->cb_set, 1); 367 - put_nfs4_client(clp); 368 - return 0; 369 - } 370 - 371 - /* 372 - * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 373 - */ 374 - void 375 - nfsd4_probe_callback(struct nfs4_client *clp) 376 - { 377 struct sockaddr_in addr; 378 struct nfs4_callback *cb = &clp->cl_callback; 379 struct rpc_timeout timeparms = { ··· 366 .timeout = &timeparms, 367 .program = program, 368 .version = nfs_cb_version[1]->number, 369 - .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 370 .flags = (RPC_CLNT_CREATE_NOPING), 371 }; 372 - struct task_struct *t; 373 - 374 - if (atomic_read(&cb->cb_set)) 375 - return; 376 377 /* Initialize address */ 378 memset(&addr, 0, sizeof(addr)); ··· 394 program->stats->program = program; 395 396 /* Create RPC client */ 397 - cb->cb_client = rpc_create(&args); 398 - if (IS_ERR(cb->cb_client)) { 399 dprintk("NFSD: couldn't create callback client\n"); 400 goto out_err; 401 } 402 403 /* the task holds a reference to the nfs4_client struct */ 404 atomic_inc(&clp->cl_count); ··· 435 t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe"); 436 437 if (IS_ERR(t)) 438 - goto out_release_clp; 439 440 return; 441 - 442 - out_release_clp: 443 - atomic_dec(&clp->cl_count); 444 - rpc_shutdown_client(cb->cb_client); 445 - out_err: 446 - cb->cb_client = NULL; 447 - dprintk("NFSD: warning: no callback path to client %.*s\n", 448 - (int)clp->cl_name.len, clp->cl_name.data); 449 } 450 451 /* ··· 457 int retries = 1; 458 int status = 0; 459 460 - if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt) 461 - return; 462 - 463 cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ 464 cbr->cbr_dp = dp; 465 ··· 465 switch (status) { 466 case -EIO: 467 /* Network partition? */ 468 case -EBADHANDLE: 469 case -NFS4ERR_BAD_STATEID: 470 /* Race: client probably got cb_recall ··· 478 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); 479 } 480 out_put_cred: 481 - if (status == -EIO) 482 - atomic_set(&clp->cl_callback.cb_set, 0); 483 - /* Success or failure, now we're either waiting for lease expiration 484 - * or deleg_return. */ 485 - dprintk("NFSD: nfs4_cb_recall: dp %p dl_flock %p dl_count %d\n",dp, dp->dl_flock, atomic_read(&dp->dl_count)); 486 put_nfs4_client(clp); 487 nfs4_put_delegation(dp); 488 return;
··· 350 static int do_probe_callback(void *data) 351 { 352 struct nfs4_client *clp = data; 353 struct sockaddr_in addr; 354 struct nfs4_callback *cb = &clp->cl_callback; 355 struct rpc_timeout timeparms = { ··· 390 .timeout = &timeparms, 391 .program = program, 392 .version = nfs_cb_version[1]->number, 393 + .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ 394 .flags = (RPC_CLNT_CREATE_NOPING), 395 }; 396 + struct rpc_message msg = { 397 + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 398 + .rpc_argp = clp, 399 + }; 400 + struct rpc_clnt *client; 401 + int status; 402 403 /* Initialize address */ 404 memset(&addr, 0, sizeof(addr)); ··· 416 program->stats->program = program; 417 418 /* Create RPC client */ 419 + client = rpc_create(&args); 420 + if (IS_ERR(client)) { 421 dprintk("NFSD: couldn't create callback client\n"); 422 + status = PTR_ERR(client); 423 goto out_err; 424 } 425 + 426 + status = rpc_call_sync(client, &msg, RPC_TASK_SOFT); 427 + 428 + if (status) 429 + goto out_release_client; 430 + 431 + cb->cb_client = client; 432 + atomic_set(&cb->cb_set, 1); 433 + put_nfs4_client(clp); 434 + return 0; 435 + out_release_client: 436 + rpc_shutdown_client(client); 437 + out_err: 438 + put_nfs4_client(clp); 439 + dprintk("NFSD: warning: no callback path to client %.*s\n", 440 + (int)clp->cl_name.len, clp->cl_name.data); 441 + return status; 442 + } 443 + 444 + /* 445 + * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... 446 + */ 447 + void 448 + nfsd4_probe_callback(struct nfs4_client *clp) 449 + { 450 + struct task_struct *t; 451 + 452 + BUG_ON(atomic_read(&clp->cl_callback.cb_set)); 453 454 /* the task holds a reference to the nfs4_client struct */ 455 atomic_inc(&clp->cl_count); ··· 428 t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe"); 429 430 if (IS_ERR(t)) 431 + atomic_dec(&clp->cl_count); 432 433 return; 434 } 435 436 /* ··· 458 int retries = 1; 459 int status = 0; 460 461 cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ 462 cbr->cbr_dp = dp; 463 ··· 469 switch (status) { 470 case -EIO: 471 /* Network partition? */ 472 + atomic_set(&clp->cl_callback.cb_set, 0); 473 case -EBADHANDLE: 474 case -NFS4ERR_BAD_STATEID: 475 /* Race: client probably got cb_recall ··· 481 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); 482 } 483 out_put_cred: 484 + /* 485 + * Success or failure, now we're either waiting for lease expiration 486 + * or deleg_return. 487 + */ 488 put_nfs4_client(clp); 489 nfs4_put_delegation(dp); 490 return;
+15 -13
fs/nfsd/nfs4idmap.c
··· 255 goto out; 256 if (len == 0) 257 set_bit(CACHE_NEGATIVE, &ent.h.flags); 258 - else { 259 - if (error >= IDMAP_NAMESZ) { 260 - error = -EINVAL; 261 - goto out; 262 - } 263 memcpy(ent.name, buf1, sizeof(ent.name)); 264 - } 265 error = -ENOMEM; 266 res = idtoname_update(&ent, res); 267 if (res == NULL) ··· 464 * Exported API 465 */ 466 467 - void 468 nfsd_idmap_init(void) 469 { 470 - cache_register(&idtoname_cache); 471 - cache_register(&nametoid_cache); 472 } 473 474 void 475 nfsd_idmap_shutdown(void) 476 { 477 - if (cache_unregister(&idtoname_cache)) 478 - printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n"); 479 - if (cache_unregister(&nametoid_cache)) 480 - printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n"); 481 } 482 483 /*
··· 255 goto out; 256 if (len == 0) 257 set_bit(CACHE_NEGATIVE, &ent.h.flags); 258 + else if (len >= IDMAP_NAMESZ) 259 + goto out; 260 + else 261 memcpy(ent.name, buf1, sizeof(ent.name)); 262 error = -ENOMEM; 263 res = idtoname_update(&ent, res); 264 if (res == NULL) ··· 467 * Exported API 468 */ 469 470 + int 471 nfsd_idmap_init(void) 472 { 473 + int rv; 474 + 475 + rv = cache_register(&idtoname_cache); 476 + if (rv) 477 + return rv; 478 + rv = cache_register(&nametoid_cache); 479 + if (rv) 480 + cache_unregister(&idtoname_cache); 481 + return rv; 482 } 483 484 void 485 nfsd_idmap_shutdown(void) 486 { 487 + cache_unregister(&idtoname_cache); 488 + cache_unregister(&nametoid_cache); 489 } 490 491 /*
+1 -1
fs/nfsd/nfs4proc.c
··· 750 cstate->current_fh.fh_export, 751 cstate->current_fh.fh_dentry, buf, 752 &count, verify->ve_bmval, 753 - rqstp); 754 755 /* this means that nfsd4_encode_fattr() ran out of space */ 756 if (status == nfserr_resource && count == 0)
··· 750 cstate->current_fh.fh_export, 751 cstate->current_fh.fh_dentry, buf, 752 &count, verify->ve_bmval, 753 + rqstp, 0); 754 755 /* this means that nfsd4_encode_fattr() ran out of space */ 756 if (status == nfserr_resource && count == 0)
+96 -161
fs/nfsd/nfs4state.c
··· 61 static time_t user_lease_time = 90; 62 static time_t boot_time; 63 static int in_grace = 1; 64 - static u32 current_clientid = 1; 65 static u32 current_ownerid = 1; 66 static u32 current_fileid = 1; 67 static u32 current_delegid = 1; ··· 339 * This type of memory management is somewhat inefficient, but we use it 340 * anyway since SETCLIENTID is not a common operation. 341 */ 342 - static inline struct nfs4_client * 343 - alloc_client(struct xdr_netobj name) 344 { 345 struct nfs4_client *clp; 346 347 - if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) { 348 - if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) { 349 - memcpy(clp->cl_name.data, name.data, name.len); 350 - clp->cl_name.len = name.len; 351 - } 352 - else { 353 - kfree(clp); 354 - clp = NULL; 355 - } 356 } 357 return clp; 358 } 359 ··· 361 { 362 struct rpc_clnt *clnt = clp->cl_callback.cb_client; 363 364 - /* shutdown rpc client, ending any outstanding recall rpcs */ 365 if (clnt) { 366 clp->cl_callback.cb_client = NULL; 367 rpc_shutdown_client(clnt); 368 } ··· 423 put_nfs4_client(clp); 424 } 425 426 - static struct nfs4_client * 427 - create_client(struct xdr_netobj name, char *recdir) { 428 struct nfs4_client *clp; 429 430 - if (!(clp = alloc_client(name))) 431 - goto out; 432 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 433 atomic_set(&clp->cl_count, 1); 434 atomic_set(&clp->cl_callback.cb_set, 0); ··· 438 INIT_LIST_HEAD(&clp->cl_openowners); 439 INIT_LIST_HEAD(&clp->cl_delegations); 440 INIT_LIST_HEAD(&clp->cl_lru); 441 - out: 442 return clp; 443 } 444 445 - static void 446 - copy_verf(struct nfs4_client *target, nfs4_verifier *source) { 447 - memcpy(target->cl_verifier.data, source->data, sizeof(target->cl_verifier.data)); 448 } 449 450 - static void 451 - copy_clid(struct nfs4_client *target, struct nfs4_client *source) { 452 target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; 453 target->cl_clientid.cl_id = source->cl_clientid.cl_id; 454 } 455 456 - static void 457 - copy_cred(struct svc_cred *target, struct svc_cred *source) { 458 - 459 target->cr_uid = source->cr_uid; 460 target->cr_gid = source->cr_gid; 461 target->cr_group_info = source->cr_group_info; 462 get_group_info(target->cr_group_info); 463 } 464 465 - static inline int 466 - same_name(const char *n1, const char *n2) 467 { 468 return 0 == memcmp(n1, n2, HEXDIR_LEN); 469 } ··· 485 return cr1->cr_uid == cr2->cr_uid; 486 } 487 488 - static void 489 - gen_clid(struct nfs4_client *clp) { 490 clp->cl_clientid.cl_boot = boot_time; 491 clp->cl_clientid.cl_id = current_clientid++; 492 } 493 494 - static void 495 - gen_confirm(struct nfs4_client *clp) { 496 - struct timespec tv; 497 - u32 * p; 498 499 - tv = CURRENT_TIME; 500 p = (u32 *)clp->cl_confirm.data; 501 - *p++ = tv.tv_sec; 502 - *p++ = tv.tv_nsec; 503 } 504 505 - static int 506 - check_name(struct xdr_netobj name) { 507 - 508 if (name.len == 0) 509 return 0; 510 if (name.len > NFS4_OPAQUE_LIMIT) { ··· 683 return; 684 } 685 686 - /* 687 - * RFC 3010 has a complex implmentation description of processing a 688 - * SETCLIENTID request consisting of 5 bullets, labeled as 689 - * CASE0 - CASE4 below. 690 - * 691 - * NOTES: 692 - * callback information will be processed in a future patch 693 - * 694 - * an unconfirmed record is added when: 695 - * NORMAL (part of CASE 4): there is no confirmed nor unconfirmed record. 696 - * CASE 1: confirmed record found with matching name, principal, 697 - * verifier, and clientid. 
698 - * CASE 2: confirmed record found with matching name, principal, 699 - * and there is no unconfirmed record with matching 700 - * name and principal 701 - * 702 - * an unconfirmed record is replaced when: 703 - * CASE 3: confirmed record found with matching name, principal, 704 - * and an unconfirmed record is found with matching 705 - * name, principal, and with clientid and 706 - * confirm that does not match the confirmed record. 707 - * CASE 4: there is no confirmed record with matching name and 708 - * principal. there is an unconfirmed record with 709 - * matching name, principal. 710 - * 711 - * an unconfirmed record is deleted when: 712 - * CASE 1: an unconfirmed record that matches input name, verifier, 713 - * and confirmed clientid. 714 - * CASE 4: any unconfirmed records with matching name and principal 715 - * that exist after an unconfirmed record has been replaced 716 - * as described above. 717 - * 718 - */ 719 __be32 720 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 721 struct nfsd4_setclientid *setclid) ··· 715 nfs4_lock_state(); 716 conf = find_confirmed_client_by_str(dname, strhashval); 717 if (conf) { 718 - /* 719 - * CASE 0: 720 - * clname match, confirmed, different principal 721 - * or different ip_address 722 - */ 723 status = nfserr_clid_inuse; 724 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) 725 || conf->cl_addr != sin->sin_addr.s_addr) { ··· 724 goto out; 725 } 726 } 727 unconf = find_unconfirmed_client_by_str(dname, strhashval); 728 status = nfserr_resource; 729 if (!conf) { 730 - /* 731 - * CASE 4: 732 - * placed first, because it is the normal case. 733 */ 734 if (unconf) 735 expire_client(unconf); ··· 744 gen_clid(new); 745 } else if (same_verf(&conf->cl_verifier, &clverifier)) { 746 /* 747 - * CASE 1: 748 - * cl_name match, confirmed, principal match 749 - * verifier match: probable callback update 750 - * 751 - * remove any unconfirmed nfs4_client with 752 - * matching cl_name, cl_verifier, and cl_clientid 753 - * 754 - * create and insert an unconfirmed nfs4_client with same 755 - * cl_name, cl_verifier, and cl_clientid as existing 756 - * nfs4_client, but with the new callback info and a 757 - * new cl_confirm 758 */ 759 if (unconf) { 760 /* Note this is removing unconfirmed {*x***}, ··· 761 copy_clid(new, conf); 762 } else if (!unconf) { 763 /* 764 - * CASE 2: 765 - * clname match, confirmed, principal match 766 - * verfier does not match 767 - * no unconfirmed. create a new unconfirmed nfs4_client 768 - * using input clverifier, clname, and callback info 769 - * and generate a new cl_clientid and cl_confirm. 770 */ 771 new = create_client(clname, dname); 772 if (new == NULL) 773 goto out; 774 gen_clid(new); 775 - } else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) { 776 - /* 777 - * CASE3: 778 - * confirmed found (name, principal match) 779 - * confirmed verifier does not match input clverifier 780 - * 781 - * unconfirmed found (name match) 782 - * confirmed->cl_confirm != unconfirmed->cl_confirm 783 - * 784 - * remove unconfirmed. 785 - * 786 - * create an unconfirmed nfs4_client 787 - * with same cl_name as existing confirmed nfs4_client, 788 - * but with new callback info, new cl_clientid, 789 - * new cl_verifier and a new cl_confirm 790 */ 791 expire_client(unconf); 792 new = create_client(clname, dname); 793 if (new == NULL) 794 goto out; 795 gen_clid(new); 796 - } else { 797 - /* No cases hit !!! 
*/ 798 - status = nfserr_inval; 799 - goto out; 800 - 801 } 802 copy_verf(new, &clverifier); 803 new->cl_addr = sin->sin_addr.s_addr; ··· 798 799 800 /* 801 - * RFC 3010 has a complex implmentation description of processing a 802 - * SETCLIENTID_CONFIRM request consisting of 4 bullets describing 803 - * processing on a DRC miss, labeled as CASE1 - CASE4 below. 804 - * 805 - * NOTE: callback information will be processed here in a future patch 806 */ 807 __be32 808 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, ··· 831 if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) 832 goto out; 833 834 - if ((conf && unconf) && 835 - (same_verf(&unconf->cl_confirm, &confirm)) && 836 - (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) && 837 - (same_name(conf->cl_recdir,unconf->cl_recdir)) && 838 - (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) { 839 - /* CASE 1: 840 - * unconf record that matches input clientid and input confirm. 841 - * conf record that matches input clientid. 842 - * conf and unconf records match names, verifiers 843 - */ 844 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 845 status = nfserr_clid_inuse; 846 else { ··· 853 status = nfs_ok; 854 855 } 856 - } else if ((conf && !unconf) || 857 - ((conf && unconf) && 858 - (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) || 859 - !same_name(conf->cl_recdir, unconf->cl_recdir)))) { 860 - /* CASE 2: 861 - * conf record that matches input clientid. 862 - * if unconf record matches input clientid, then 863 - * unconf->cl_name or unconf->cl_verifier don't match the 864 - * conf record. 865 */ 866 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) 867 status = nfserr_clid_inuse; ··· 865 status = nfs_ok; 866 } else if (!conf && unconf 867 && same_verf(&unconf->cl_confirm, &confirm)) { 868 - /* CASE 3: 869 - * conf record not found. 870 - * unconf record found. 871 - * unconf->cl_confirm matches input confirm 872 */ 873 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { 874 status = nfserr_clid_inuse; ··· 882 } 883 move_to_confirmed(unconf); 884 conf = unconf; 885 status = nfs_ok; 886 } 887 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) 888 && (!unconf || (unconf && !same_verf(&unconf->cl_confirm, 889 &confirm)))) { 890 - /* CASE 4: 891 - * conf record not found, or if conf, conf->cl_confirm does not 892 - * match input confirm. 893 - * unconf record not found, or if unconf, unconf->cl_confirm 894 - * does not match input confirm. 895 */ 896 status = nfserr_stale_clientid; 897 } else { ··· 898 status = nfserr_clid_inuse; 899 } 900 out: 901 - if (!status) 902 - nfsd4_probe_callback(conf); 903 nfs4_unlock_state(); 904 return status; 905 } ··· 1157 return NULL; 1158 } 1159 1160 - static int access_valid(u32 x) 1161 { 1162 - return (x > 0 && x < 4); 1163 } 1164 1165 - static int deny_valid(u32 x) 1166 { 1167 - return (x >= 0 && x < 5); 1168 } 1169 1170 static void ··· 2098 goto check_replay; 2099 } 2100 2101 if (lock) { 2102 - struct nfs4_stateowner *sop = stp->st_stateowner; 2103 clientid_t *lockclid = &lock->v.new.clientid; 2104 struct nfs4_client *clp = sop->so_client; 2105 int lkflg = 0; ··· 2130 dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); 2131 return nfserr_bad_stateid; 2132 } 2133 - 2134 - *stpp = stp; 2135 - *sopp = sop = stp->st_stateowner; 2136 2137 /* 2138 * We now validate the seqid and stateid generation numbers.
··· 61 static time_t user_lease_time = 90; 62 static time_t boot_time; 63 static int in_grace = 1; 64 static u32 current_ownerid = 1; 65 static u32 current_fileid = 1; 66 static u32 current_delegid = 1; ··· 340 * This type of memory management is somewhat inefficient, but we use it 341 * anyway since SETCLIENTID is not a common operation. 342 */ 343 + static struct nfs4_client *alloc_client(struct xdr_netobj name) 344 { 345 struct nfs4_client *clp; 346 347 + clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); 348 + if (clp == NULL) 349 + return NULL; 350 + clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); 351 + if (clp->cl_name.data == NULL) { 352 + kfree(clp); 353 + return NULL; 354 } 355 + memcpy(clp->cl_name.data, name.data, name.len); 356 + clp->cl_name.len = name.len; 357 return clp; 358 } 359 ··· 363 { 364 struct rpc_clnt *clnt = clp->cl_callback.cb_client; 365 366 if (clnt) { 367 + /* 368 + * Callback threads take a reference on the client, so there 369 + * should be no outstanding callbacks at this point. 370 + */ 371 clp->cl_callback.cb_client = NULL; 372 rpc_shutdown_client(clnt); 373 } ··· 422 put_nfs4_client(clp); 423 } 424 425 + static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir) 426 + { 427 struct nfs4_client *clp; 428 429 + clp = alloc_client(name); 430 + if (clp == NULL) 431 + return NULL; 432 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); 433 atomic_set(&clp->cl_count, 1); 434 atomic_set(&clp->cl_callback.cb_set, 0); ··· 436 INIT_LIST_HEAD(&clp->cl_openowners); 437 INIT_LIST_HEAD(&clp->cl_delegations); 438 INIT_LIST_HEAD(&clp->cl_lru); 439 return clp; 440 } 441 442 + static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 443 + { 444 + memcpy(target->cl_verifier.data, source->data, 445 + sizeof(target->cl_verifier.data)); 446 } 447 448 + static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) 449 + { 450 target->cl_clientid.cl_boot = source->cl_clientid.cl_boot; 451 target->cl_clientid.cl_id = source->cl_clientid.cl_id; 452 } 453 454 + static void copy_cred(struct svc_cred *target, struct svc_cred *source) 455 + { 456 target->cr_uid = source->cr_uid; 457 target->cr_gid = source->cr_gid; 458 target->cr_group_info = source->cr_group_info; 459 get_group_info(target->cr_group_info); 460 } 461 462 + static int same_name(const char *n1, const char *n2) 463 { 464 return 0 == memcmp(n1, n2, HEXDIR_LEN); 465 } ··· 485 return cr1->cr_uid == cr2->cr_uid; 486 } 487 488 + static void gen_clid(struct nfs4_client *clp) 489 + { 490 + static u32 current_clientid = 1; 491 + 492 clp->cl_clientid.cl_boot = boot_time; 493 clp->cl_clientid.cl_id = current_clientid++; 494 } 495 496 + static void gen_confirm(struct nfs4_client *clp) 497 + { 498 + static u32 i; 499 + u32 *p; 500 501 p = (u32 *)clp->cl_confirm.data; 502 + *p++ = get_seconds(); 503 + *p++ = i++; 504 } 505 506 + static int check_name(struct xdr_netobj name) 507 + { 508 if (name.len == 0) 509 return 0; 510 if (name.len > NFS4_OPAQUE_LIMIT) { ··· 683 return; 684 } 685 686 __be32 687 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 688 struct nfsd4_setclientid *setclid) ··· 748 nfs4_lock_state(); 749 conf = find_confirmed_client_by_str(dname, strhashval); 750 if (conf) { 751 + /* RFC 3530 14.2.33 CASE 0: */ 752 status = nfserr_clid_inuse; 753 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) 754 || conf->cl_addr != sin->sin_addr.s_addr) { ··· 761 goto out; 762 } 763 } 764 + /* 765 + * section 14.2.33 of RFC 3530 (under the heading 
"IMPLEMENTATION") 766 + * has a description of SETCLIENTID request processing consisting 767 + * of 5 bullet points, labeled as CASE0 - CASE4 below. 768 + */ 769 unconf = find_unconfirmed_client_by_str(dname, strhashval); 770 status = nfserr_resource; 771 if (!conf) { 772 + /* 773 + * RFC 3530 14.2.33 CASE 4: 774 + * placed first, because it is the normal case 775 */ 776 if (unconf) 777 expire_client(unconf); ··· 776 gen_clid(new); 777 } else if (same_verf(&conf->cl_verifier, &clverifier)) { 778 /* 779 + * RFC 3530 14.2.33 CASE 1: 780 + * probable callback update 781 */ 782 if (unconf) { 783 /* Note this is removing unconfirmed {*x***}, ··· 802 copy_clid(new, conf); 803 } else if (!unconf) { 804 /* 805 + * RFC 3530 14.2.33 CASE 2: 806 + * probable client reboot; state will be removed if 807 + * confirmed. 808 */ 809 new = create_client(clname, dname); 810 if (new == NULL) 811 goto out; 812 gen_clid(new); 813 + } else { 814 + /* 815 + * RFC 3530 14.2.33 CASE 3: 816 + * probable client reboot; state will be removed if 817 + * confirmed. 818 */ 819 expire_client(unconf); 820 new = create_client(clname, dname); 821 if (new == NULL) 822 goto out; 823 gen_clid(new); 824 } 825 copy_verf(new, &clverifier); 826 new->cl_addr = sin->sin_addr.s_addr; ··· 857 858 859 /* 860 + * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has 861 + * a description of SETCLIENTID_CONFIRM request processing consisting of 4 862 + * bullets, labeled as CASE1 - CASE4 below. 863 */ 864 __be32 865 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, ··· 892 if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) 893 goto out; 894 895 + /* 896 + * section 14.2.34 of RFC 3530 has a description of 897 + * SETCLIENTID_CONFIRM request processing consisting 898 + * of 4 bullet points, labeled as CASE1 - CASE4 below. 899 + */ 900 + if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) { 901 + /* 902 + * RFC 3530 14.2.34 CASE 1: 903 + * callback update 904 + */ 905 if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) 906 status = nfserr_clid_inuse; 907 else { ··· 914 status = nfs_ok; 915 916 } 917 + } else if (conf && !unconf) { 918 + /* 919 + * RFC 3530 14.2.34 CASE 2: 920 + * probable retransmitted request; play it safe and 921 + * do nothing. 922 */ 923 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) 924 status = nfserr_clid_inuse; ··· 930 status = nfs_ok; 931 } else if (!conf && unconf 932 && same_verf(&unconf->cl_confirm, &confirm)) { 933 + /* 934 + * RFC 3530 14.2.34 CASE 3: 935 + * Normal case; new or rebooted client: 936 */ 937 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { 938 status = nfserr_clid_inuse; ··· 948 } 949 move_to_confirmed(unconf); 950 conf = unconf; 951 + nfsd4_probe_callback(conf); 952 status = nfs_ok; 953 } 954 } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) 955 && (!unconf || (unconf && !same_verf(&unconf->cl_confirm, 956 &confirm)))) { 957 + /* 958 + * RFC 3530 14.2.34 CASE 4: 959 + * Client probably hasn't noticed that we rebooted yet. 960 */ 961 status = nfserr_stale_clientid; 962 } else { ··· 965 status = nfserr_clid_inuse; 966 } 967 out: 968 nfs4_unlock_state(); 969 return status; 970 } ··· 1226 return NULL; 1227 } 1228 1229 + static inline int access_valid(u32 x) 1230 { 1231 + if (x < NFS4_SHARE_ACCESS_READ) 1232 + return 0; 1233 + if (x > NFS4_SHARE_ACCESS_BOTH) 1234 + return 0; 1235 + return 1; 1236 } 1237 1238 + static inline int deny_valid(u32 x) 1239 { 1240 + /* Note: unlike access bits, deny bits may be zero. 
*/ 1241 + return x <= NFS4_SHARE_DENY_BOTH; 1242 } 1243 1244 static void ··· 2162 goto check_replay; 2163 } 2164 2165 + *stpp = stp; 2166 + *sopp = sop = stp->st_stateowner; 2167 + 2168 if (lock) { 2169 clientid_t *lockclid = &lock->v.new.clientid; 2170 struct nfs4_client *clp = sop->so_client; 2171 int lkflg = 0; ··· 2192 dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); 2193 return nfserr_bad_stateid; 2194 } 2195 2196 /* 2197 * We now validate the seqid and stateid generation numbers.
+29 -7
fs/nfsd/nfs4xdr.c
··· 148 } \ 149 } while (0) 150 151 - static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes) 152 { 153 /* We want more bytes than seem to be available. 154 * Maybe we need a new page, maybe we have just run out 155 */ 156 - int avail = (char*)argp->end - (char*)argp->p; 157 __be32 *p; 158 if (avail + argp->pagelen < nbytes) 159 return NULL; ··· 169 return NULL; 170 171 } 172 memcpy(p, argp->p, avail); 173 /* step to next page */ 174 argp->p = page_address(argp->pagelist[0]); ··· 1453 __be32 1454 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 1455 struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval, 1456 - struct svc_rqst *rqstp) 1457 { 1458 u32 bmval0 = bmval[0]; 1459 u32 bmval1 = bmval[1]; ··· 1833 if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { 1834 if ((buflen -= 8) < 0) 1835 goto out_resource; 1836 - if (exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) { 1837 err = vfs_getattr(exp->ex_mnt->mnt_parent, 1838 exp->ex_mnt->mnt_mountpoint, &stat); 1839 if (err) ··· 1874 struct svc_export *exp = cd->rd_fhp->fh_export; 1875 struct dentry *dentry; 1876 __be32 nfserr; 1877 1878 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); 1879 if (IS_ERR(dentry)) 1880 return nfserrno(PTR_ERR(dentry)); 1881 1882 exp_get(exp); 1883 - if (d_mountpoint(dentry)) { 1884 int err; 1885 1886 /* ··· 1911 1912 } 1913 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, 1914 - cd->rd_rqstp); 1915 out_put: 1916 dput(dentry); 1917 exp_put(exp); ··· 2065 buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); 2066 nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, 2067 resp->p, &buflen, getattr->ga_bmval, 2068 - resp->rqstp); 2069 if (!nfserr) 2070 resp->p += buflen; 2071 return nfserr;
··· 148 } \ 149 } while (0) 150 151 + static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) 152 { 153 /* We want more bytes than seem to be available. 154 * Maybe we need a new page, maybe we have just run out 155 */ 156 + unsigned int avail = (char *)argp->end - (char *)argp->p; 157 __be32 *p; 158 if (avail + argp->pagelen < nbytes) 159 return NULL; ··· 169 return NULL; 170 171 } 172 + /* 173 + * The following memcpy is safe because read_buf is always 174 + * called with nbytes > avail, and the two cases above both 175 + * guarantee p points to at least nbytes bytes. 176 + */ 177 memcpy(p, argp->p, avail); 178 /* step to next page */ 179 argp->p = page_address(argp->pagelist[0]); ··· 1448 __be32 1449 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 1450 struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval, 1451 + struct svc_rqst *rqstp, int ignore_crossmnt) 1452 { 1453 u32 bmval0 = bmval[0]; 1454 u32 bmval1 = bmval[1]; ··· 1828 if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { 1829 if ((buflen -= 8) < 0) 1830 goto out_resource; 1831 + /* 1832 + * Get parent's attributes if not ignoring crossmount 1833 + * and this is the root of a cross-mounted filesystem. 1834 + */ 1835 + if (ignore_crossmnt == 0 && 1836 + exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) { 1837 err = vfs_getattr(exp->ex_mnt->mnt_parent, 1838 exp->ex_mnt->mnt_mountpoint, &stat); 1839 if (err) ··· 1864 struct svc_export *exp = cd->rd_fhp->fh_export; 1865 struct dentry *dentry; 1866 __be32 nfserr; 1867 + int ignore_crossmnt = 0; 1868 1869 dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); 1870 if (IS_ERR(dentry)) 1871 return nfserrno(PTR_ERR(dentry)); 1872 1873 exp_get(exp); 1874 + /* 1875 + * In the case of a mountpoint, the client may be asking for 1876 + * attributes that are only properties of the underlying filesystem 1877 + * as opposed to the cross-mounted file system. In such a case, 1878 + * we will not follow the cross mount and will fill the attribtutes 1879 + * directly from the mountpoint dentry. 1880 + */ 1881 + if (d_mountpoint(dentry) && 1882 + (cd->rd_bmval[0] & ~FATTR4_WORD0_RDATTR_ERROR) == 0 && 1883 + (cd->rd_bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) == 0) 1884 + ignore_crossmnt = 1; 1885 + else if (d_mountpoint(dentry)) { 1886 int err; 1887 1888 /* ··· 1889 1890 } 1891 nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, 1892 + cd->rd_rqstp, ignore_crossmnt); 1893 out_put: 1894 dput(dentry); 1895 exp_put(exp); ··· 2043 buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); 2044 nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, 2045 resp->p, &buflen, getattr->ga_bmval, 2046 + resp->rqstp, 0); 2047 if (!nfserr) 2048 resp->p += buflen; 2049 return nfserr;
+12 -16
fs/nfsd/nfscache.c
··· 44 */ 45 static DEFINE_SPINLOCK(cache_lock); 46 47 - void 48 - nfsd_cache_init(void) 49 { 50 struct svc_cacherep *rp; 51 int i; 52 53 INIT_LIST_HEAD(&lru_head); 54 i = CACHESIZE; 55 - while(i) { 56 rp = kmalloc(sizeof(*rp), GFP_KERNEL); 57 - if (!rp) break; 58 list_add(&rp->c_lru, &lru_head); 59 rp->c_state = RC_UNUSED; 60 rp->c_type = RC_NOCACHE; ··· 62 i--; 63 } 64 65 - if (i) 66 - printk (KERN_ERR "nfsd: cannot allocate all %d cache entries, only got %d\n", 67 - CACHESIZE, CACHESIZE-i); 68 - 69 hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); 70 - if (!hash_list) { 71 - nfsd_cache_shutdown(); 72 - printk (KERN_ERR "nfsd: cannot allocate %Zd bytes for hash list\n", 73 - HASHSIZE * sizeof(struct hlist_head)); 74 - return; 75 - } 76 77 cache_disabled = 0; 78 } 79 80 - void 81 - nfsd_cache_shutdown(void) 82 { 83 struct svc_cacherep *rp; 84
··· 44 */ 45 static DEFINE_SPINLOCK(cache_lock); 46 47 + int nfsd_reply_cache_init(void) 48 { 49 struct svc_cacherep *rp; 50 int i; 51 52 INIT_LIST_HEAD(&lru_head); 53 i = CACHESIZE; 54 + while (i) { 55 rp = kmalloc(sizeof(*rp), GFP_KERNEL); 56 + if (!rp) 57 + goto out_nomem; 58 list_add(&rp->c_lru, &lru_head); 59 rp->c_state = RC_UNUSED; 60 rp->c_type = RC_NOCACHE; ··· 62 i--; 63 } 64 65 hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); 66 + if (!hash_list) 67 + goto out_nomem; 68 69 cache_disabled = 0; 70 + return 0; 71 + out_nomem: 72 + printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); 73 + nfsd_reply_cache_shutdown(); 74 + return -ENOMEM; 75 } 76 77 + void nfsd_reply_cache_shutdown(void) 78 { 79 struct svc_cacherep *rp; 80
+103 -21
fs/nfsd/nfsctl.c
··· 304 struct auth_domain *dom; 305 struct knfsd_fh fh; 306 307 if (buf[size-1] != '\n') 308 return -EINVAL; 309 buf[size-1] = 0; ··· 506 int len = 0; 507 lock_kernel(); 508 if (nfsd_serv) 509 - len = svc_sock_names(buf, nfsd_serv, NULL); 510 unlock_kernel(); 511 return len; 512 } ··· 543 } 544 return err < 0 ? err : 0; 545 } 546 - if (buf[0] == '-') { 547 char *toclose = kstrdup(buf+1, GFP_KERNEL); 548 int len = 0; 549 if (!toclose) ··· 556 lockd_down(); 557 kfree(toclose); 558 return len; 559 } 560 return -EINVAL; 561 } ··· 666 char *recdir; 667 int len, status; 668 669 - if (size > PATH_MAX || buf[size-1] != '\n') 670 return -EINVAL; 671 buf[size-1] = 0; 672 ··· 724 .kill_sb = kill_litter_super, 725 }; 726 727 static int __init init_nfsd(void) 728 { 729 int retval; ··· 754 if (retval) 755 return retval; 756 nfsd_stat_init(); /* Statistics */ 757 - nfsd_cache_init(); /* RPC reply cache */ 758 - nfsd_export_init(); /* Exports table */ 759 nfsd_lockd_init(); /* lockd->nfsd callbacks */ 760 - nfsd_idmap_init(); /* Name to ID mapping */ 761 - if (proc_mkdir("fs/nfs", NULL)) { 762 - struct proc_dir_entry *entry; 763 - entry = create_proc_entry("fs/nfs/exports", 0, NULL); 764 - if (entry) 765 - entry->proc_fops = &exports_operations; 766 - } 767 retval = register_filesystem(&nfsd_fs_type); 768 - if (retval) { 769 - nfsd_export_shutdown(); 770 - nfsd_cache_shutdown(); 771 - remove_proc_entry("fs/nfs/exports", NULL); 772 - remove_proc_entry("fs/nfs", NULL); 773 - nfsd_stat_shutdown(); 774 - nfsd_lockd_shutdown(); 775 - } 776 return retval; 777 } 778 779 static void __exit exit_nfsd(void) 780 { 781 nfsd_export_shutdown(); 782 - nfsd_cache_shutdown(); 783 remove_proc_entry("fs/nfs/exports", NULL); 784 remove_proc_entry("fs/nfs", NULL); 785 nfsd_stat_shutdown();
··· 304 struct auth_domain *dom; 305 struct knfsd_fh fh; 306 307 + if (size == 0) 308 + return -EINVAL; 309 + 310 if (buf[size-1] != '\n') 311 return -EINVAL; 312 buf[size-1] = 0; ··· 503 int len = 0; 504 lock_kernel(); 505 if (nfsd_serv) 506 + len = svc_xprt_names(nfsd_serv, buf, 0); 507 unlock_kernel(); 508 return len; 509 } ··· 540 } 541 return err < 0 ? err : 0; 542 } 543 + if (buf[0] == '-' && isdigit(buf[1])) { 544 char *toclose = kstrdup(buf+1, GFP_KERNEL); 545 int len = 0; 546 if (!toclose) ··· 553 lockd_down(); 554 kfree(toclose); 555 return len; 556 + } 557 + /* 558 + * Add a transport listener by writing it's transport name 559 + */ 560 + if (isalpha(buf[0])) { 561 + int err; 562 + char transport[16]; 563 + int port; 564 + if (sscanf(buf, "%15s %4d", transport, &port) == 2) { 565 + err = nfsd_create_serv(); 566 + if (!err) { 567 + err = svc_create_xprt(nfsd_serv, 568 + transport, port, 569 + SVC_SOCK_ANONYMOUS); 570 + if (err == -ENOENT) 571 + /* Give a reasonable perror msg for 572 + * bad transport string */ 573 + err = -EPROTONOSUPPORT; 574 + } 575 + return err < 0 ? err : 0; 576 + } 577 + } 578 + /* 579 + * Remove a transport by writing it's transport name and port number 580 + */ 581 + if (buf[0] == '-' && isalpha(buf[1])) { 582 + struct svc_xprt *xprt; 583 + int err = -EINVAL; 584 + char transport[16]; 585 + int port; 586 + if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { 587 + if (port == 0) 588 + return -EINVAL; 589 + lock_kernel(); 590 + if (nfsd_serv) { 591 + xprt = svc_find_xprt(nfsd_serv, transport, 592 + AF_UNSPEC, port); 593 + if (xprt) { 594 + svc_close_xprt(xprt); 595 + svc_xprt_put(xprt); 596 + err = 0; 597 + } else 598 + err = -ENOTCONN; 599 + } 600 + unlock_kernel(); 601 + return err < 0 ? err : 0; 602 + } 603 } 604 return -EINVAL; 605 } ··· 616 char *recdir; 617 int len, status; 618 619 + if (size == 0 || size > PATH_MAX || buf[size-1] != '\n') 620 return -EINVAL; 621 buf[size-1] = 0; 622 ··· 674 .kill_sb = kill_litter_super, 675 }; 676 677 + #ifdef CONFIG_PROC_FS 678 + static int create_proc_exports_entry(void) 679 + { 680 + struct proc_dir_entry *entry; 681 + 682 + entry = proc_mkdir("fs/nfs", NULL); 683 + if (!entry) 684 + return -ENOMEM; 685 + entry = create_proc_entry("fs/nfs/exports", 0, NULL); 686 + if (!entry) 687 + return -ENOMEM; 688 + entry->proc_fops = &exports_operations; 689 + return 0; 690 + } 691 + #else /* CONFIG_PROC_FS */ 692 + static int create_proc_exports_entry(void) 693 + { 694 + return 0; 695 + } 696 + #endif 697 + 698 static int __init init_nfsd(void) 699 { 700 int retval; ··· 683 if (retval) 684 return retval; 685 nfsd_stat_init(); /* Statistics */ 686 + retval = nfsd_reply_cache_init(); 687 + if (retval) 688 + goto out_free_stat; 689 + retval = nfsd_export_init(); 690 + if (retval) 691 + goto out_free_cache; 692 nfsd_lockd_init(); /* lockd->nfsd callbacks */ 693 + retval = nfsd_idmap_init(); 694 + if (retval) 695 + goto out_free_lockd; 696 + retval = create_proc_exports_entry(); 697 + if (retval) 698 + goto out_free_idmap; 699 retval = register_filesystem(&nfsd_fs_type); 700 + if (retval) 701 + goto out_free_all; 702 + return 0; 703 + out_free_all: 704 + remove_proc_entry("fs/nfs/exports", NULL); 705 + remove_proc_entry("fs/nfs", NULL); 706 + out_free_idmap: 707 + nfsd_idmap_shutdown(); 708 + out_free_lockd: 709 + nfsd_lockd_shutdown(); 710 + nfsd_export_shutdown(); 711 + out_free_cache: 712 + nfsd_reply_cache_shutdown(); 713 + out_free_stat: 714 + nfsd_stat_shutdown(); 715 + nfsd4_free_slabs(); 716 return retval; 717 } 718 
719 static void __exit exit_nfsd(void) 720 { 721 nfsd_export_shutdown(); 722 + nfsd_reply_cache_shutdown(); 723 remove_proc_entry("fs/nfs/exports", NULL); 724 remove_proc_entry("fs/nfs", NULL); 725 nfsd_stat_shutdown();
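The write handler above accepts either "<name> <port>" to create a listener for a named transport (unknown transport names are mapped to -EPROTONOSUPPORT) or "-<name> <port>" to look an existing transport up and close it. A minimal user-space sketch of exercising that interface, assuming the control file is exposed as /proc/fs/nfsd/portlist (the path is an assumption, not shown in the hunk above):

/* Minimal sketch, assuming the portlist control-file path below; the
 * writes mirror the "<name> <port>" / "-<name> <port>" formats parsed
 * by the handler above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/fs/nfsd/portlist", O_WRONLY); /* assumed path */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "tcp 2049\n", 9) < 0)     /* add a TCP listener */
                perror("add listener");
        if (write(fd, "-tcp 2049\n", 10) < 0)   /* find and close it again */
                perror("remove listener");
        close(fd);
        return 0;
}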
+1
fs/nfsd/nfsfh.c
··· 22 #include <linux/sunrpc/svc.h> 23 #include <linux/sunrpc/svcauth_gss.h> 24 #include <linux/nfsd/nfsd.h> 25 26 #define NFSDDBG_FACILITY NFSDDBG_FH 27
··· 22 #include <linux/sunrpc/svc.h> 23 #include <linux/sunrpc/svcauth_gss.h> 24 #include <linux/nfsd/nfsd.h> 25 + #include "auth.h" 26 27 #define NFSDDBG_FACILITY NFSDDBG_FH 28
+4 -4
fs/nfsd/nfssvc.c
··· 155 static void nfsd_last_thread(struct svc_serv *serv) 156 { 157 /* When last nfsd thread exits we need to do some clean-up */ 158 - struct svc_sock *svsk; 159 - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) 160 lockd_down(); 161 nfsd_serv = NULL; 162 nfsd_racache_shutdown(); ··· 236 237 error = lockd_up(IPPROTO_UDP); 238 if (error >= 0) { 239 - error = svc_makesock(nfsd_serv, IPPROTO_UDP, port, 240 SVC_SOCK_DEFAULTS); 241 if (error < 0) 242 lockd_down(); ··· 247 #ifdef CONFIG_NFSD_TCP 248 error = lockd_up(IPPROTO_TCP); 249 if (error >= 0) { 250 - error = svc_makesock(nfsd_serv, IPPROTO_TCP, port, 251 SVC_SOCK_DEFAULTS); 252 if (error < 0) 253 lockd_down();
··· 155 static void nfsd_last_thread(struct svc_serv *serv) 156 { 157 /* When last nfsd thread exits we need to do some clean-up */ 158 + struct svc_xprt *xprt; 159 + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) 160 lockd_down(); 161 nfsd_serv = NULL; 162 nfsd_racache_shutdown(); ··· 236 237 error = lockd_up(IPPROTO_UDP); 238 if (error >= 0) { 239 + error = svc_create_xprt(nfsd_serv, "udp", port, 240 SVC_SOCK_DEFAULTS); 241 if (error < 0) 242 lockd_down(); ··· 247 #ifdef CONFIG_NFSD_TCP 248 error = lockd_up(IPPROTO_TCP); 249 if (error >= 0) { 250 + error = svc_create_xprt(nfsd_serv, "tcp", port, 251 SVC_SOCK_DEFAULTS); 252 if (error < 0) 253 lockd_down();
+5 -4
fs/nfsd/nfsxdr.c
··· 15 #include <linux/nfsd/nfsd.h> 16 #include <linux/nfsd/xdr.h> 17 #include <linux/mm.h> 18 19 #define NFSDDBG_FACILITY NFSDDBG_XDR 20 ··· 63 * no slashes or null bytes. 64 */ 65 static __be32 * 66 - decode_filename(__be32 *p, char **namp, int *lenp) 67 { 68 char *name; 69 - int i; 70 71 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { 72 for (i = 0, name = *namp; i < *lenp; i++, name++) { ··· 79 } 80 81 static __be32 * 82 - decode_pathname(__be32 *p, char **namp, int *lenp) 83 { 84 char *name; 85 - int i; 86 87 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { 88 for (i = 0, name = *namp; i < *lenp; i++, name++) {
··· 15 #include <linux/nfsd/nfsd.h> 16 #include <linux/nfsd/xdr.h> 17 #include <linux/mm.h> 18 + #include "auth.h" 19 20 #define NFSDDBG_FACILITY NFSDDBG_XDR 21 ··· 62 * no slashes or null bytes. 63 */ 64 static __be32 * 65 + decode_filename(__be32 *p, char **namp, unsigned int *lenp) 66 { 67 char *name; 68 + unsigned int i; 69 70 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { 71 for (i = 0, name = *namp; i < *lenp; i++, name++) { ··· 78 } 79 80 static __be32 * 81 + decode_pathname(__be32 *p, char **namp, unsigned int *lenp) 82 { 83 char *name; 84 + unsigned int i; 85 86 if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { 87 for (i = 0, name = *namp; i < *lenp; i++, name++) {
+30 -21
fs/nfsd/vfs.c
··· 132 133 __be32 134 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, 135 - const char *name, int len, 136 struct svc_export **exp_ret, struct dentry **dentry_ret) 137 { 138 struct svc_export *exp; ··· 226 */ 227 __be32 228 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, 229 - int len, struct svc_fh *resfh) 230 { 231 struct svc_export *exp; 232 struct dentry *dentry; ··· 1151 } 1152 #endif /* CONFIG_NFSD_V3 */ 1153 1154 /* 1155 * Create a file (regular, directory, device, fifo); UNIX sockets 1156 * not yet implemented. ··· 1187 struct dentry *dentry, *dchild = NULL; 1188 struct inode *dirp; 1189 __be32 err; 1190 int host_err; 1191 1192 err = nfserr_perm; ··· 1278 } 1279 1280 1281 - /* Set file attributes. Mode has already been set and 1282 - * setting uid/gid works only for root. Irix appears to 1283 - * send along the gid when it tries to implement setgid 1284 - * directories via NFS. 1285 - */ 1286 - if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { 1287 - __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1288 - if (err2) 1289 - err = err2; 1290 - } 1291 /* 1292 * Update the file handle to get the new inode info. 1293 */ ··· 1309 struct dentry *dentry, *dchild = NULL; 1310 struct inode *dirp; 1311 __be32 err; 1312 int host_err; 1313 __u32 v_mtime=0, v_atime=0; 1314 ··· 1414 iap->ia_atime.tv_nsec = 0; 1415 } 1416 1417 - /* Set file attributes. 1418 - * Irix appears to send along the gid when it tries to 1419 - * implement setgid directories via NFS. Clear out all that cruft. 1420 - */ 1421 set_attr: 1422 - if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { 1423 - __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1424 - if (err2) 1425 - err = err2; 1426 - } 1427 1428 /* 1429 * Update the filehandle to get the new inode info.
··· 132 133 __be32 134 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, 135 + const char *name, unsigned int len, 136 struct svc_export **exp_ret, struct dentry **dentry_ret) 137 { 138 struct svc_export *exp; ··· 226 */ 227 __be32 228 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, 229 + unsigned int len, struct svc_fh *resfh) 230 { 231 struct svc_export *exp; 232 struct dentry *dentry; ··· 1151 } 1152 #endif /* CONFIG_NFSD_V3 */ 1153 1154 + __be32 1155 + nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, 1156 + struct iattr *iap) 1157 + { 1158 + /* 1159 + * Mode has already been set earlier in create: 1160 + */ 1161 + iap->ia_valid &= ~ATTR_MODE; 1162 + /* 1163 + * Setting uid/gid works only for root. Irix appears to 1164 + * send along the gid on create when it tries to implement 1165 + * setgid directories via NFS: 1166 + */ 1167 + if (current->fsuid != 0) 1168 + iap->ia_valid &= ~(ATTR_UID|ATTR_GID); 1169 + if (iap->ia_valid) 1170 + return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1171 + return 0; 1172 + } 1173 + 1174 /* 1175 * Create a file (regular, directory, device, fifo); UNIX sockets 1176 * not yet implemented. ··· 1167 struct dentry *dentry, *dchild = NULL; 1168 struct inode *dirp; 1169 __be32 err; 1170 + __be32 err2; 1171 int host_err; 1172 1173 err = nfserr_perm; ··· 1257 } 1258 1259 1260 + err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1261 + if (err2) 1262 + err = err2; 1263 /* 1264 * Update the file handle to get the new inode info. 1265 */ ··· 1295 struct dentry *dentry, *dchild = NULL; 1296 struct inode *dirp; 1297 __be32 err; 1298 + __be32 err2; 1299 int host_err; 1300 __u32 v_mtime=0, v_atime=0; 1301 ··· 1399 iap->ia_atime.tv_nsec = 0; 1400 } 1401 1402 set_attr: 1403 + err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1404 + if (err2) 1405 + err = err2; 1406 1407 /* 1408 * Update the filehandle to get the new inode info.
+6 -3
include/linux/lockd/lockd.h
··· 173 /* 174 * Host cache 175 */ 176 - struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *, int, int, const char *, int); 177 - struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *, const char *, int); 178 struct rpc_clnt * nlm_bind_host(struct nlm_host *); 179 void nlm_rebind_host(struct nlm_host *); 180 struct nlm_host * nlm_get_host(struct nlm_host *); 181 void nlm_release_host(struct nlm_host *); 182 void nlm_shutdown_hosts(void); 183 - extern void nlm_host_rebooted(const struct sockaddr_in *, const char *, int, u32); 184 void nsm_release(struct nsm_handle *); 185 186
··· 173 /* 174 * Host cache 175 */ 176 + struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *, int, int, 177 + const char *, unsigned int); 178 + struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *, 179 + unsigned int); 180 struct rpc_clnt * nlm_bind_host(struct nlm_host *); 181 void nlm_rebind_host(struct nlm_host *); 182 struct nlm_host * nlm_get_host(struct nlm_host *); 183 void nlm_release_host(struct nlm_host *); 184 void nlm_shutdown_hosts(void); 185 + extern void nlm_host_rebooted(const struct sockaddr_in *, const char *, 186 + unsigned int, u32); 187 void nsm_release(struct nsm_handle *); 188 189
+2 -2
include/linux/lockd/xdr.h
··· 29 /* Lock info passed via NLM */ 30 struct nlm_lock { 31 char * caller; 32 - int len; /* length of "caller" */ 33 struct nfs_fh fh; 34 struct xdr_netobj oh; 35 u32 svid; ··· 78 */ 79 struct nlm_reboot { 80 char * mon; 81 - int len; 82 u32 state; 83 __be32 addr; 84 __be32 vers;
··· 29 /* Lock info passed via NLM */ 30 struct nlm_lock { 31 char * caller; 32 + unsigned int len; /* length of "caller" */ 33 struct nfs_fh fh; 34 struct xdr_netobj oh; 35 u32 svid; ··· 78 */ 79 struct nlm_reboot { 80 char * mon; 81 + unsigned int len; 82 u32 state; 83 __be32 addr; 84 __be32 vers;
-1
include/linux/nfsd/Kbuild
··· 4 unifdef-y += syscall.h 5 unifdef-y += nfsfh.h 6 unifdef-y += debug.h 7 - unifdef-y += auth.h
··· 4 unifdef-y += syscall.h 5 unifdef-y += nfsfh.h 6 unifdef-y += debug.h
-5
include/linux/nfsd/auth.h → fs/nfsd/auth.h
··· 1 /* 2 - * include/linux/nfsd/auth.h 3 - * 4 * nfsd-specific authentication stuff. 5 * uid/gid mapping not yet implemented. 6 * ··· 7 8 #ifndef LINUX_NFSD_AUTH_H 9 #define LINUX_NFSD_AUTH_H 10 - 11 - #ifdef __KERNEL__ 12 13 #define nfsd_luid(rq, uid) ((u32)(uid)) 14 #define nfsd_lgid(rq, gid) ((u32)(gid)) ··· 19 */ 20 int nfsd_setuser(struct svc_rqst *, struct svc_export *); 21 22 - #endif /* __KERNEL__ */ 23 #endif /* LINUX_NFSD_AUTH_H */
··· 1 /* 2 * nfsd-specific authentication stuff. 3 * uid/gid mapping not yet implemented. 4 * ··· 9 10 #ifndef LINUX_NFSD_AUTH_H 11 #define LINUX_NFSD_AUTH_H 12 13 #define nfsd_luid(rq, uid) ((u32)(uid)) 14 #define nfsd_lgid(rq, gid) ((u32)(gid)) ··· 23 */ 24 int nfsd_setuser(struct svc_rqst *, struct svc_export *); 25 26 #endif /* LINUX_NFSD_AUTH_H */
+2 -2
include/linux/nfsd/cache.h
··· 72 */ 73 #define RC_DELAY (HZ/5) 74 75 - void nfsd_cache_init(void); 76 - void nfsd_cache_shutdown(void); 77 int nfsd_cache_lookup(struct svc_rqst *, int); 78 void nfsd_cache_update(struct svc_rqst *, int, __be32 *); 79
··· 72 */ 73 #define RC_DELAY (HZ/5) 74 75 + int nfsd_reply_cache_init(void); 76 + void nfsd_reply_cache_shutdown(void); 77 int nfsd_cache_lookup(struct svc_rqst *, int); 78 void nfsd_cache_update(struct svc_rqst *, int, __be32 *); 79
+1 -1
include/linux/nfsd/export.h
··· 122 /* 123 * Function declarations 124 */ 125 - void nfsd_export_init(void); 126 void nfsd_export_shutdown(void); 127 void nfsd_export_flush(void); 128 void exp_readlock(void);
··· 122 /* 123 * Function declarations 124 */ 125 + int nfsd_export_init(void); 126 void nfsd_export_shutdown(void); 127 void nfsd_export_flush(void); 128 void exp_readlock(void);
+2 -3
include/linux/nfsd/nfsd.h
··· 20 #include <linux/nfsd/debug.h> 21 #include <linux/nfsd/nfsfh.h> 22 #include <linux/nfsd/export.h> 23 - #include <linux/nfsd/auth.h> 24 #include <linux/nfsd/stats.h> 25 /* 26 * nfsd version ··· 69 int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 70 struct svc_export **expp); 71 __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, 72 - const char *, int, struct svc_fh *); 73 __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, 74 - const char *, int, 75 struct svc_export **, struct dentry **); 76 __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, 77 struct iattr *, int, time_t);
··· 20 #include <linux/nfsd/debug.h> 21 #include <linux/nfsd/nfsfh.h> 22 #include <linux/nfsd/export.h> 23 #include <linux/nfsd/stats.h> 24 /* 25 * nfsd version ··· 70 int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 71 struct svc_export **expp); 72 __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, 73 + const char *, unsigned int, struct svc_fh *); 74 __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, 75 + const char *, unsigned int, 76 struct svc_export **, struct dentry **); 77 __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, 78 struct iattr *, int, time_t);
-1
include/linux/nfsd/syscall.h
··· 18 #include <linux/nfsd/const.h> 19 #include <linux/nfsd/export.h> 20 #include <linux/nfsd/nfsfh.h> 21 - #include <linux/nfsd/auth.h> 22 23 /* 24 * Version of the syscall interface
··· 18 #include <linux/nfsd/const.h> 19 #include <linux/nfsd/export.h> 20 #include <linux/nfsd/nfsfh.h> 21 22 /* 23 * Version of the syscall interface
+7 -7
include/linux/nfsd/xdr.h
··· 23 struct nfsd_diropargs { 24 struct svc_fh fh; 25 char * name; 26 - int len; 27 }; 28 29 struct nfsd_readargs { ··· 43 struct nfsd_createargs { 44 struct svc_fh fh; 45 char * name; 46 - int len; 47 struct iattr attrs; 48 }; 49 50 struct nfsd_renameargs { 51 struct svc_fh ffh; 52 char * fname; 53 - int flen; 54 struct svc_fh tfh; 55 char * tname; 56 - int tlen; 57 }; 58 59 struct nfsd_readlinkargs { ··· 65 struct svc_fh ffh; 66 struct svc_fh tfh; 67 char * tname; 68 - int tlen; 69 }; 70 71 struct nfsd_symlinkargs { 72 struct svc_fh ffh; 73 char * fname; 74 - int flen; 75 char * tname; 76 - int tlen; 77 struct iattr attrs; 78 }; 79
··· 23 struct nfsd_diropargs { 24 struct svc_fh fh; 25 char * name; 26 + unsigned int len; 27 }; 28 29 struct nfsd_readargs { ··· 43 struct nfsd_createargs { 44 struct svc_fh fh; 45 char * name; 46 + unsigned int len; 47 struct iattr attrs; 48 }; 49 50 struct nfsd_renameargs { 51 struct svc_fh ffh; 52 char * fname; 53 + unsigned int flen; 54 struct svc_fh tfh; 55 char * tname; 56 + unsigned int tlen; 57 }; 58 59 struct nfsd_readlinkargs { ··· 65 struct svc_fh ffh; 66 struct svc_fh tfh; 67 char * tname; 68 + unsigned int tlen; 69 }; 70 71 struct nfsd_symlinkargs { 72 struct svc_fh ffh; 73 char * fname; 74 + unsigned int flen; 75 char * tname; 76 + unsigned int tlen; 77 struct iattr attrs; 78 }; 79
+8 -8
include/linux/nfsd/xdr3.h
··· 21 struct nfsd3_diropargs { 22 struct svc_fh fh; 23 char * name; 24 - int len; 25 }; 26 27 struct nfsd3_accessargs { ··· 48 struct nfsd3_createargs { 49 struct svc_fh fh; 50 char * name; 51 - int len; 52 int createmode; 53 struct iattr attrs; 54 __be32 * verf; ··· 57 struct nfsd3_mknodargs { 58 struct svc_fh fh; 59 char * name; 60 - int len; 61 __u32 ftype; 62 __u32 major, minor; 63 struct iattr attrs; ··· 66 struct nfsd3_renameargs { 67 struct svc_fh ffh; 68 char * fname; 69 - int flen; 70 struct svc_fh tfh; 71 char * tname; 72 - int tlen; 73 }; 74 75 struct nfsd3_readlinkargs { ··· 81 struct svc_fh ffh; 82 struct svc_fh tfh; 83 char * tname; 84 - int tlen; 85 }; 86 87 struct nfsd3_symlinkargs { 88 struct svc_fh ffh; 89 char * fname; 90 - int flen; 91 char * tname; 92 - int tlen; 93 struct iattr attrs; 94 }; 95
··· 21 struct nfsd3_diropargs { 22 struct svc_fh fh; 23 char * name; 24 + unsigned int len; 25 }; 26 27 struct nfsd3_accessargs { ··· 48 struct nfsd3_createargs { 49 struct svc_fh fh; 50 char * name; 51 + unsigned int len; 52 int createmode; 53 struct iattr attrs; 54 __be32 * verf; ··· 57 struct nfsd3_mknodargs { 58 struct svc_fh fh; 59 char * name; 60 + unsigned int len; 61 __u32 ftype; 62 __u32 major, minor; 63 struct iattr attrs; ··· 66 struct nfsd3_renameargs { 67 struct svc_fh ffh; 68 char * fname; 69 + unsigned int flen; 70 struct svc_fh tfh; 71 char * tname; 72 + unsigned int tlen; 73 }; 74 75 struct nfsd3_readlinkargs { ··· 81 struct svc_fh ffh; 82 struct svc_fh tfh; 83 char * tname; 84 + unsigned int tlen; 85 }; 86 87 struct nfsd3_symlinkargs { 88 struct svc_fh ffh; 89 char * fname; 90 + unsigned int flen; 91 char * tname; 92 + unsigned int tlen; 93 struct iattr attrs; 94 }; 95
+1 -1
include/linux/nfsd/xdr4.h
··· 441 void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); 442 __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 443 struct dentry *dentry, __be32 *buffer, int *countp, 444 - u32 *bmval, struct svc_rqst *); 445 extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, 446 struct nfsd4_compound_state *, 447 struct nfsd4_setclientid *setclid);
··· 441 void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); 442 __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, 443 struct dentry *dentry, __be32 *buffer, int *countp, 444 + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); 445 extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, 446 struct nfsd4_compound_state *, 447 struct nfsd4_setclientid *setclid);
+8 -3
include/linux/nfsd_idmap.h
··· 44 #define IDMAP_NAMESZ 128 45 46 #ifdef CONFIG_NFSD_V4 47 - void nfsd_idmap_init(void); 48 void nfsd_idmap_shutdown(void); 49 #else 50 - static inline void nfsd_idmap_init(void) {}; 51 - static inline void nfsd_idmap_shutdown(void) {}; 52 #endif 53 54 int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
··· 44 #define IDMAP_NAMESZ 128 45 46 #ifdef CONFIG_NFSD_V4 47 + int nfsd_idmap_init(void); 48 void nfsd_idmap_shutdown(void); 49 #else 50 + static inline int nfsd_idmap_init(void) 51 + { 52 + return 0; 53 + } 54 + static inline void nfsd_idmap_shutdown(void) 55 + { 56 + } 57 #endif 58 59 int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
+2 -2
include/linux/sunrpc/cache.h
··· 169 extern void cache_flush(void); 170 extern void cache_purge(struct cache_detail *detail); 171 #define NEVER (0x7FFFFFFF) 172 - extern void cache_register(struct cache_detail *cd); 173 - extern int cache_unregister(struct cache_detail *cd); 174 175 extern void qword_add(char **bpp, int *lp, char *str); 176 extern void qword_addhex(char **bpp, int *lp, char *buf, int blen);
··· 169 extern void cache_flush(void); 170 extern void cache_purge(struct cache_detail *detail); 171 #define NEVER (0x7FFFFFFF) 172 + extern int cache_register(struct cache_detail *cd); 173 + extern void cache_unregister(struct cache_detail *cd); 174 175 extern void qword_add(char **bpp, int *lp, char *str); 176 extern void qword_addhex(char **bpp, int *lp, char *buf, int blen);
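With cache_register() now returning an error and cache_unregister() returning void, callers are expected to check the registration result and unwind, as the updated gss_svc_init() further down does. A minimal sketch of that pattern; my_cache, my_init and my_exit are illustrative names only:

#include <linux/module.h>
#include <linux/sunrpc/cache.h>

/* Sketch of the new caller pattern implied by the changed prototypes;
 * a real cache_detail also fills in its hash table, name, etc. */
static struct cache_detail my_cache;    /* hypothetical cache */

static int __init my_init(void)
{
        int err = cache_register(&my_cache);

        if (err)        /* e.g. -ENOMEM when the proc entries cannot be created */
                return err;
        return 0;
}

static void __exit my_exit(void)
{
        /* No return value to check any more; a still-busy cache is
         * reported inside cache_unregister() itself. */
        cache_unregister(&my_cache);
}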
+1 -1
include/linux/sunrpc/debug.h
··· 20 #define RPCDBG_BIND 0x0020 21 #define RPCDBG_SCHED 0x0040 22 #define RPCDBG_TRANS 0x0080 23 - #define RPCDBG_SVCSOCK 0x0100 24 #define RPCDBG_SVCDSP 0x0200 25 #define RPCDBG_MISC 0x0400 26 #define RPCDBG_CACHE 0x0800
··· 20 #define RPCDBG_BIND 0x0020 21 #define RPCDBG_SCHED 0x0040 22 #define RPCDBG_TRANS 0x0080 23 + #define RPCDBG_SVCXPRT 0x0100 24 #define RPCDBG_SVCDSP 0x0200 25 #define RPCDBG_MISC 0x0400 26 #define RPCDBG_CACHE 0x0800
+7 -3
include/linux/sunrpc/svc.h
··· 204 struct svc_rqst { 205 struct list_head rq_list; /* idle list */ 206 struct list_head rq_all; /* all threads list */ 207 - struct svc_sock * rq_sock; /* socket */ 208 struct sockaddr_storage rq_addr; /* peer address */ 209 size_t rq_addrlen; 210 ··· 214 struct auth_ops * rq_authop; /* authentication flavour */ 215 u32 rq_flavor; /* pseudoflavor */ 216 struct svc_cred rq_cred; /* auth info */ 217 - struct sk_buff * rq_skbuff; /* fast recv inet buffer */ 218 struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ 219 220 struct xdr_buf rq_arg; 221 struct xdr_buf rq_res; 222 struct page * rq_pages[RPCSVC_MAXPAGES]; ··· 318 319 struct svc_deferred_req { 320 u32 prot; /* protocol (UDP or TCP) */ 321 - struct svc_sock *svsk; 322 struct sockaddr_storage addr; /* where reply must go */ 323 size_t addrlen; 324 union svc_addr_u daddr; /* where reply must come from */ 325 struct cache_deferred_req handle; 326 int argslen; 327 __be32 args[0]; 328 }; ··· 384 */ 385 struct svc_serv * svc_create(struct svc_program *, unsigned int, 386 void (*shutdown)(struct svc_serv*)); 387 int svc_create_thread(svc_thread_fn, struct svc_serv *); 388 void svc_exit_thread(struct svc_rqst *); 389 struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
··· 204 struct svc_rqst { 205 struct list_head rq_list; /* idle list */ 206 struct list_head rq_all; /* all threads list */ 207 + struct svc_xprt * rq_xprt; /* transport ptr */ 208 struct sockaddr_storage rq_addr; /* peer address */ 209 size_t rq_addrlen; 210 ··· 214 struct auth_ops * rq_authop; /* authentication flavour */ 215 u32 rq_flavor; /* pseudoflavor */ 216 struct svc_cred rq_cred; /* auth info */ 217 + void * rq_xprt_ctxt; /* transport specific context ptr */ 218 struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ 219 220 + size_t rq_xprt_hlen; /* xprt header len */ 221 struct xdr_buf rq_arg; 222 struct xdr_buf rq_res; 223 struct page * rq_pages[RPCSVC_MAXPAGES]; ··· 317 318 struct svc_deferred_req { 319 u32 prot; /* protocol (UDP or TCP) */ 320 + struct svc_xprt *xprt; 321 struct sockaddr_storage addr; /* where reply must go */ 322 size_t addrlen; 323 union svc_addr_u daddr; /* where reply must come from */ 324 struct cache_deferred_req handle; 325 + size_t xprt_hlen; 326 int argslen; 327 __be32 args[0]; 328 }; ··· 382 */ 383 struct svc_serv * svc_create(struct svc_program *, unsigned int, 384 void (*shutdown)(struct svc_serv*)); 385 + struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, 386 + struct svc_pool *pool); 387 int svc_create_thread(svc_thread_fn, struct svc_serv *); 388 void svc_exit_thread(struct svc_rqst *); 389 struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
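The request structure now points at a generic transport (rq_xprt) rather than a socket, and carries a transport-private context pointer plus a transport header length. Per-request code that used to dereference rq_sock goes through rq_xprt instead; a hedged sketch using only fields declared in this merge (the helper name is illustrative):

#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>

/* Sketch: identify the transport a request arrived on via rq_xprt;
 * note_request_transport() is an illustrative helper, not part of
 * this patch. */
static void note_request_transport(struct svc_rqst *rqstp)
{
        struct svc_xprt *xprt = rqstp->rq_xprt;

        /* xcl_name is the registered class name, e.g. "tcp" or "udp" */
        printk(KERN_DEBUG "svc: request received over %s\n",
               xprt->xpt_class->xcl_name);
}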
+262
include/linux/sunrpc/svc_rdma.h
···
··· 1 + /* 2 + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + * 39 + * Author: Tom Tucker <tom@opengridcomputing.com> 40 + */ 41 + 42 + #ifndef SVC_RDMA_H 43 + #define SVC_RDMA_H 44 + #include <linux/sunrpc/xdr.h> 45 + #include <linux/sunrpc/svcsock.h> 46 + #include <linux/sunrpc/rpc_rdma.h> 47 + #include <rdma/ib_verbs.h> 48 + #include <rdma/rdma_cm.h> 49 + #define SVCRDMA_DEBUG 50 + 51 + /* RPC/RDMA parameters and stats */ 52 + extern unsigned int svcrdma_ord; 53 + extern unsigned int svcrdma_max_requests; 54 + extern unsigned int svcrdma_max_req_size; 55 + 56 + extern atomic_t rdma_stat_recv; 57 + extern atomic_t rdma_stat_read; 58 + extern atomic_t rdma_stat_write; 59 + extern atomic_t rdma_stat_sq_starve; 60 + extern atomic_t rdma_stat_rq_starve; 61 + extern atomic_t rdma_stat_rq_poll; 62 + extern atomic_t rdma_stat_rq_prod; 63 + extern atomic_t rdma_stat_sq_poll; 64 + extern atomic_t rdma_stat_sq_prod; 65 + 66 + #define RPCRDMA_VERSION 1 67 + 68 + /* 69 + * Contexts are built when an RDMA request is created and are a 70 + * record of the resources that can be recovered when the request 71 + * completes. 
72 + */ 73 + struct svc_rdma_op_ctxt { 74 + struct svc_rdma_op_ctxt *next; 75 + struct xdr_buf arg; 76 + struct list_head dto_q; 77 + enum ib_wr_opcode wr_op; 78 + enum ib_wc_status wc_status; 79 + u32 byte_len; 80 + struct svcxprt_rdma *xprt; 81 + unsigned long flags; 82 + enum dma_data_direction direction; 83 + int count; 84 + struct ib_sge sge[RPCSVC_MAXPAGES]; 85 + struct page *pages[RPCSVC_MAXPAGES]; 86 + }; 87 + 88 + #define RDMACTXT_F_READ_DONE 1 89 + #define RDMACTXT_F_LAST_CTXT 2 90 + 91 + struct svcxprt_rdma { 92 + struct svc_xprt sc_xprt; /* SVC transport structure */ 93 + struct rdma_cm_id *sc_cm_id; /* RDMA connection id */ 94 + struct list_head sc_accept_q; /* Conn. waiting accept */ 95 + int sc_ord; /* RDMA read limit */ 96 + wait_queue_head_t sc_read_wait; 97 + int sc_max_sge; 98 + 99 + int sc_sq_depth; /* Depth of SQ */ 100 + atomic_t sc_sq_count; /* Number of SQ WR on queue */ 101 + 102 + int sc_max_requests; /* Depth of RQ */ 103 + int sc_max_req_size; /* Size of each RQ WR buf */ 104 + 105 + struct ib_pd *sc_pd; 106 + 107 + struct svc_rdma_op_ctxt *sc_ctxt_head; 108 + int sc_ctxt_cnt; 109 + int sc_ctxt_bump; 110 + int sc_ctxt_max; 111 + spinlock_t sc_ctxt_lock; 112 + struct list_head sc_rq_dto_q; 113 + spinlock_t sc_rq_dto_lock; 114 + struct ib_qp *sc_qp; 115 + struct ib_cq *sc_rq_cq; 116 + struct ib_cq *sc_sq_cq; 117 + struct ib_mr *sc_phys_mr; /* MR for server memory */ 118 + 119 + spinlock_t sc_lock; /* transport lock */ 120 + 121 + wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ 122 + unsigned long sc_flags; 123 + struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */ 124 + struct list_head sc_read_complete_q; 125 + spinlock_t sc_read_complete_lock; 126 + }; 127 + /* sc_flags */ 128 + #define RDMAXPRT_RQ_PENDING 1 129 + #define RDMAXPRT_SQ_PENDING 2 130 + #define RDMAXPRT_CONN_PENDING 3 131 + 132 + #define RPCRDMA_LISTEN_BACKLOG 10 133 + /* The default ORD value is based on two outstanding full-size writes with a 134 + * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. 
*/ 135 + #define RPCRDMA_ORD (64/4) 136 + #define RPCRDMA_SQ_DEPTH_MULT 8 137 + #define RPCRDMA_MAX_THREADS 16 138 + #define RPCRDMA_MAX_REQUESTS 16 139 + #define RPCRDMA_MAX_REQ_SIZE 4096 140 + 141 + /* svc_rdma_marshal.c */ 142 + extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *, 143 + int *, int *); 144 + extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); 145 + extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); 146 + extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, 147 + struct rpcrdma_msg *, 148 + enum rpcrdma_errcode, u32 *); 149 + extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); 150 + extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); 151 + extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, 152 + u32, u64, u32); 153 + extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *, 154 + struct rpcrdma_msg *, 155 + struct rpcrdma_msg *, 156 + enum rpcrdma_proc); 157 + extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *); 158 + 159 + /* svc_rdma_recvfrom.c */ 160 + extern int svc_rdma_recvfrom(struct svc_rqst *); 161 + 162 + /* svc_rdma_sendto.c */ 163 + extern int svc_rdma_sendto(struct svc_rqst *); 164 + 165 + /* svc_rdma_transport.c */ 166 + extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); 167 + extern int svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, 168 + enum rpcrdma_errcode); 169 + struct page *svc_rdma_get_page(void); 170 + extern int svc_rdma_post_recv(struct svcxprt_rdma *); 171 + extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); 172 + extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); 173 + extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); 174 + extern void svc_sq_reap(struct svcxprt_rdma *); 175 + extern void svc_rq_reap(struct svcxprt_rdma *); 176 + extern struct svc_xprt_class svc_rdma_class; 177 + extern void svc_rdma_prep_reply_hdr(struct svc_rqst *); 178 + 179 + /* svc_rdma.c */ 180 + extern int svc_rdma_init(void); 181 + extern void svc_rdma_cleanup(void); 182 + 183 + /* 184 + * Returns the address of the first read chunk or <nul> if no read chunk is 185 + * present 186 + */ 187 + static inline struct rpcrdma_read_chunk * 188 + svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) 189 + { 190 + struct rpcrdma_read_chunk *ch = 191 + (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; 192 + 193 + if (ch->rc_discrim == 0) 194 + return NULL; 195 + 196 + return ch; 197 + } 198 + 199 + /* 200 + * Returns the address of the first read write array element or <nul> if no 201 + * write array list is present 202 + */ 203 + static inline struct rpcrdma_write_array * 204 + svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) 205 + { 206 + if (rmsgp->rm_body.rm_chunks[0] != 0 207 + || rmsgp->rm_body.rm_chunks[1] == 0) 208 + return NULL; 209 + 210 + return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; 211 + } 212 + 213 + /* 214 + * Returns the address of the first reply array element or <nul> if no 215 + * reply array is present 216 + */ 217 + static inline struct rpcrdma_write_array * 218 + svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) 219 + { 220 + struct rpcrdma_read_chunk *rch; 221 + struct rpcrdma_write_array *wr_ary; 222 + struct rpcrdma_write_array *rp_ary; 223 + 224 + /* XXX: Need to fix when reply list may occur with read-list and/or 225 + * write list */ 226 + if 
(rmsgp->rm_body.rm_chunks[0] != 0 || 227 + rmsgp->rm_body.rm_chunks[1] != 0) 228 + return NULL; 229 + 230 + rch = svc_rdma_get_read_chunk(rmsgp); 231 + if (rch) { 232 + while (rch->rc_discrim) 233 + rch++; 234 + 235 + /* The reply list follows an empty write array located 236 + * at 'rc_position' here. The reply array is at rc_target. 237 + */ 238 + rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; 239 + 240 + goto found_it; 241 + } 242 + 243 + wr_ary = svc_rdma_get_write_array(rmsgp); 244 + if (wr_ary) { 245 + rp_ary = (struct rpcrdma_write_array *) 246 + &wr_ary-> 247 + wc_array[wr_ary->wc_nchunks].wc_target.rs_length; 248 + 249 + goto found_it; 250 + } 251 + 252 + /* No read list, no write list */ 253 + rp_ary = (struct rpcrdma_write_array *) 254 + &rmsgp->rm_body.rm_chunks[2]; 255 + 256 + found_it: 257 + if (rp_ary->wc_discrim == 0) 258 + return NULL; 259 + 260 + return rp_ary; 261 + } 262 + #endif
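The inline helpers above locate the read, write and reply chunk lists inside a received RPC/RDMA header, with a zero discriminator terminating each list. A short sketch of walking the read list with those helpers; the function is illustrative, not part of the transport code:

#include <linux/sunrpc/svc_rdma.h>

/* Sketch: count the read chunks in an RPC/RDMA header, relying only on
 * svc_rdma_get_read_chunk() and the zero-discriminator convention used
 * by the inlines above. */
static int count_read_chunks(struct rpcrdma_msg *rmsgp)
{
        struct rpcrdma_read_chunk *ch = svc_rdma_get_read_chunk(rmsgp);
        int n = 0;

        if (!ch)
                return 0;       /* no read list present */
        for (; ch->rc_discrim != 0; ch++)
                n++;            /* each entry names a client segment to RDMA_READ */
        return n;
}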
+159
include/linux/sunrpc/svc_xprt.h
···
··· 1 + /* 2 + * linux/include/linux/sunrpc/svc_xprt.h 3 + * 4 + * RPC server transport I/O 5 + */ 6 + 7 + #ifndef SUNRPC_SVC_XPRT_H 8 + #define SUNRPC_SVC_XPRT_H 9 + 10 + #include <linux/sunrpc/svc.h> 11 + #include <linux/module.h> 12 + 13 + struct svc_xprt_ops { 14 + struct svc_xprt *(*xpo_create)(struct svc_serv *, 15 + struct sockaddr *, int, 16 + int); 17 + struct svc_xprt *(*xpo_accept)(struct svc_xprt *); 18 + int (*xpo_has_wspace)(struct svc_xprt *); 19 + int (*xpo_recvfrom)(struct svc_rqst *); 20 + void (*xpo_prep_reply_hdr)(struct svc_rqst *); 21 + int (*xpo_sendto)(struct svc_rqst *); 22 + void (*xpo_release_rqst)(struct svc_rqst *); 23 + void (*xpo_detach)(struct svc_xprt *); 24 + void (*xpo_free)(struct svc_xprt *); 25 + }; 26 + 27 + struct svc_xprt_class { 28 + const char *xcl_name; 29 + struct module *xcl_owner; 30 + struct svc_xprt_ops *xcl_ops; 31 + struct list_head xcl_list; 32 + u32 xcl_max_payload; 33 + }; 34 + 35 + struct svc_xprt { 36 + struct svc_xprt_class *xpt_class; 37 + struct svc_xprt_ops *xpt_ops; 38 + struct kref xpt_ref; 39 + struct list_head xpt_list; 40 + struct list_head xpt_ready; 41 + unsigned long xpt_flags; 42 + #define XPT_BUSY 0 /* enqueued/receiving */ 43 + #define XPT_CONN 1 /* conn pending */ 44 + #define XPT_CLOSE 2 /* dead or dying */ 45 + #define XPT_DATA 3 /* data pending */ 46 + #define XPT_TEMP 4 /* connected transport */ 47 + #define XPT_DEAD 6 /* transport closed */ 48 + #define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */ 49 + #define XPT_DEFERRED 8 /* deferred request pending */ 50 + #define XPT_OLD 9 /* used for xprt aging mark+sweep */ 51 + #define XPT_DETACHED 10 /* detached from tempsocks list */ 52 + #define XPT_LISTENER 11 /* listening endpoint */ 53 + #define XPT_CACHE_AUTH 12 /* cache auth info */ 54 + 55 + struct svc_pool *xpt_pool; /* current pool iff queued */ 56 + struct svc_serv *xpt_server; /* service for transport */ 57 + atomic_t xpt_reserved; /* space on outq that is rsvd */ 58 + struct mutex xpt_mutex; /* to serialize sending data */ 59 + spinlock_t xpt_lock; /* protects sk_deferred 60 + * and xpt_auth_cache */ 61 + void *xpt_auth_cache;/* auth cache */ 62 + struct list_head xpt_deferred; /* deferred requests that need 63 + * to be revisted */ 64 + struct sockaddr_storage xpt_local; /* local address */ 65 + size_t xpt_locallen; /* length of address */ 66 + struct sockaddr_storage xpt_remote; /* remote peer's address */ 67 + size_t xpt_remotelen; /* length of address */ 68 + }; 69 + 70 + int svc_reg_xprt_class(struct svc_xprt_class *); 71 + void svc_unreg_xprt_class(struct svc_xprt_class *); 72 + void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, 73 + struct svc_serv *); 74 + int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); 75 + void svc_xprt_enqueue(struct svc_xprt *xprt); 76 + void svc_xprt_received(struct svc_xprt *); 77 + void svc_xprt_put(struct svc_xprt *xprt); 78 + void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt); 79 + void svc_close_xprt(struct svc_xprt *xprt); 80 + void svc_delete_xprt(struct svc_xprt *xprt); 81 + int svc_port_is_privileged(struct sockaddr *sin); 82 + int svc_print_xprts(char *buf, int maxlen); 83 + struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int); 84 + int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); 85 + 86 + static inline void svc_xprt_get(struct svc_xprt *xprt) 87 + { 88 + kref_get(&xprt->xpt_ref); 89 + } 90 + static inline void svc_xprt_set_local(struct svc_xprt *xprt, 91 + struct sockaddr 
*sa, int salen) 92 + { 93 + memcpy(&xprt->xpt_local, sa, salen); 94 + xprt->xpt_locallen = salen; 95 + } 96 + static inline void svc_xprt_set_remote(struct svc_xprt *xprt, 97 + struct sockaddr *sa, int salen) 98 + { 99 + memcpy(&xprt->xpt_remote, sa, salen); 100 + xprt->xpt_remotelen = salen; 101 + } 102 + static inline unsigned short svc_addr_port(struct sockaddr *sa) 103 + { 104 + unsigned short ret = 0; 105 + switch (sa->sa_family) { 106 + case AF_INET: 107 + ret = ntohs(((struct sockaddr_in *)sa)->sin_port); 108 + break; 109 + case AF_INET6: 110 + ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); 111 + break; 112 + } 113 + return ret; 114 + } 115 + 116 + static inline size_t svc_addr_len(struct sockaddr *sa) 117 + { 118 + switch (sa->sa_family) { 119 + case AF_INET: 120 + return sizeof(struct sockaddr_in); 121 + case AF_INET6: 122 + return sizeof(struct sockaddr_in6); 123 + } 124 + return -EAFNOSUPPORT; 125 + } 126 + 127 + static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt) 128 + { 129 + return svc_addr_port((struct sockaddr *)&xprt->xpt_local); 130 + } 131 + 132 + static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) 133 + { 134 + return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); 135 + } 136 + 137 + static inline char *__svc_print_addr(struct sockaddr *addr, 138 + char *buf, size_t len) 139 + { 140 + switch (addr->sa_family) { 141 + case AF_INET: 142 + snprintf(buf, len, "%u.%u.%u.%u, port=%u", 143 + NIPQUAD(((struct sockaddr_in *) addr)->sin_addr), 144 + ntohs(((struct sockaddr_in *) addr)->sin_port)); 145 + break; 146 + 147 + case AF_INET6: 148 + snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u", 149 + NIP6(((struct sockaddr_in6 *) addr)->sin6_addr), 150 + ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); 151 + break; 152 + 153 + default: 154 + snprintf(buf, len, "unknown address type: %d", addr->sa_family); 155 + break; 156 + } 157 + return buf; 158 + } 159 + #endif /* SUNRPC_SVC_XPRT_H */
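A transport makes itself available to services by filling in an svc_xprt_class and registering it; svc_create_xprt() then instantiates listeners by class name, as the nfsd and RDMA code elsewhere in this merge does. A skeleton sketch built only on the declarations above, with the xpo_* callbacks left out and all my_* names illustrative:

#include <linux/module.h>
#include <linux/sunrpc/svc_xprt.h>

/* Skeleton only: a real transport fills in the xpo_* callbacks
 * (create/accept/recvfrom/sendto/detach/free/...). */
static struct svc_xprt_ops my_xprt_ops;

static struct svc_xprt_class my_xprt_class = {
        .xcl_name        = "mytransport",          /* looked up by svc_create_xprt() */
        .xcl_owner       = THIS_MODULE,
        .xcl_ops         = &my_xprt_ops,
        .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,  /* illustrative payload cap */
};

static int __init my_transport_init(void)
{
        return svc_reg_xprt_class(&my_xprt_class);
}

static void __exit my_transport_exit(void)
{
        svc_unreg_xprt_class(&my_xprt_class);
}

module_init(my_transport_init);
module_exit(my_transport_exit);
MODULE_LICENSE("GPL");

A service would then bring up a listener for the class by name, e.g. svc_create_xprt(serv, "mytransport", port, flags).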
+5 -38
include/linux/sunrpc/svcsock.h
··· 10 #define SUNRPC_SVCSOCK_H 11 12 #include <linux/sunrpc/svc.h> 13 14 /* 15 * RPC server socket. 16 */ 17 struct svc_sock { 18 - struct list_head sk_ready; /* list of ready sockets */ 19 - struct list_head sk_list; /* list of all sockets */ 20 struct socket * sk_sock; /* berkeley socket layer */ 21 struct sock * sk_sk; /* INET layer */ 22 - 23 - struct svc_pool * sk_pool; /* current pool iff queued */ 24 - struct svc_serv * sk_server; /* service for this socket */ 25 - atomic_t sk_inuse; /* use count */ 26 - unsigned long sk_flags; 27 - #define SK_BUSY 0 /* enqueued/receiving */ 28 - #define SK_CONN 1 /* conn pending */ 29 - #define SK_CLOSE 2 /* dead or dying */ 30 - #define SK_DATA 3 /* data pending */ 31 - #define SK_TEMP 4 /* temp (TCP) socket */ 32 - #define SK_DEAD 6 /* socket closed */ 33 - #define SK_CHNGBUF 7 /* need to change snd/rcv buffer sizes */ 34 - #define SK_DEFERRED 8 /* request on sk_deferred */ 35 - #define SK_OLD 9 /* used for temp socket aging mark+sweep */ 36 - #define SK_DETACHED 10 /* detached from tempsocks list */ 37 - 38 - atomic_t sk_reserved; /* space on outq that is reserved */ 39 - 40 - spinlock_t sk_lock; /* protects sk_deferred and 41 - * sk_info_authunix */ 42 - struct list_head sk_deferred; /* deferred requests that need to 43 - * be revisted */ 44 - struct mutex sk_mutex; /* to serialize sending data */ 45 - 46 - int (*sk_recvfrom)(struct svc_rqst *rqstp); 47 - int (*sk_sendto)(struct svc_rqst *rqstp); 48 49 /* We keep the old state_change and data_ready CB's here */ 50 void (*sk_ostate)(struct sock *); ··· 28 /* private TCP part */ 29 int sk_reclen; /* length of record */ 30 int sk_tcplen; /* current read length */ 31 - time_t sk_lastrecv; /* time of last received request */ 32 - 33 - /* cache of various info for TCP sockets */ 34 - void *sk_info_authunix; 35 - 36 - struct sockaddr_storage sk_local; /* local address */ 37 - struct sockaddr_storage sk_remote; /* remote peer's address */ 38 - int sk_remotelen; /* length of address */ 39 }; 40 41 /* 42 * Function prototypes. 43 */ 44 - int svc_makesock(struct svc_serv *, int, unsigned short, int flags); 45 - void svc_force_close_socket(struct svc_sock *); 46 int svc_recv(struct svc_rqst *, long); 47 int svc_send(struct svc_rqst *); 48 void svc_drop(struct svc_rqst *); ··· 43 int fd, 44 char *name_return, 45 int *proto); 46 47 /* 48 * svc_makesock socket characteristics
··· 10 #define SUNRPC_SVCSOCK_H 11 12 #include <linux/sunrpc/svc.h> 13 + #include <linux/sunrpc/svc_xprt.h> 14 15 /* 16 * RPC server socket. 17 */ 18 struct svc_sock { 19 + struct svc_xprt sk_xprt; 20 struct socket * sk_sock; /* berkeley socket layer */ 21 struct sock * sk_sk; /* INET layer */ 22 23 /* We keep the old state_change and data_ready CB's here */ 24 void (*sk_ostate)(struct sock *); ··· 54 /* private TCP part */ 55 int sk_reclen; /* length of record */ 56 int sk_tcplen; /* current read length */ 57 }; 58 59 /* 60 * Function prototypes. 61 */ 62 + void svc_close_all(struct list_head *); 63 int svc_recv(struct svc_rqst *, long); 64 int svc_send(struct svc_rqst *); 65 void svc_drop(struct svc_rqst *); ··· 78 int fd, 79 char *name_return, 80 int *proto); 81 + void svc_init_xprt_sock(void); 82 + void svc_cleanup_xprt_sock(void); 83 84 /* 85 * svc_makesock socket characteristics
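With the per-socket flags, lists and callbacks folded into the embedded svc_xprt, socket-transport code can get from the generic transport back to its svc_sock with the usual container_of() idiom. A small sketch; the helper name is illustrative, not taken from this patch:

#include <linux/kernel.h>
#include <linux/sunrpc/svcsock.h>

/* Sketch: map the embedded generic transport back to its svc_sock. */
static inline struct svc_sock *xprt_to_svsk(struct svc_xprt *xprt)
{
        return container_of(xprt, struct svc_sock, sk_xprt);
}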
+2 -1
include/linux/sunrpc/xdr.h
··· 112 __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len); 113 __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len); 114 __be32 *xdr_encode_string(__be32 *p, const char *s); 115 - __be32 *xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen); 116 __be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *); 117 __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); 118
··· 112 __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len); 113 __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len); 114 __be32 *xdr_encode_string(__be32 *p, const char *s); 115 + __be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp, 116 + unsigned int maxlen); 117 __be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *); 118 __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); 119
+2 -1
net/sunrpc/Makefile
··· 11 auth.o auth_null.o auth_unix.o \ 12 svc.o svcsock.o svcauth.o svcauth_unix.o \ 13 rpcb_clnt.o timer.o xdr.o \ 14 - sunrpc_syms.o cache.o rpc_pipe.o 15 sunrpc-$(CONFIG_PROC_FS) += stats.o 16 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
··· 11 auth.o auth_null.o auth_unix.o \ 12 svc.o svcsock.o svcauth.o svcauth_unix.o \ 13 rpcb_clnt.o timer.o xdr.o \ 14 + sunrpc_syms.o cache.o rpc_pipe.o \ 15 + svc_xprt.o 16 sunrpc-$(CONFIG_PROC_FS) += stats.o 17 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
+51 -42
net/sunrpc/auth_gss/svcauth_gss.c
··· 224 225 /* major/minor */ 226 len = qword_get(&mesg, buf, mlen); 227 if (len < 0) 228 goto out; 229 - if (len == 0) { 230 goto out; 231 - } else { 232 - rsii.major_status = simple_strtoul(buf, &ep, 10); 233 - if (*ep) 234 - goto out; 235 - len = qword_get(&mesg, buf, mlen); 236 - if (len <= 0) 237 - goto out; 238 - rsii.minor_status = simple_strtoul(buf, &ep, 10); 239 - if (*ep) 240 - goto out; 241 242 - /* out_handle */ 243 - len = qword_get(&mesg, buf, mlen); 244 - if (len < 0) 245 - goto out; 246 - status = -ENOMEM; 247 - if (dup_to_netobj(&rsii.out_handle, buf, len)) 248 - goto out; 249 - 250 - /* out_token */ 251 - len = qword_get(&mesg, buf, mlen); 252 - status = -EINVAL; 253 - if (len < 0) 254 - goto out; 255 - status = -ENOMEM; 256 - if (dup_to_netobj(&rsii.out_token, buf, len)) 257 - goto out; 258 - } 259 rsii.h.expiry_time = expiry; 260 rsip = rsi_update(&rsii, rsip); 261 status = 0; ··· 971 struct kvec *resv = &rqstp->rq_res.head[0]; 972 struct xdr_netobj tmpobj; 973 struct rsi *rsip, rsikey; 974 975 /* Read the verifier; should be NULL: */ 976 *authp = rpc_autherr_badverf; ··· 1011 /* No upcall result: */ 1012 return SVC_DROP; 1013 case 0: 1014 /* Got an answer to the upcall; use it: */ 1015 if (gss_write_init_verf(rqstp, rsip)) 1016 - return SVC_DROP; 1017 if (resv->iov_len + 4 > PAGE_SIZE) 1018 - return SVC_DROP; 1019 svc_putnl(resv, RPC_SUCCESS); 1020 if (svc_safe_putnetobj(resv, &rsip->out_handle)) 1021 - return SVC_DROP; 1022 if (resv->iov_len + 3 * 4 > PAGE_SIZE) 1023 - return SVC_DROP; 1024 svc_putnl(resv, rsip->major_status); 1025 svc_putnl(resv, rsip->minor_status); 1026 svc_putnl(resv, GSS_SEQ_WIN); 1027 if (svc_safe_putnetobj(resv, &rsip->out_token)) 1028 - return SVC_DROP; 1029 } 1030 - return SVC_COMPLETE; 1031 } 1032 1033 /* ··· 1126 case RPC_GSS_PROC_DESTROY: 1127 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) 1128 goto auth_err; 1129 set_bit(CACHE_NEGATIVE, &rsci->h.flags); 1130 if (resv->iov_len + 4 > PAGE_SIZE) 1131 goto drop; ··· 1388 gss_svc_init(void) 1389 { 1390 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); 1391 - if (rv == 0) { 1392 - cache_register(&rsc_cache); 1393 - cache_register(&rsi_cache); 1394 - } 1395 return rv; 1396 } 1397 1398 void 1399 gss_svc_shutdown(void) 1400 { 1401 - if (cache_unregister(&rsc_cache)) 1402 - printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n"); 1403 - if (cache_unregister(&rsi_cache)) 1404 - printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n"); 1405 svc_auth_unregister(RPC_AUTH_GSS); 1406 }
··· 224 225 /* major/minor */ 226 len = qword_get(&mesg, buf, mlen); 227 + if (len <= 0) 228 + goto out; 229 + rsii.major_status = simple_strtoul(buf, &ep, 10); 230 + if (*ep) 231 + goto out; 232 + len = qword_get(&mesg, buf, mlen); 233 + if (len <= 0) 234 + goto out; 235 + rsii.minor_status = simple_strtoul(buf, &ep, 10); 236 + if (*ep) 237 + goto out; 238 + 239 + /* out_handle */ 240 + len = qword_get(&mesg, buf, mlen); 241 if (len < 0) 242 goto out; 243 + status = -ENOMEM; 244 + if (dup_to_netobj(&rsii.out_handle, buf, len)) 245 goto out; 246 247 + /* out_token */ 248 + len = qword_get(&mesg, buf, mlen); 249 + status = -EINVAL; 250 + if (len < 0) 251 + goto out; 252 + status = -ENOMEM; 253 + if (dup_to_netobj(&rsii.out_token, buf, len)) 254 + goto out; 255 rsii.h.expiry_time = expiry; 256 rsip = rsi_update(&rsii, rsip); 257 status = 0; ··· 975 struct kvec *resv = &rqstp->rq_res.head[0]; 976 struct xdr_netobj tmpobj; 977 struct rsi *rsip, rsikey; 978 + int ret; 979 980 /* Read the verifier; should be NULL: */ 981 *authp = rpc_autherr_badverf; ··· 1014 /* No upcall result: */ 1015 return SVC_DROP; 1016 case 0: 1017 + ret = SVC_DROP; 1018 /* Got an answer to the upcall; use it: */ 1019 if (gss_write_init_verf(rqstp, rsip)) 1020 + goto out; 1021 if (resv->iov_len + 4 > PAGE_SIZE) 1022 + goto out; 1023 svc_putnl(resv, RPC_SUCCESS); 1024 if (svc_safe_putnetobj(resv, &rsip->out_handle)) 1025 + goto out; 1026 if (resv->iov_len + 3 * 4 > PAGE_SIZE) 1027 + goto out; 1028 svc_putnl(resv, rsip->major_status); 1029 svc_putnl(resv, rsip->minor_status); 1030 svc_putnl(resv, GSS_SEQ_WIN); 1031 if (svc_safe_putnetobj(resv, &rsip->out_token)) 1032 + goto out; 1033 } 1034 + ret = SVC_COMPLETE; 1035 + out: 1036 + cache_put(&rsip->h, &rsi_cache); 1037 + return ret; 1038 } 1039 1040 /* ··· 1125 case RPC_GSS_PROC_DESTROY: 1126 if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) 1127 goto auth_err; 1128 + rsci->h.expiry_time = get_seconds(); 1129 set_bit(CACHE_NEGATIVE, &rsci->h.flags); 1130 if (resv->iov_len + 4 > PAGE_SIZE) 1131 goto drop; ··· 1386 gss_svc_init(void) 1387 { 1388 int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); 1389 + if (rv) 1390 + return rv; 1391 + rv = cache_register(&rsc_cache); 1392 + if (rv) 1393 + goto out1; 1394 + rv = cache_register(&rsi_cache); 1395 + if (rv) 1396 + goto out2; 1397 + return 0; 1398 + out2: 1399 + cache_unregister(&rsc_cache); 1400 + out1: 1401 + svc_auth_unregister(RPC_AUTH_GSS); 1402 return rv; 1403 } 1404 1405 void 1406 gss_svc_shutdown(void) 1407 { 1408 + cache_unregister(&rsc_cache); 1409 + cache_unregister(&rsi_cache); 1410 svc_auth_unregister(RPC_AUTH_GSS); 1411 }
+92 -58
net/sunrpc/cache.c
··· 245 cache_put(h, detail); 246 return rv; 247 } 248 249 /* 250 * caches need to be periodically cleaned. ··· 291 static void do_cache_clean(struct work_struct *work); 292 static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); 293 294 - void cache_register(struct cache_detail *cd) 295 { 296 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); 297 - if (cd->proc_ent) { 298 - struct proc_dir_entry *p; 299 - cd->proc_ent->owner = cd->owner; 300 - cd->channel_ent = cd->content_ent = NULL; 301 302 - p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, 303 cd->proc_ent); 304 - cd->flush_ent = p; 305 - if (p) { 306 - p->proc_fops = &cache_flush_operations; 307 - p->owner = cd->owner; 308 - p->data = cd; 309 - } 310 - 311 - if (cd->cache_request || cd->cache_parse) { 312 - p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, 313 - cd->proc_ent); 314 - cd->channel_ent = p; 315 - if (p) { 316 - p->proc_fops = &cache_file_operations; 317 - p->owner = cd->owner; 318 - p->data = cd; 319 - } 320 - } 321 - if (cd->cache_show) { 322 - p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, 323 - cd->proc_ent); 324 - cd->content_ent = p; 325 - if (p) { 326 - p->proc_fops = &content_file_operations; 327 - p->owner = cd->owner; 328 - p->data = cd; 329 - } 330 - } 331 } 332 rwlock_init(&cd->hash_lock); 333 INIT_LIST_HEAD(&cd->queue); 334 spin_lock(&cache_list_lock); ··· 376 377 /* start the cleaning process */ 378 schedule_delayed_work(&cache_cleaner, 0); 379 } 380 381 - int cache_unregister(struct cache_detail *cd) 382 { 383 cache_purge(cd); 384 spin_lock(&cache_list_lock); ··· 388 if (cd->entries || atomic_read(&cd->inuse)) { 389 write_unlock(&cd->hash_lock); 390 spin_unlock(&cache_list_lock); 391 - return -EBUSY; 392 } 393 if (current_detail == cd) 394 current_detail = NULL; 395 list_del_init(&cd->others); 396 write_unlock(&cd->hash_lock); 397 spin_unlock(&cache_list_lock); 398 - if (cd->proc_ent) { 399 - if (cd->flush_ent) 400 - remove_proc_entry("flush", cd->proc_ent); 401 - if (cd->channel_ent) 402 - remove_proc_entry("channel", cd->proc_ent); 403 - if (cd->content_ent) 404 - remove_proc_entry("content", cd->proc_ent); 405 - 406 - cd->proc_ent = NULL; 407 - remove_proc_entry(cd->name, proc_net_rpc); 408 - } 409 if (list_empty(&cache_list)) { 410 /* module must be being unloaded so its safe to kill the worker */ 411 cancel_delayed_work_sync(&cache_cleaner); 412 } 413 - return 0; 414 } 415 416 /* clean cache tries to find something to clean 417 * and cleans it. ··· 519 while (cache_clean() != -1) 520 cond_resched(); 521 } 522 523 void cache_purge(struct cache_detail *detail) 524 { ··· 528 cache_flush(); 529 detail->flush_time = 1; 530 } 531 - 532 533 534 /* ··· 665 /* 666 * communicate with user-space 667 * 668 - * We have a magic /proc file - /proc/sunrpc/cache 669 - * On read, you get a full request, or block 670 - * On write, an update request is processed 671 - * Poll works if anything to read, and always allows write 672 * 673 * Implemented by linked list of requests. Each open file has 674 - * a ->private that also exists in this list. New request are added 675 * to the end and may wakeup and preceding readers. 676 * New readers are added to the head. If, on read, an item is found with 677 * CACHE_UPCALLING clear, we free it from the list. 
··· 994 *bpp = bp; 995 *lp = len; 996 } 997 998 void qword_addhex(char **bpp, int *lp, char *buf, int blen) 999 { ··· 1023 *bpp = bp; 1024 *lp = len; 1025 } 1026 1027 static void warn_no_listener(struct cache_detail *detail) 1028 { ··· 1146 *dest = '\0'; 1147 return len; 1148 } 1149 1150 1151 /* ··· 1278 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; 1279 char tbuf[20]; 1280 unsigned long p = *ppos; 1281 - int len; 1282 1283 sprintf(tbuf, "%lu\n", cd->flush_time); 1284 len = strlen(tbuf); 1285 if (p >= len) 1286 return 0; 1287 len -= p; 1288 - if (len > count) len = count; 1289 if (copy_to_user(buf, (void*)(tbuf+p), len)) 1290 - len = -EFAULT; 1291 - else 1292 - *ppos += len; 1293 return len; 1294 } 1295
··· 245 cache_put(h, detail); 246 return rv; 247 } 248 + EXPORT_SYMBOL(cache_check); 249 250 /* 251 * caches need to be periodically cleaned. ··· 290 static void do_cache_clean(struct work_struct *work); 291 static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); 292 293 + static void remove_cache_proc_entries(struct cache_detail *cd) 294 { 295 + if (cd->proc_ent == NULL) 296 + return; 297 + if (cd->flush_ent) 298 + remove_proc_entry("flush", cd->proc_ent); 299 + if (cd->channel_ent) 300 + remove_proc_entry("channel", cd->proc_ent); 301 + if (cd->content_ent) 302 + remove_proc_entry("content", cd->proc_ent); 303 + cd->proc_ent = NULL; 304 + remove_proc_entry(cd->name, proc_net_rpc); 305 + } 306 + 307 + #ifdef CONFIG_PROC_FS 308 + static int create_cache_proc_entries(struct cache_detail *cd) 309 + { 310 + struct proc_dir_entry *p; 311 + 312 cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); 313 + if (cd->proc_ent == NULL) 314 + goto out_nomem; 315 + cd->proc_ent->owner = cd->owner; 316 + cd->channel_ent = cd->content_ent = NULL; 317 318 + p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent); 319 + cd->flush_ent = p; 320 + if (p == NULL) 321 + goto out_nomem; 322 + p->proc_fops = &cache_flush_operations; 323 + p->owner = cd->owner; 324 + p->data = cd; 325 + 326 + if (cd->cache_request || cd->cache_parse) { 327 + p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, 328 cd->proc_ent); 329 + cd->channel_ent = p; 330 + if (p == NULL) 331 + goto out_nomem; 332 + p->proc_fops = &cache_file_operations; 333 + p->owner = cd->owner; 334 + p->data = cd; 335 } 336 + if (cd->cache_show) { 337 + p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, 338 + cd->proc_ent); 339 + cd->content_ent = p; 340 + if (p == NULL) 341 + goto out_nomem; 342 + p->proc_fops = &content_file_operations; 343 + p->owner = cd->owner; 344 + p->data = cd; 345 + } 346 + return 0; 347 + out_nomem: 348 + remove_cache_proc_entries(cd); 349 + return -ENOMEM; 350 + } 351 + #else /* CONFIG_PROC_FS */ 352 + static int create_cache_proc_entries(struct cache_detail *cd) 353 + { 354 + return 0; 355 + } 356 + #endif 357 + 358 + int cache_register(struct cache_detail *cd) 359 + { 360 + int ret; 361 + 362 + ret = create_cache_proc_entries(cd); 363 + if (ret) 364 + return ret; 365 rwlock_init(&cd->hash_lock); 366 INIT_LIST_HEAD(&cd->queue); 367 spin_lock(&cache_list_lock); ··· 341 342 /* start the cleaning process */ 343 schedule_delayed_work(&cache_cleaner, 0); 344 + return 0; 345 } 346 + EXPORT_SYMBOL(cache_register); 347 348 + void cache_unregister(struct cache_detail *cd) 349 { 350 cache_purge(cd); 351 spin_lock(&cache_list_lock); ··· 351 if (cd->entries || atomic_read(&cd->inuse)) { 352 write_unlock(&cd->hash_lock); 353 spin_unlock(&cache_list_lock); 354 + goto out; 355 } 356 if (current_detail == cd) 357 current_detail = NULL; 358 list_del_init(&cd->others); 359 write_unlock(&cd->hash_lock); 360 spin_unlock(&cache_list_lock); 361 + remove_cache_proc_entries(cd); 362 if (list_empty(&cache_list)) { 363 /* module must be being unloaded so its safe to kill the worker */ 364 cancel_delayed_work_sync(&cache_cleaner); 365 } 366 + return; 367 + out: 368 + printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name); 369 } 370 + EXPORT_SYMBOL(cache_unregister); 371 372 /* clean cache tries to find something to clean 373 * and cleans it. 
··· 489 while (cache_clean() != -1) 490 cond_resched(); 491 } 492 + EXPORT_SYMBOL(cache_flush); 493 494 void cache_purge(struct cache_detail *detail) 495 { ··· 497 cache_flush(); 498 detail->flush_time = 1; 499 } 500 + EXPORT_SYMBOL(cache_purge); 501 502 503 /* ··· 634 /* 635 * communicate with user-space 636 * 637 + * We have a magic /proc file - /proc/sunrpc/<cachename>/channel. 638 + * On read, you get a full request, or block. 639 + * On write, an update request is processed. 640 + * Poll works if anything to read, and always allows write. 641 * 642 * Implemented by linked list of requests. Each open file has 643 + * a ->private that also exists in this list. New requests are added 644 * to the end and may wakeup and preceding readers. 645 * New readers are added to the head. If, on read, an item is found with 646 * CACHE_UPCALLING clear, we free it from the list. ··· 963 *bpp = bp; 964 *lp = len; 965 } 966 + EXPORT_SYMBOL(qword_add); 967 968 void qword_addhex(char **bpp, int *lp, char *buf, int blen) 969 { ··· 991 *bpp = bp; 992 *lp = len; 993 } 994 + EXPORT_SYMBOL(qword_addhex); 995 996 static void warn_no_listener(struct cache_detail *detail) 997 { ··· 1113 *dest = '\0'; 1114 return len; 1115 } 1116 + EXPORT_SYMBOL(qword_get); 1117 1118 1119 /* ··· 1244 struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; 1245 char tbuf[20]; 1246 unsigned long p = *ppos; 1247 + size_t len; 1248 1249 sprintf(tbuf, "%lu\n", cd->flush_time); 1250 len = strlen(tbuf); 1251 if (p >= len) 1252 return 0; 1253 len -= p; 1254 + if (len > count) 1255 + len = count; 1256 if (copy_to_user(buf, (void*)(tbuf+p), len)) 1257 + return -EFAULT; 1258 + *ppos += len; 1259 return len; 1260 } 1261
+5 -2
net/sunrpc/stats.c
··· 33 static int rpc_proc_show(struct seq_file *seq, void *v) { 34 const struct rpc_stat *statp = seq->private; 35 const struct rpc_program *prog = statp->program; 36 - int i, j; 37 38 seq_printf(seq, 39 "net %u %u %u %u\n", ··· 81 const struct svc_program *prog = statp->program; 82 const struct svc_procedure *proc; 83 const struct svc_version *vers; 84 - int i, j; 85 86 seq_printf(seq, 87 "net %u %u %u %u\n", ··· 106 seq_putc(seq, '\n'); 107 } 108 } 109 110 /** 111 * rpc_alloc_iostats - allocate an rpc_iostats structure ··· 256 { 257 return do_register(statp->program->pg_name, statp, fops); 258 } 259 260 void 261 svc_proc_unregister(const char *name) 262 { 263 remove_proc_entry(name, proc_net_rpc); 264 } 265 266 void 267 rpc_proc_init(void)
··· 33 static int rpc_proc_show(struct seq_file *seq, void *v) { 34 const struct rpc_stat *statp = seq->private; 35 const struct rpc_program *prog = statp->program; 36 + unsigned int i, j; 37 38 seq_printf(seq, 39 "net %u %u %u %u\n", ··· 81 const struct svc_program *prog = statp->program; 82 const struct svc_procedure *proc; 83 const struct svc_version *vers; 84 + unsigned int i, j; 85 86 seq_printf(seq, 87 "net %u %u %u %u\n", ··· 106 seq_putc(seq, '\n'); 107 } 108 } 109 + EXPORT_SYMBOL(svc_seq_show); 110 111 /** 112 * rpc_alloc_iostats - allocate an rpc_iostats structure ··· 255 { 256 return do_register(statp->program->pg_name, statp, fops); 257 } 258 + EXPORT_SYMBOL(svc_proc_register); 259 260 void 261 svc_proc_unregister(const char *name) 262 { 263 remove_proc_entry(name, proc_net_rpc); 264 } 265 + EXPORT_SYMBOL(svc_proc_unregister); 266 267 void 268 rpc_proc_init(void)
+5 -47
net/sunrpc/sunrpc_syms.c
··· 22 #include <linux/sunrpc/rpc_pipe_fs.h> 23 #include <linux/sunrpc/xprtsock.h> 24 25 - /* RPC server stuff */ 26 - EXPORT_SYMBOL(svc_create); 27 - EXPORT_SYMBOL(svc_create_thread); 28 - EXPORT_SYMBOL(svc_create_pooled); 29 - EXPORT_SYMBOL(svc_set_num_threads); 30 - EXPORT_SYMBOL(svc_exit_thread); 31 - EXPORT_SYMBOL(svc_destroy); 32 - EXPORT_SYMBOL(svc_drop); 33 - EXPORT_SYMBOL(svc_process); 34 - EXPORT_SYMBOL(svc_recv); 35 - EXPORT_SYMBOL(svc_wake_up); 36 - EXPORT_SYMBOL(svc_makesock); 37 - EXPORT_SYMBOL(svc_reserve); 38 - EXPORT_SYMBOL(svc_auth_register); 39 - EXPORT_SYMBOL(auth_domain_lookup); 40 - EXPORT_SYMBOL(svc_authenticate); 41 - EXPORT_SYMBOL(svc_set_client); 42 - 43 - /* RPC statistics */ 44 - #ifdef CONFIG_PROC_FS 45 - EXPORT_SYMBOL(svc_proc_register); 46 - EXPORT_SYMBOL(svc_proc_unregister); 47 - EXPORT_SYMBOL(svc_seq_show); 48 - #endif 49 - 50 - /* caching... */ 51 - EXPORT_SYMBOL(auth_domain_find); 52 - EXPORT_SYMBOL(auth_domain_put); 53 - EXPORT_SYMBOL(auth_unix_add_addr); 54 - EXPORT_SYMBOL(auth_unix_forget_old); 55 - EXPORT_SYMBOL(auth_unix_lookup); 56 - EXPORT_SYMBOL(cache_check); 57 - EXPORT_SYMBOL(cache_flush); 58 - EXPORT_SYMBOL(cache_purge); 59 - EXPORT_SYMBOL(cache_register); 60 - EXPORT_SYMBOL(cache_unregister); 61 - EXPORT_SYMBOL(qword_add); 62 - EXPORT_SYMBOL(qword_addhex); 63 - EXPORT_SYMBOL(qword_get); 64 - EXPORT_SYMBOL(svcauth_unix_purge); 65 - EXPORT_SYMBOL(unix_domain_find); 66 - 67 extern struct cache_detail ip_map_cache, unix_gid_cache; 68 69 static int __init ··· 43 #endif 44 cache_register(&ip_map_cache); 45 cache_register(&unix_gid_cache); 46 - init_socket_xprt(); 47 rpcauth_init_module(); 48 out: 49 return err; ··· 55 { 56 rpcauth_remove_module(); 57 cleanup_socket_xprt(); 58 unregister_rpc_pipefs(); 59 rpc_destroy_mempool(); 60 - if (cache_unregister(&ip_map_cache)) 61 - printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); 62 - if (cache_unregister(&unix_gid_cache)) 63 - printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n"); 64 #ifdef RPC_DEBUG 65 rpc_unregister_sysctl(); 66 #endif
··· 22 #include <linux/sunrpc/rpc_pipe_fs.h> 23 #include <linux/sunrpc/xprtsock.h> 24 25 extern struct cache_detail ip_map_cache, unix_gid_cache; 26 27 static int __init ··· 85 #endif 86 cache_register(&ip_map_cache); 87 cache_register(&unix_gid_cache); 88 + svc_init_xprt_sock(); /* svc sock transport */ 89 + init_socket_xprt(); /* clnt sock transport */ 90 rpcauth_init_module(); 91 out: 92 return err; ··· 96 { 97 rpcauth_remove_module(); 98 cleanup_socket_xprt(); 99 + svc_cleanup_xprt_sock(); 100 unregister_rpc_pipefs(); 101 rpc_destroy_mempool(); 102 + cache_unregister(&ip_map_cache); 103 + cache_unregister(&unix_gid_cache); 104 #ifdef RPC_DEBUG 105 rpc_unregister_sysctl(); 106 #endif
+58 -32
net/sunrpc/svc.c
··· 364 void (*shutdown)(struct svc_serv *serv)) 365 { 366 struct svc_serv *serv; 367 - int vers; 368 unsigned int xdrsize; 369 unsigned int i; 370 ··· 433 { 434 return __svc_create(prog, bufsize, /*npools*/1, shutdown); 435 } 436 437 struct svc_serv * 438 svc_create_pooled(struct svc_program *prog, unsigned int bufsize, ··· 453 454 return serv; 455 } 456 457 /* 458 * Destroy an RPC service. Should be called with the BKL held ··· 461 void 462 svc_destroy(struct svc_serv *serv) 463 { 464 - struct svc_sock *svsk; 465 - struct svc_sock *tmp; 466 - 467 dprintk("svc: svc_destroy(%s, %d)\n", 468 serv->sv_program->pg_name, 469 serv->sv_nrthreads); ··· 475 476 del_timer_sync(&serv->sv_temptimer); 477 478 - list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) 479 - svc_force_close_socket(svsk); 480 481 if (serv->sv_shutdown) 482 serv->sv_shutdown(serv); 483 484 - list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) 485 - svc_force_close_socket(svsk); 486 487 BUG_ON(!list_empty(&serv->sv_permsocks)); 488 BUG_ON(!list_empty(&serv->sv_tempsocks)); ··· 495 kfree(serv->sv_pools); 496 kfree(serv); 497 } 498 499 /* 500 * Allocate an RPC server's buffer space. ··· 534 put_page(rqstp->rq_pages[i]); 535 } 536 537 /* 538 * Create a thread in the given pool. Caller must hold BKL. 539 * On a NUMA or SMP machine, with a multi-pool serv, the thread ··· 586 int have_oldmask = 0; 587 cpumask_t oldmask; 588 589 - rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); 590 - if (!rqstp) 591 goto out; 592 - 593 - init_waitqueue_head(&rqstp->rq_wait); 594 - 595 - if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)) 596 - || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)) 597 - || !svc_init_buffer(rqstp, serv->sv_max_mesg)) 598 - goto out_thread; 599 - 600 - serv->sv_nrthreads++; 601 - spin_lock_bh(&pool->sp_lock); 602 - pool->sp_nrthreads++; 603 - list_add(&rqstp->rq_all, &pool->sp_all_threads); 604 - spin_unlock_bh(&pool->sp_lock); 605 - rqstp->rq_server = serv; 606 - rqstp->rq_pool = pool; 607 608 if (serv->sv_nrpools > 1) 609 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); ··· 620 { 621 return __svc_create_thread(func, serv, &serv->sv_pools[0]); 622 } 623 624 /* 625 * Choose a pool in which to create a new thread, for svc_set_num_threads ··· 724 725 return error; 726 } 727 728 /* 729 * Called from a server thread as it's exiting. Caller must hold BKL. ··· 751 if (serv) 752 svc_destroy(serv); 753 } 754 755 /* 756 * Register an RPC service with the local portmapper. ··· 763 { 764 struct svc_program *progp; 765 unsigned long flags; 766 - int i, error = 0, dummy; 767 768 if (!port) 769 clear_thread_flag(TIF_SIGPENDING); ··· 867 rqstp->rq_res.tail[0].iov_len = 0; 868 /* Will be turned off only in gss privacy case: */ 869 rqstp->rq_splice_ok = 1; 870 - /* tcp needs a space for the record length... */ 871 - if (rqstp->rq_prot == IPPROTO_TCP) 872 - svc_putnl(resv, 0); 873 874 rqstp->rq_xid = svc_getu32(argv); 875 svc_putu32(resv, rqstp->rq_xid); ··· 1076 svc_putnl(resv, ntohl(rpc_stat)); 1077 goto sendit; 1078 } 1079 1080 /* 1081 * Return (transport-specific) limit on the rpc payload. 1082 */ 1083 u32 svc_max_payload(const struct svc_rqst *rqstp) 1084 { 1085 - int max = RPCSVC_MAXPAYLOAD_TCP; 1086 1087 - if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM) 1088 - max = RPCSVC_MAXPAYLOAD_UDP; 1089 if (rqstp->rq_server->sv_max_payload < max) 1090 max = rqstp->rq_server->sv_max_payload; 1091 return max;
··· 364 void (*shutdown)(struct svc_serv *serv)) 365 { 366 struct svc_serv *serv; 367 + unsigned int vers; 368 unsigned int xdrsize; 369 unsigned int i; 370 ··· 433 { 434 return __svc_create(prog, bufsize, /*npools*/1, shutdown); 435 } 436 + EXPORT_SYMBOL(svc_create); 437 438 struct svc_serv * 439 svc_create_pooled(struct svc_program *prog, unsigned int bufsize, ··· 452 453 return serv; 454 } 455 + EXPORT_SYMBOL(svc_create_pooled); 456 457 /* 458 * Destroy an RPC service. Should be called with the BKL held ··· 459 void 460 svc_destroy(struct svc_serv *serv) 461 { 462 dprintk("svc: svc_destroy(%s, %d)\n", 463 serv->sv_program->pg_name, 464 serv->sv_nrthreads); ··· 476 477 del_timer_sync(&serv->sv_temptimer); 478 479 + svc_close_all(&serv->sv_tempsocks); 480 481 if (serv->sv_shutdown) 482 serv->sv_shutdown(serv); 483 484 + svc_close_all(&serv->sv_permsocks); 485 486 BUG_ON(!list_empty(&serv->sv_permsocks)); 487 BUG_ON(!list_empty(&serv->sv_tempsocks)); ··· 498 kfree(serv->sv_pools); 499 kfree(serv); 500 } 501 + EXPORT_SYMBOL(svc_destroy); 502 503 /* 504 * Allocate an RPC server's buffer space. ··· 536 put_page(rqstp->rq_pages[i]); 537 } 538 539 + struct svc_rqst * 540 + svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool) 541 + { 542 + struct svc_rqst *rqstp; 543 + 544 + rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); 545 + if (!rqstp) 546 + goto out_enomem; 547 + 548 + init_waitqueue_head(&rqstp->rq_wait); 549 + 550 + serv->sv_nrthreads++; 551 + spin_lock_bh(&pool->sp_lock); 552 + pool->sp_nrthreads++; 553 + list_add(&rqstp->rq_all, &pool->sp_all_threads); 554 + spin_unlock_bh(&pool->sp_lock); 555 + rqstp->rq_server = serv; 556 + rqstp->rq_pool = pool; 557 + 558 + rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); 559 + if (!rqstp->rq_argp) 560 + goto out_thread; 561 + 562 + rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); 563 + if (!rqstp->rq_resp) 564 + goto out_thread; 565 + 566 + if (!svc_init_buffer(rqstp, serv->sv_max_mesg)) 567 + goto out_thread; 568 + 569 + return rqstp; 570 + out_thread: 571 + svc_exit_thread(rqstp); 572 + out_enomem: 573 + return ERR_PTR(-ENOMEM); 574 + } 575 + EXPORT_SYMBOL(svc_prepare_thread); 576 + 577 /* 578 * Create a thread in the given pool. Caller must hold BKL. 579 * On a NUMA or SMP machine, with a multi-pool serv, the thread ··· 550 int have_oldmask = 0; 551 cpumask_t oldmask; 552 553 + rqstp = svc_prepare_thread(serv, pool); 554 + if (IS_ERR(rqstp)) { 555 + error = PTR_ERR(rqstp); 556 goto out; 557 + } 558 559 if (serv->sv_nrpools > 1) 560 have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); ··· 597 { 598 return __svc_create_thread(func, serv, &serv->sv_pools[0]); 599 } 600 + EXPORT_SYMBOL(svc_create_thread); 601 602 /* 603 * Choose a pool in which to create a new thread, for svc_set_num_threads ··· 700 701 return error; 702 } 703 + EXPORT_SYMBOL(svc_set_num_threads); 704 705 /* 706 * Called from a server thread as it's exiting. Caller must hold BKL. ··· 726 if (serv) 727 svc_destroy(serv); 728 } 729 + EXPORT_SYMBOL(svc_exit_thread); 730 731 /* 732 * Register an RPC service with the local portmapper. 
··· 737 { 738 struct svc_program *progp; 739 unsigned long flags; 740 + unsigned int i; 741 + int error = 0, dummy; 742 743 if (!port) 744 clear_thread_flag(TIF_SIGPENDING); ··· 840 rqstp->rq_res.tail[0].iov_len = 0; 841 /* Will be turned off only in gss privacy case: */ 842 rqstp->rq_splice_ok = 1; 843 + 844 + /* Setup reply header */ 845 + rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); 846 847 rqstp->rq_xid = svc_getu32(argv); 848 svc_putu32(resv, rqstp->rq_xid); ··· 1049 svc_putnl(resv, ntohl(rpc_stat)); 1050 goto sendit; 1051 } 1052 + EXPORT_SYMBOL(svc_process); 1053 1054 /* 1055 * Return (transport-specific) limit on the rpc payload. 1056 */ 1057 u32 svc_max_payload(const struct svc_rqst *rqstp) 1058 { 1059 + u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload; 1060 1061 if (rqstp->rq_server->sv_max_payload < max) 1062 max = rqstp->rq_server->sv_max_payload; 1063 return max;
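svc.c above replaces the inline allocation in __svc_create_thread() with svc_prepare_thread(), which returns ERR_PTR(-ENOMEM) rather than NULL on failure, and svc_destroy() now tears endpoints down through the transport-neutral svc_close_all(). A short sketch of the new error convention, assuming serv came from svc_create_pooled() and that a real caller would spawn the kernel thread itself:

#include <linux/err.h>
#include <linux/sunrpc/svc.h>

/* Sketch only: set up per-thread state for one extra server thread. */
static int example_add_thread(struct svc_serv *serv)
{
	struct svc_rqst *rqstp;

	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
	if (IS_ERR(rqstp))
		return PTR_ERR(rqstp);	/* -ENOMEM if rq_argp, rq_resp or the page buffer allocation fails */

	/*
	 * A real caller would start a thread here that loops over svc_recv()
	 * and svc_process(); svc_exit_thread() undoes the bookkeeping when it stops.
	 */
	svc_exit_thread(rqstp);
	return 0;
}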
+1055
net/sunrpc/svc_xprt.c
···
··· 1 + /* 2 + * linux/net/sunrpc/svc_xprt.c 3 + * 4 + * Author: Tom Tucker <tom@opengridcomputing.com> 5 + */ 6 + 7 + #include <linux/sched.h> 8 + #include <linux/errno.h> 9 + #include <linux/fcntl.h> 10 + #include <linux/net.h> 11 + #include <linux/in.h> 12 + #include <linux/inet.h> 13 + #include <linux/udp.h> 14 + #include <linux/tcp.h> 15 + #include <linux/unistd.h> 16 + #include <linux/slab.h> 17 + #include <linux/netdevice.h> 18 + #include <linux/skbuff.h> 19 + #include <linux/file.h> 20 + #include <linux/freezer.h> 21 + #include <net/sock.h> 22 + #include <net/checksum.h> 23 + #include <net/ip.h> 24 + #include <net/ipv6.h> 25 + #include <net/tcp_states.h> 26 + #include <linux/uaccess.h> 27 + #include <asm/ioctls.h> 28 + 29 + #include <linux/sunrpc/types.h> 30 + #include <linux/sunrpc/clnt.h> 31 + #include <linux/sunrpc/xdr.h> 32 + #include <linux/sunrpc/stats.h> 33 + #include <linux/sunrpc/svc_xprt.h> 34 + 35 + #define RPCDBG_FACILITY RPCDBG_SVCXPRT 36 + 37 + static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); 38 + static int svc_deferred_recv(struct svc_rqst *rqstp); 39 + static struct cache_deferred_req *svc_defer(struct cache_req *req); 40 + static void svc_age_temp_xprts(unsigned long closure); 41 + 42 + /* apparently the "standard" is that clients close 43 + * idle connections after 5 minutes, servers after 44 + * 6 minutes 45 + * http://www.connectathon.org/talks96/nfstcp.pdf 46 + */ 47 + static int svc_conn_age_period = 6*60; 48 + 49 + /* List of registered transport classes */ 50 + static DEFINE_SPINLOCK(svc_xprt_class_lock); 51 + static LIST_HEAD(svc_xprt_class_list); 52 + 53 + /* SMP locking strategy: 54 + * 55 + * svc_pool->sp_lock protects most of the fields of that pool. 56 + * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. 57 + * when both need to be taken (rare), svc_serv->sv_lock is first. 58 + * BKL protects svc_serv->sv_nrthread. 59 + * svc_sock->sk_lock protects the svc_sock->sk_deferred list 60 + * and the ->sk_info_authunix cache. 61 + * 62 + * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being 63 + * enqueued multiply. During normal transport processing this bit 64 + * is set by svc_xprt_enqueue and cleared by svc_xprt_received. 65 + * Providers should not manipulate this bit directly. 66 + * 67 + * Some flags can be set to certain values at any time 68 + * providing that certain rules are followed: 69 + * 70 + * XPT_CONN, XPT_DATA: 71 + * - Can be set or cleared at any time. 72 + * - After a set, svc_xprt_enqueue must be called to enqueue 73 + * the transport for processing. 74 + * - After a clear, the transport must be read/accepted. 75 + * If this succeeds, it must be set again. 76 + * XPT_CLOSE: 77 + * - Can set at any time. It is never cleared. 78 + * XPT_DEAD: 79 + * - Can only be set while XPT_BUSY is held which ensures 80 + * that no other thread will be using the transport or will 81 + * try to set XPT_DEAD. 
82 + */ 83 + 84 + int svc_reg_xprt_class(struct svc_xprt_class *xcl) 85 + { 86 + struct svc_xprt_class *cl; 87 + int res = -EEXIST; 88 + 89 + dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name); 90 + 91 + INIT_LIST_HEAD(&xcl->xcl_list); 92 + spin_lock(&svc_xprt_class_lock); 93 + /* Make sure there isn't already a class with the same name */ 94 + list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { 95 + if (strcmp(xcl->xcl_name, cl->xcl_name) == 0) 96 + goto out; 97 + } 98 + list_add_tail(&xcl->xcl_list, &svc_xprt_class_list); 99 + res = 0; 100 + out: 101 + spin_unlock(&svc_xprt_class_lock); 102 + return res; 103 + } 104 + EXPORT_SYMBOL_GPL(svc_reg_xprt_class); 105 + 106 + void svc_unreg_xprt_class(struct svc_xprt_class *xcl) 107 + { 108 + dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); 109 + spin_lock(&svc_xprt_class_lock); 110 + list_del_init(&xcl->xcl_list); 111 + spin_unlock(&svc_xprt_class_lock); 112 + } 113 + EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); 114 + 115 + /* 116 + * Format the transport list for printing 117 + */ 118 + int svc_print_xprts(char *buf, int maxlen) 119 + { 120 + struct list_head *le; 121 + char tmpstr[80]; 122 + int len = 0; 123 + buf[0] = '\0'; 124 + 125 + spin_lock(&svc_xprt_class_lock); 126 + list_for_each(le, &svc_xprt_class_list) { 127 + int slen; 128 + struct svc_xprt_class *xcl = 129 + list_entry(le, struct svc_xprt_class, xcl_list); 130 + 131 + sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); 132 + slen = strlen(tmpstr); 133 + if (len + slen > maxlen) 134 + break; 135 + len += slen; 136 + strcat(buf, tmpstr); 137 + } 138 + spin_unlock(&svc_xprt_class_lock); 139 + 140 + return len; 141 + } 142 + 143 + static void svc_xprt_free(struct kref *kref) 144 + { 145 + struct svc_xprt *xprt = 146 + container_of(kref, struct svc_xprt, xpt_ref); 147 + struct module *owner = xprt->xpt_class->xcl_owner; 148 + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags) 149 + && xprt->xpt_auth_cache != NULL) 150 + svcauth_unix_info_release(xprt->xpt_auth_cache); 151 + xprt->xpt_ops->xpo_free(xprt); 152 + module_put(owner); 153 + } 154 + 155 + void svc_xprt_put(struct svc_xprt *xprt) 156 + { 157 + kref_put(&xprt->xpt_ref, svc_xprt_free); 158 + } 159 + EXPORT_SYMBOL_GPL(svc_xprt_put); 160 + 161 + /* 162 + * Called by transport drivers to initialize the transport independent 163 + * portion of the transport instance. 
164 + */ 165 + void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, 166 + struct svc_serv *serv) 167 + { 168 + memset(xprt, 0, sizeof(*xprt)); 169 + xprt->xpt_class = xcl; 170 + xprt->xpt_ops = xcl->xcl_ops; 171 + kref_init(&xprt->xpt_ref); 172 + xprt->xpt_server = serv; 173 + INIT_LIST_HEAD(&xprt->xpt_list); 174 + INIT_LIST_HEAD(&xprt->xpt_ready); 175 + INIT_LIST_HEAD(&xprt->xpt_deferred); 176 + mutex_init(&xprt->xpt_mutex); 177 + spin_lock_init(&xprt->xpt_lock); 178 + set_bit(XPT_BUSY, &xprt->xpt_flags); 179 + } 180 + EXPORT_SYMBOL_GPL(svc_xprt_init); 181 + 182 + int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, 183 + int flags) 184 + { 185 + struct svc_xprt_class *xcl; 186 + struct sockaddr_in sin = { 187 + .sin_family = AF_INET, 188 + .sin_addr.s_addr = INADDR_ANY, 189 + .sin_port = htons(port), 190 + }; 191 + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); 192 + spin_lock(&svc_xprt_class_lock); 193 + list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { 194 + struct svc_xprt *newxprt; 195 + 196 + if (strcmp(xprt_name, xcl->xcl_name)) 197 + continue; 198 + 199 + if (!try_module_get(xcl->xcl_owner)) 200 + goto err; 201 + 202 + spin_unlock(&svc_xprt_class_lock); 203 + newxprt = xcl->xcl_ops-> 204 + xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin), 205 + flags); 206 + if (IS_ERR(newxprt)) { 207 + module_put(xcl->xcl_owner); 208 + return PTR_ERR(newxprt); 209 + } 210 + 211 + clear_bit(XPT_TEMP, &newxprt->xpt_flags); 212 + spin_lock_bh(&serv->sv_lock); 213 + list_add(&newxprt->xpt_list, &serv->sv_permsocks); 214 + spin_unlock_bh(&serv->sv_lock); 215 + clear_bit(XPT_BUSY, &newxprt->xpt_flags); 216 + return svc_xprt_local_port(newxprt); 217 + } 218 + err: 219 + spin_unlock(&svc_xprt_class_lock); 220 + dprintk("svc: transport %s not found\n", xprt_name); 221 + return -ENOENT; 222 + } 223 + EXPORT_SYMBOL_GPL(svc_create_xprt); 224 + 225 + /* 226 + * Copy the local and remote xprt addresses to the rqstp structure 227 + */ 228 + void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt) 229 + { 230 + struct sockaddr *sin; 231 + 232 + memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen); 233 + rqstp->rq_addrlen = xprt->xpt_remotelen; 234 + 235 + /* 236 + * Destination address in request is needed for binding the 237 + * source address in RPC replies/callbacks later. 238 + */ 239 + sin = (struct sockaddr *)&xprt->xpt_local; 240 + switch (sin->sa_family) { 241 + case AF_INET: 242 + rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; 243 + break; 244 + case AF_INET6: 245 + rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; 246 + break; 247 + } 248 + } 249 + EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs); 250 + 251 + /** 252 + * svc_print_addr - Format rq_addr field for printing 253 + * @rqstp: svc_rqst struct containing address to print 254 + * @buf: target buffer for formatted address 255 + * @len: length of target buffer 256 + * 257 + */ 258 + char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) 259 + { 260 + return __svc_print_addr(svc_addr(rqstp), buf, len); 261 + } 262 + EXPORT_SYMBOL_GPL(svc_print_addr); 263 + 264 + /* 265 + * Queue up an idle server thread. Must have pool->sp_lock held. 266 + * Note: this is really a stack rather than a queue, so that we only 267 + * use as many different threads as we need, and the rest don't pollute 268 + * the cache. 
269 + */ 270 + static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) 271 + { 272 + list_add(&rqstp->rq_list, &pool->sp_threads); 273 + } 274 + 275 + /* 276 + * Dequeue an nfsd thread. Must have pool->sp_lock held. 277 + */ 278 + static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) 279 + { 280 + list_del(&rqstp->rq_list); 281 + } 282 + 283 + /* 284 + * Queue up a transport with data pending. If there are idle nfsd 285 + * processes, wake 'em up. 286 + * 287 + */ 288 + void svc_xprt_enqueue(struct svc_xprt *xprt) 289 + { 290 + struct svc_serv *serv = xprt->xpt_server; 291 + struct svc_pool *pool; 292 + struct svc_rqst *rqstp; 293 + int cpu; 294 + 295 + if (!(xprt->xpt_flags & 296 + ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED)))) 297 + return; 298 + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) 299 + return; 300 + 301 + cpu = get_cpu(); 302 + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); 303 + put_cpu(); 304 + 305 + spin_lock_bh(&pool->sp_lock); 306 + 307 + if (!list_empty(&pool->sp_threads) && 308 + !list_empty(&pool->sp_sockets)) 309 + printk(KERN_ERR 310 + "svc_xprt_enqueue: " 311 + "threads and transports both waiting??\n"); 312 + 313 + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { 314 + /* Don't enqueue dead transports */ 315 + dprintk("svc: transport %p is dead, not enqueued\n", xprt); 316 + goto out_unlock; 317 + } 318 + 319 + /* Mark transport as busy. It will remain in this state until 320 + * the provider calls svc_xprt_received. We update XPT_BUSY 321 + * atomically because it also guards against trying to enqueue 322 + * the transport twice. 323 + */ 324 + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { 325 + /* Don't enqueue transport while already enqueued */ 326 + dprintk("svc: transport %p busy, not enqueued\n", xprt); 327 + goto out_unlock; 328 + } 329 + BUG_ON(xprt->xpt_pool != NULL); 330 + xprt->xpt_pool = pool; 331 + 332 + /* Handle pending connection */ 333 + if (test_bit(XPT_CONN, &xprt->xpt_flags)) 334 + goto process; 335 + 336 + /* Handle close in-progress */ 337 + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) 338 + goto process; 339 + 340 + /* Check if we have space to reply to a request */ 341 + if (!xprt->xpt_ops->xpo_has_wspace(xprt)) { 342 + /* Don't enqueue while not enough space for reply */ 343 + dprintk("svc: no write space, transport %p not enqueued\n", 344 + xprt); 345 + xprt->xpt_pool = NULL; 346 + clear_bit(XPT_BUSY, &xprt->xpt_flags); 347 + goto out_unlock; 348 + } 349 + 350 + process: 351 + if (!list_empty(&pool->sp_threads)) { 352 + rqstp = list_entry(pool->sp_threads.next, 353 + struct svc_rqst, 354 + rq_list); 355 + dprintk("svc: transport %p served by daemon %p\n", 356 + xprt, rqstp); 357 + svc_thread_dequeue(pool, rqstp); 358 + if (rqstp->rq_xprt) 359 + printk(KERN_ERR 360 + "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", 361 + rqstp, rqstp->rq_xprt); 362 + rqstp->rq_xprt = xprt; 363 + svc_xprt_get(xprt); 364 + rqstp->rq_reserved = serv->sv_max_mesg; 365 + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); 366 + BUG_ON(xprt->xpt_pool != pool); 367 + wake_up(&rqstp->rq_wait); 368 + } else { 369 + dprintk("svc: transport %p put into queue\n", xprt); 370 + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); 371 + BUG_ON(xprt->xpt_pool != pool); 372 + } 373 + 374 + out_unlock: 375 + spin_unlock_bh(&pool->sp_lock); 376 + } 377 + EXPORT_SYMBOL_GPL(svc_xprt_enqueue); 378 + 379 + /* 380 + * Dequeue the first transport. Must be called with the pool->sp_lock held. 
381 + */ 382 + static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) 383 + { 384 + struct svc_xprt *xprt; 385 + 386 + if (list_empty(&pool->sp_sockets)) 387 + return NULL; 388 + 389 + xprt = list_entry(pool->sp_sockets.next, 390 + struct svc_xprt, xpt_ready); 391 + list_del_init(&xprt->xpt_ready); 392 + 393 + dprintk("svc: transport %p dequeued, inuse=%d\n", 394 + xprt, atomic_read(&xprt->xpt_ref.refcount)); 395 + 396 + return xprt; 397 + } 398 + 399 + /* 400 + * svc_xprt_received conditionally queues the transport for processing 401 + * by another thread. The caller must hold the XPT_BUSY bit and must 402 + * not thereafter touch transport data. 403 + * 404 + * Note: XPT_DATA only gets cleared when a read-attempt finds no (or 405 + * insufficient) data. 406 + */ 407 + void svc_xprt_received(struct svc_xprt *xprt) 408 + { 409 + BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); 410 + xprt->xpt_pool = NULL; 411 + clear_bit(XPT_BUSY, &xprt->xpt_flags); 412 + svc_xprt_enqueue(xprt); 413 + } 414 + EXPORT_SYMBOL_GPL(svc_xprt_received); 415 + 416 + /** 417 + * svc_reserve - change the space reserved for the reply to a request. 418 + * @rqstp: The request in question 419 + * @space: new max space to reserve 420 + * 421 + * Each request reserves some space on the output queue of the transport 422 + * to make sure the reply fits. This function reduces that reserved 423 + * space to be the amount of space used already, plus @space. 424 + * 425 + */ 426 + void svc_reserve(struct svc_rqst *rqstp, int space) 427 + { 428 + space += rqstp->rq_res.head[0].iov_len; 429 + 430 + if (space < rqstp->rq_reserved) { 431 + struct svc_xprt *xprt = rqstp->rq_xprt; 432 + atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); 433 + rqstp->rq_reserved = space; 434 + 435 + svc_xprt_enqueue(xprt); 436 + } 437 + } 438 + EXPORT_SYMBOL(svc_reserve); 439 + 440 + static void svc_xprt_release(struct svc_rqst *rqstp) 441 + { 442 + struct svc_xprt *xprt = rqstp->rq_xprt; 443 + 444 + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); 445 + 446 + svc_free_res_pages(rqstp); 447 + rqstp->rq_res.page_len = 0; 448 + rqstp->rq_res.page_base = 0; 449 + 450 + /* Reset response buffer and release 451 + * the reservation. 452 + * But first, check that enough space was reserved 453 + * for the reply, otherwise we have a bug! 454 + */ 455 + if ((rqstp->rq_res.len) > rqstp->rq_reserved) 456 + printk(KERN_ERR "RPC request reserved %d but used %d\n", 457 + rqstp->rq_reserved, 458 + rqstp->rq_res.len); 459 + 460 + rqstp->rq_res.head[0].iov_len = 0; 461 + svc_reserve(rqstp, 0); 462 + rqstp->rq_xprt = NULL; 463 + 464 + svc_xprt_put(xprt); 465 + } 466 + 467 + /* 468 + * External function to wake up a server waiting for data 469 + * This really only makes sense for services like lockd 470 + * which have exactly one thread anyway. 
471 + */ 472 + void svc_wake_up(struct svc_serv *serv) 473 + { 474 + struct svc_rqst *rqstp; 475 + unsigned int i; 476 + struct svc_pool *pool; 477 + 478 + for (i = 0; i < serv->sv_nrpools; i++) { 479 + pool = &serv->sv_pools[i]; 480 + 481 + spin_lock_bh(&pool->sp_lock); 482 + if (!list_empty(&pool->sp_threads)) { 483 + rqstp = list_entry(pool->sp_threads.next, 484 + struct svc_rqst, 485 + rq_list); 486 + dprintk("svc: daemon %p woken up.\n", rqstp); 487 + /* 488 + svc_thread_dequeue(pool, rqstp); 489 + rqstp->rq_xprt = NULL; 490 + */ 491 + wake_up(&rqstp->rq_wait); 492 + } 493 + spin_unlock_bh(&pool->sp_lock); 494 + } 495 + } 496 + EXPORT_SYMBOL(svc_wake_up); 497 + 498 + int svc_port_is_privileged(struct sockaddr *sin) 499 + { 500 + switch (sin->sa_family) { 501 + case AF_INET: 502 + return ntohs(((struct sockaddr_in *)sin)->sin_port) 503 + < PROT_SOCK; 504 + case AF_INET6: 505 + return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) 506 + < PROT_SOCK; 507 + default: 508 + return 0; 509 + } 510 + } 511 + 512 + /* 513 + * Make sure that we don't have too many active connections. If we 514 + * have, something must be dropped. 515 + * 516 + * There's no point in trying to do random drop here for DoS 517 + * prevention. The NFS clients does 1 reconnect in 15 seconds. An 518 + * attacker can easily beat that. 519 + * 520 + * The only somewhat efficient mechanism would be if drop old 521 + * connections from the same IP first. But right now we don't even 522 + * record the client IP in svc_sock. 523 + */ 524 + static void svc_check_conn_limits(struct svc_serv *serv) 525 + { 526 + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { 527 + struct svc_xprt *xprt = NULL; 528 + spin_lock_bh(&serv->sv_lock); 529 + if (!list_empty(&serv->sv_tempsocks)) { 530 + if (net_ratelimit()) { 531 + /* Try to help the admin */ 532 + printk(KERN_NOTICE "%s: too many open " 533 + "connections, consider increasing the " 534 + "number of nfsd threads\n", 535 + serv->sv_name); 536 + } 537 + /* 538 + * Always select the oldest connection. It's not fair, 539 + * but so is life 540 + */ 541 + xprt = list_entry(serv->sv_tempsocks.prev, 542 + struct svc_xprt, 543 + xpt_list); 544 + set_bit(XPT_CLOSE, &xprt->xpt_flags); 545 + svc_xprt_get(xprt); 546 + } 547 + spin_unlock_bh(&serv->sv_lock); 548 + 549 + if (xprt) { 550 + svc_xprt_enqueue(xprt); 551 + svc_xprt_put(xprt); 552 + } 553 + } 554 + } 555 + 556 + /* 557 + * Receive the next request on any transport. This code is carefully 558 + * organised not to touch any cachelines in the shared svc_serv 559 + * structure, only cachelines in the local svc_pool. 560 + */ 561 + int svc_recv(struct svc_rqst *rqstp, long timeout) 562 + { 563 + struct svc_xprt *xprt = NULL; 564 + struct svc_serv *serv = rqstp->rq_server; 565 + struct svc_pool *pool = rqstp->rq_pool; 566 + int len, i; 567 + int pages; 568 + struct xdr_buf *arg; 569 + DECLARE_WAITQUEUE(wait, current); 570 + 571 + dprintk("svc: server %p waiting for data (to = %ld)\n", 572 + rqstp, timeout); 573 + 574 + if (rqstp->rq_xprt) 575 + printk(KERN_ERR 576 + "svc_recv: service %p, transport not NULL!\n", 577 + rqstp); 578 + if (waitqueue_active(&rqstp->rq_wait)) 579 + printk(KERN_ERR 580 + "svc_recv: service %p, wait queue active!\n", 581 + rqstp); 582 + 583 + /* now allocate needed pages. 
If we get a failure, sleep briefly */ 584 + pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; 585 + for (i = 0; i < pages ; i++) 586 + while (rqstp->rq_pages[i] == NULL) { 587 + struct page *p = alloc_page(GFP_KERNEL); 588 + if (!p) { 589 + int j = msecs_to_jiffies(500); 590 + schedule_timeout_uninterruptible(j); 591 + } 592 + rqstp->rq_pages[i] = p; 593 + } 594 + rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ 595 + BUG_ON(pages >= RPCSVC_MAXPAGES); 596 + 597 + /* Make arg->head point to first page and arg->pages point to rest */ 598 + arg = &rqstp->rq_arg; 599 + arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); 600 + arg->head[0].iov_len = PAGE_SIZE; 601 + arg->pages = rqstp->rq_pages + 1; 602 + arg->page_base = 0; 603 + /* save at least one page for response */ 604 + arg->page_len = (pages-2)*PAGE_SIZE; 605 + arg->len = (pages-1)*PAGE_SIZE; 606 + arg->tail[0].iov_len = 0; 607 + 608 + try_to_freeze(); 609 + cond_resched(); 610 + if (signalled()) 611 + return -EINTR; 612 + 613 + spin_lock_bh(&pool->sp_lock); 614 + xprt = svc_xprt_dequeue(pool); 615 + if (xprt) { 616 + rqstp->rq_xprt = xprt; 617 + svc_xprt_get(xprt); 618 + rqstp->rq_reserved = serv->sv_max_mesg; 619 + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); 620 + } else { 621 + /* No data pending. Go to sleep */ 622 + svc_thread_enqueue(pool, rqstp); 623 + 624 + /* 625 + * We have to be able to interrupt this wait 626 + * to bring down the daemons ... 627 + */ 628 + set_current_state(TASK_INTERRUPTIBLE); 629 + add_wait_queue(&rqstp->rq_wait, &wait); 630 + spin_unlock_bh(&pool->sp_lock); 631 + 632 + schedule_timeout(timeout); 633 + 634 + try_to_freeze(); 635 + 636 + spin_lock_bh(&pool->sp_lock); 637 + remove_wait_queue(&rqstp->rq_wait, &wait); 638 + 639 + xprt = rqstp->rq_xprt; 640 + if (!xprt) { 641 + svc_thread_dequeue(pool, rqstp); 642 + spin_unlock_bh(&pool->sp_lock); 643 + dprintk("svc: server %p, no data yet\n", rqstp); 644 + return signalled()? 
-EINTR : -EAGAIN; 645 + } 646 + } 647 + spin_unlock_bh(&pool->sp_lock); 648 + 649 + len = 0; 650 + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { 651 + dprintk("svc_recv: found XPT_CLOSE\n"); 652 + svc_delete_xprt(xprt); 653 + } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { 654 + struct svc_xprt *newxpt; 655 + newxpt = xprt->xpt_ops->xpo_accept(xprt); 656 + if (newxpt) { 657 + /* 658 + * We know this module_get will succeed because the 659 + * listener holds a reference too 660 + */ 661 + __module_get(newxpt->xpt_class->xcl_owner); 662 + svc_check_conn_limits(xprt->xpt_server); 663 + spin_lock_bh(&serv->sv_lock); 664 + set_bit(XPT_TEMP, &newxpt->xpt_flags); 665 + list_add(&newxpt->xpt_list, &serv->sv_tempsocks); 666 + serv->sv_tmpcnt++; 667 + if (serv->sv_temptimer.function == NULL) { 668 + /* setup timer to age temp transports */ 669 + setup_timer(&serv->sv_temptimer, 670 + svc_age_temp_xprts, 671 + (unsigned long)serv); 672 + mod_timer(&serv->sv_temptimer, 673 + jiffies + svc_conn_age_period * HZ); 674 + } 675 + spin_unlock_bh(&serv->sv_lock); 676 + svc_xprt_received(newxpt); 677 + } 678 + svc_xprt_received(xprt); 679 + } else { 680 + dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", 681 + rqstp, pool->sp_id, xprt, 682 + atomic_read(&xprt->xpt_ref.refcount)); 683 + rqstp->rq_deferred = svc_deferred_dequeue(xprt); 684 + if (rqstp->rq_deferred) { 685 + svc_xprt_received(xprt); 686 + len = svc_deferred_recv(rqstp); 687 + } else 688 + len = xprt->xpt_ops->xpo_recvfrom(rqstp); 689 + dprintk("svc: got len=%d\n", len); 690 + } 691 + 692 + /* No data, incomplete (TCP) read, or accept() */ 693 + if (len == 0 || len == -EAGAIN) { 694 + rqstp->rq_res.len = 0; 695 + svc_xprt_release(rqstp); 696 + return -EAGAIN; 697 + } 698 + clear_bit(XPT_OLD, &xprt->xpt_flags); 699 + 700 + rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); 701 + rqstp->rq_chandle.defer = svc_defer; 702 + 703 + if (serv->sv_stats) 704 + serv->sv_stats->netcnt++; 705 + return len; 706 + } 707 + EXPORT_SYMBOL(svc_recv); 708 + 709 + /* 710 + * Drop request 711 + */ 712 + void svc_drop(struct svc_rqst *rqstp) 713 + { 714 + dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); 715 + svc_xprt_release(rqstp); 716 + } 717 + EXPORT_SYMBOL(svc_drop); 718 + 719 + /* 720 + * Return reply to client. 721 + */ 722 + int svc_send(struct svc_rqst *rqstp) 723 + { 724 + struct svc_xprt *xprt; 725 + int len; 726 + struct xdr_buf *xb; 727 + 728 + xprt = rqstp->rq_xprt; 729 + if (!xprt) 730 + return -EFAULT; 731 + 732 + /* release the receive skb before sending the reply */ 733 + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); 734 + 735 + /* calculate over-all length */ 736 + xb = &rqstp->rq_res; 737 + xb->len = xb->head[0].iov_len + 738 + xb->page_len + 739 + xb->tail[0].iov_len; 740 + 741 + /* Grab mutex to serialize outgoing data. */ 742 + mutex_lock(&xprt->xpt_mutex); 743 + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) 744 + len = -ENOTCONN; 745 + else 746 + len = xprt->xpt_ops->xpo_sendto(rqstp); 747 + mutex_unlock(&xprt->xpt_mutex); 748 + svc_xprt_release(rqstp); 749 + 750 + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) 751 + return 0; 752 + return len; 753 + } 754 + 755 + /* 756 + * Timer function to close old temporary transports, using 757 + * a mark-and-sweep algorithm. 
758 + */ 759 + static void svc_age_temp_xprts(unsigned long closure) 760 + { 761 + struct svc_serv *serv = (struct svc_serv *)closure; 762 + struct svc_xprt *xprt; 763 + struct list_head *le, *next; 764 + LIST_HEAD(to_be_aged); 765 + 766 + dprintk("svc_age_temp_xprts\n"); 767 + 768 + if (!spin_trylock_bh(&serv->sv_lock)) { 769 + /* busy, try again 1 sec later */ 770 + dprintk("svc_age_temp_xprts: busy\n"); 771 + mod_timer(&serv->sv_temptimer, jiffies + HZ); 772 + return; 773 + } 774 + 775 + list_for_each_safe(le, next, &serv->sv_tempsocks) { 776 + xprt = list_entry(le, struct svc_xprt, xpt_list); 777 + 778 + /* First time through, just mark it OLD. Second time 779 + * through, close it. */ 780 + if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) 781 + continue; 782 + if (atomic_read(&xprt->xpt_ref.refcount) > 1 783 + || test_bit(XPT_BUSY, &xprt->xpt_flags)) 784 + continue; 785 + svc_xprt_get(xprt); 786 + list_move(le, &to_be_aged); 787 + set_bit(XPT_CLOSE, &xprt->xpt_flags); 788 + set_bit(XPT_DETACHED, &xprt->xpt_flags); 789 + } 790 + spin_unlock_bh(&serv->sv_lock); 791 + 792 + while (!list_empty(&to_be_aged)) { 793 + le = to_be_aged.next; 794 + /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ 795 + list_del_init(le); 796 + xprt = list_entry(le, struct svc_xprt, xpt_list); 797 + 798 + dprintk("queuing xprt %p for closing\n", xprt); 799 + 800 + /* a thread will dequeue and close it soon */ 801 + svc_xprt_enqueue(xprt); 802 + svc_xprt_put(xprt); 803 + } 804 + 805 + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); 806 + } 807 + 808 + /* 809 + * Remove a dead transport 810 + */ 811 + void svc_delete_xprt(struct svc_xprt *xprt) 812 + { 813 + struct svc_serv *serv = xprt->xpt_server; 814 + 815 + dprintk("svc: svc_delete_xprt(%p)\n", xprt); 816 + xprt->xpt_ops->xpo_detach(xprt); 817 + 818 + spin_lock_bh(&serv->sv_lock); 819 + if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) 820 + list_del_init(&xprt->xpt_list); 821 + /* 822 + * We used to delete the transport from whichever list 823 + * it's sk_xprt.xpt_ready node was on, but we don't actually 824 + * need to. This is because the only time we're called 825 + * while still attached to a queue, the queue itself 826 + * is about to be destroyed (in svc_destroy). 
827 + */ 828 + if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) { 829 + BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2); 830 + if (test_bit(XPT_TEMP, &xprt->xpt_flags)) 831 + serv->sv_tmpcnt--; 832 + svc_xprt_put(xprt); 833 + } 834 + spin_unlock_bh(&serv->sv_lock); 835 + } 836 + 837 + void svc_close_xprt(struct svc_xprt *xprt) 838 + { 839 + set_bit(XPT_CLOSE, &xprt->xpt_flags); 840 + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) 841 + /* someone else will have to effect the close */ 842 + return; 843 + 844 + svc_xprt_get(xprt); 845 + svc_delete_xprt(xprt); 846 + clear_bit(XPT_BUSY, &xprt->xpt_flags); 847 + svc_xprt_put(xprt); 848 + } 849 + EXPORT_SYMBOL_GPL(svc_close_xprt); 850 + 851 + void svc_close_all(struct list_head *xprt_list) 852 + { 853 + struct svc_xprt *xprt; 854 + struct svc_xprt *tmp; 855 + 856 + list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { 857 + set_bit(XPT_CLOSE, &xprt->xpt_flags); 858 + if (test_bit(XPT_BUSY, &xprt->xpt_flags)) { 859 + /* Waiting to be processed, but no threads left, 860 + * So just remove it from the waiting list 861 + */ 862 + list_del_init(&xprt->xpt_ready); 863 + clear_bit(XPT_BUSY, &xprt->xpt_flags); 864 + } 865 + svc_close_xprt(xprt); 866 + } 867 + } 868 + 869 + /* 870 + * Handle defer and revisit of requests 871 + */ 872 + 873 + static void svc_revisit(struct cache_deferred_req *dreq, int too_many) 874 + { 875 + struct svc_deferred_req *dr = 876 + container_of(dreq, struct svc_deferred_req, handle); 877 + struct svc_xprt *xprt = dr->xprt; 878 + 879 + if (too_many) { 880 + svc_xprt_put(xprt); 881 + kfree(dr); 882 + return; 883 + } 884 + dprintk("revisit queued\n"); 885 + dr->xprt = NULL; 886 + spin_lock(&xprt->xpt_lock); 887 + list_add(&dr->handle.recent, &xprt->xpt_deferred); 888 + spin_unlock(&xprt->xpt_lock); 889 + set_bit(XPT_DEFERRED, &xprt->xpt_flags); 890 + svc_xprt_enqueue(xprt); 891 + svc_xprt_put(xprt); 892 + } 893 + 894 + /* 895 + * Save the request off for later processing. The request buffer looks 896 + * like this: 897 + * 898 + * <xprt-header><rpc-header><rpc-pagelist><rpc-tail> 899 + * 900 + * This code can only handle requests that consist of an xprt-header 901 + * and rpc-header. 
902 + */ 903 + static struct cache_deferred_req *svc_defer(struct cache_req *req) 904 + { 905 + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); 906 + struct svc_deferred_req *dr; 907 + 908 + if (rqstp->rq_arg.page_len) 909 + return NULL; /* if more than a page, give up FIXME */ 910 + if (rqstp->rq_deferred) { 911 + dr = rqstp->rq_deferred; 912 + rqstp->rq_deferred = NULL; 913 + } else { 914 + size_t skip; 915 + size_t size; 916 + /* FIXME maybe discard if size too large */ 917 + size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len; 918 + dr = kmalloc(size, GFP_KERNEL); 919 + if (dr == NULL) 920 + return NULL; 921 + 922 + dr->handle.owner = rqstp->rq_server; 923 + dr->prot = rqstp->rq_prot; 924 + memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); 925 + dr->addrlen = rqstp->rq_addrlen; 926 + dr->daddr = rqstp->rq_daddr; 927 + dr->argslen = rqstp->rq_arg.len >> 2; 928 + dr->xprt_hlen = rqstp->rq_xprt_hlen; 929 + 930 + /* back up head to the start of the buffer and copy */ 931 + skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; 932 + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, 933 + dr->argslen << 2); 934 + } 935 + svc_xprt_get(rqstp->rq_xprt); 936 + dr->xprt = rqstp->rq_xprt; 937 + 938 + dr->handle.revisit = svc_revisit; 939 + return &dr->handle; 940 + } 941 + 942 + /* 943 + * recv data from a deferred request into an active one 944 + */ 945 + static int svc_deferred_recv(struct svc_rqst *rqstp) 946 + { 947 + struct svc_deferred_req *dr = rqstp->rq_deferred; 948 + 949 + /* setup iov_base past transport header */ 950 + rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2); 951 + /* The iov_len does not include the transport header bytes */ 952 + rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen; 953 + rqstp->rq_arg.page_len = 0; 954 + /* The rq_arg.len includes the transport header bytes */ 955 + rqstp->rq_arg.len = dr->argslen<<2; 956 + rqstp->rq_prot = dr->prot; 957 + memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); 958 + rqstp->rq_addrlen = dr->addrlen; 959 + /* Save off transport header len in case we get deferred again */ 960 + rqstp->rq_xprt_hlen = dr->xprt_hlen; 961 + rqstp->rq_daddr = dr->daddr; 962 + rqstp->rq_respages = rqstp->rq_pages; 963 + return (dr->argslen<<2) - dr->xprt_hlen; 964 + } 965 + 966 + 967 + static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) 968 + { 969 + struct svc_deferred_req *dr = NULL; 970 + 971 + if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags)) 972 + return NULL; 973 + spin_lock(&xprt->xpt_lock); 974 + clear_bit(XPT_DEFERRED, &xprt->xpt_flags); 975 + if (!list_empty(&xprt->xpt_deferred)) { 976 + dr = list_entry(xprt->xpt_deferred.next, 977 + struct svc_deferred_req, 978 + handle.recent); 979 + list_del_init(&dr->handle.recent); 980 + set_bit(XPT_DEFERRED, &xprt->xpt_flags); 981 + } 982 + spin_unlock(&xprt->xpt_lock); 983 + return dr; 984 + } 985 + 986 + /* 987 + * Return the transport instance pointer for the endpoint accepting 988 + * connections/peer traffic from the specified transport class, 989 + * address family and port. 990 + * 991 + * Specifying 0 for the address family or port is effectively a 992 + * wild-card, and will result in matching the first transport in the 993 + * service's list that has a matching class name. 
994 + */ 995 + struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, 996 + int af, int port) 997 + { 998 + struct svc_xprt *xprt; 999 + struct svc_xprt *found = NULL; 1000 + 1001 + /* Sanity check the args */ 1002 + if (!serv || !xcl_name) 1003 + return found; 1004 + 1005 + spin_lock_bh(&serv->sv_lock); 1006 + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { 1007 + if (strcmp(xprt->xpt_class->xcl_name, xcl_name)) 1008 + continue; 1009 + if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) 1010 + continue; 1011 + if (port && port != svc_xprt_local_port(xprt)) 1012 + continue; 1013 + found = xprt; 1014 + svc_xprt_get(xprt); 1015 + break; 1016 + } 1017 + spin_unlock_bh(&serv->sv_lock); 1018 + return found; 1019 + } 1020 + EXPORT_SYMBOL_GPL(svc_find_xprt); 1021 + 1022 + /* 1023 + * Format a buffer with a list of the active transports. A zero for 1024 + * the buflen parameter disables target buffer overflow checking. 1025 + */ 1026 + int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen) 1027 + { 1028 + struct svc_xprt *xprt; 1029 + char xprt_str[64]; 1030 + int totlen = 0; 1031 + int len; 1032 + 1033 + /* Sanity check args */ 1034 + if (!serv) 1035 + return 0; 1036 + 1037 + spin_lock_bh(&serv->sv_lock); 1038 + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { 1039 + len = snprintf(xprt_str, sizeof(xprt_str), 1040 + "%s %d\n", xprt->xpt_class->xcl_name, 1041 + svc_xprt_local_port(xprt)); 1042 + /* If the string was truncated, replace with error string */ 1043 + if (len >= sizeof(xprt_str)) 1044 + strcpy(xprt_str, "name-too-long\n"); 1045 + /* Don't overflow buffer */ 1046 + len = strlen(xprt_str); 1047 + if (buflen && (len + totlen >= buflen)) 1048 + break; 1049 + strcpy(buf+totlen, xprt_str); 1050 + totlen += len; 1051 + } 1052 + spin_unlock_bh(&serv->sv_lock); 1053 + return totlen; 1054 + } 1055 + EXPORT_SYMBOL_GPL(svc_xprt_names);
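The new svc_xprt.c is the transport-independent core: a provider describes itself with a struct svc_xprt_class (name, module owner, ops vector, maximum payload), registers it with svc_reg_xprt_class(), and services then instantiate endpoints by class name through svc_create_xprt(), which returns the bound local port or a negative errno. A sketch of the provider side, assuming a hypothetical "example" transport whose svc_xprt_ops (xpo_create, xpo_recvfrom, xpo_sendto and the other callbacks used above) are implemented elsewhere:

#include <linux/module.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/svc_xprt.h>

/* Hypothetical ops vector; the callbacks themselves are implemented elsewhere. */
extern struct svc_xprt_ops example_xprt_ops;

struct svc_xprt_class example_xprt_class = {
	.xcl_name	 = "example",
	.xcl_owner	 = THIS_MODULE,
	.xcl_ops	 = &example_xprt_ops,
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP,	/* assumed payload limit */
};

static int __init example_transport_init(void)
{
	/* Fails with -EEXIST if a class with this name is already registered */
	return svc_reg_xprt_class(&example_xprt_class);
}

static void __exit example_transport_exit(void)
{
	svc_unreg_xprt_class(&example_xprt_class);
}

module_init(example_transport_init);
module_exit(example_transport_exit);
MODULE_LICENSE("GPL");

A service would then instantiate the endpoint with svc_create_xprt(serv, "example", port, 0), which reaches example_xprt_ops through xpo_create() and links the result onto sv_permsocks, and can locate it again later with svc_find_xprt(serv, "example", 0, 0) or list it via svc_xprt_names().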
+6
net/sunrpc/svcauth.c
··· 57 rqstp->rq_authop = aops; 58 return aops->accept(rqstp, authp); 59 } 60 61 int svc_set_client(struct svc_rqst *rqstp) 62 { 63 return rqstp->rq_authop->set_client(rqstp); 64 } 65 66 /* A request, which was authenticated, has now executed. 67 * Time to finalise the credentials and verifier ··· 95 spin_unlock(&authtab_lock); 96 return rv; 97 } 98 99 void 100 svc_auth_unregister(rpc_authflavor_t flavor) ··· 132 spin_unlock(&auth_domain_lock); 133 } 134 } 135 136 struct auth_domain * 137 auth_domain_lookup(char *name, struct auth_domain *new) ··· 157 spin_unlock(&auth_domain_lock); 158 return new; 159 } 160 161 struct auth_domain *auth_domain_find(char *name) 162 { 163 return auth_domain_lookup(name, NULL); 164 }
··· 57 rqstp->rq_authop = aops; 58 return aops->accept(rqstp, authp); 59 } 60 + EXPORT_SYMBOL(svc_authenticate); 61 62 int svc_set_client(struct svc_rqst *rqstp) 63 { 64 return rqstp->rq_authop->set_client(rqstp); 65 } 66 + EXPORT_SYMBOL(svc_set_client); 67 68 /* A request, which was authenticated, has now executed. 69 * Time to finalise the credentials and verifier ··· 93 spin_unlock(&authtab_lock); 94 return rv; 95 } 96 + EXPORT_SYMBOL(svc_auth_register); 97 98 void 99 svc_auth_unregister(rpc_authflavor_t flavor) ··· 129 spin_unlock(&auth_domain_lock); 130 } 131 } 132 + EXPORT_SYMBOL(auth_domain_put); 133 134 struct auth_domain * 135 auth_domain_lookup(char *name, struct auth_domain *new) ··· 153 spin_unlock(&auth_domain_lock); 154 return new; 155 } 156 + EXPORT_SYMBOL(auth_domain_lookup); 157 158 struct auth_domain *auth_domain_find(char *name) 159 { 160 return auth_domain_lookup(name, NULL); 161 } 162 + EXPORT_SYMBOL(auth_domain_find);
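The svcauth.c hunk only moves the EXPORT_SYMBOL() tags next to the functions they export (they previously sat in sunrpc_syms.c, removed above). For reference, the exported auth_domain interface is reference-counted, so each successful find is paired with a put; a minimal sketch, assuming a domain registered elsewhere under the hypothetical name "example-domain":

#include <linux/errno.h>
#include <linux/sunrpc/svcauth.h>

/* Sketch only: look up a domain by name and drop the reference again. */
static int example_check_domain(void)
{
	struct auth_domain *dom;

	dom = auth_domain_find("example-domain");
	if (dom == NULL)
		return -ENOENT;
	/* ... compare dom against an export entry here ... */
	auth_domain_put(dom);
	return 0;
}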
+34 -25
net/sunrpc/svcauth_unix.c
··· 63 rv = auth_domain_lookup(name, &new->h); 64 } 65 } 66 67 static void svcauth_unix_domain_release(struct auth_domain *dom) 68 { ··· 341 else 342 return -ENOMEM; 343 } 344 345 int auth_unix_forget_old(struct auth_domain *dom) 346 { ··· 353 udom->addr_changes++; 354 return 0; 355 } 356 357 struct auth_domain *auth_unix_lookup(struct in_addr addr) 358 { ··· 378 cache_put(&ipm->h, &ip_map_cache); 379 return rv; 380 } 381 382 void svcauth_unix_purge(void) 383 { 384 cache_purge(&ip_map_cache); 385 } 386 387 static inline struct ip_map * 388 ip_map_cached_get(struct svc_rqst *rqstp) 389 { 390 - struct ip_map *ipm; 391 - struct svc_sock *svsk = rqstp->rq_sock; 392 - spin_lock(&svsk->sk_lock); 393 - ipm = svsk->sk_info_authunix; 394 - if (ipm != NULL) { 395 - if (!cache_valid(&ipm->h)) { 396 - /* 397 - * The entry has been invalidated since it was 398 - * remembered, e.g. by a second mount from the 399 - * same IP address. 400 - */ 401 - svsk->sk_info_authunix = NULL; 402 - spin_unlock(&svsk->sk_lock); 403 - cache_put(&ipm->h, &ip_map_cache); 404 - return NULL; 405 } 406 - cache_get(&ipm->h); 407 } 408 - spin_unlock(&svsk->sk_lock); 409 return ipm; 410 } 411 412 static inline void 413 ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) 414 { 415 - struct svc_sock *svsk = rqstp->rq_sock; 416 417 - spin_lock(&svsk->sk_lock); 418 - if (svsk->sk_sock->type == SOCK_STREAM && 419 - svsk->sk_info_authunix == NULL) { 420 - /* newly cached, keep the reference */ 421 - svsk->sk_info_authunix = ipm; 422 - ipm = NULL; 423 } 424 - spin_unlock(&svsk->sk_lock); 425 if (ipm) 426 cache_put(&ipm->h, &ip_map_cache); 427 }
··· 63 rv = auth_domain_lookup(name, &new->h); 64 } 65 } 66 + EXPORT_SYMBOL(unix_domain_find); 67 68 static void svcauth_unix_domain_release(struct auth_domain *dom) 69 { ··· 340 else 341 return -ENOMEM; 342 } 343 + EXPORT_SYMBOL(auth_unix_add_addr); 344 345 int auth_unix_forget_old(struct auth_domain *dom) 346 { ··· 351 udom->addr_changes++; 352 return 0; 353 } 354 + EXPORT_SYMBOL(auth_unix_forget_old); 355 356 struct auth_domain *auth_unix_lookup(struct in_addr addr) 357 { ··· 375 cache_put(&ipm->h, &ip_map_cache); 376 return rv; 377 } 378 + EXPORT_SYMBOL(auth_unix_lookup); 379 380 void svcauth_unix_purge(void) 381 { 382 cache_purge(&ip_map_cache); 383 } 384 + EXPORT_SYMBOL(svcauth_unix_purge); 385 386 static inline struct ip_map * 387 ip_map_cached_get(struct svc_rqst *rqstp) 388 { 389 + struct ip_map *ipm = NULL; 390 + struct svc_xprt *xprt = rqstp->rq_xprt; 391 + 392 + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { 393 + spin_lock(&xprt->xpt_lock); 394 + ipm = xprt->xpt_auth_cache; 395 + if (ipm != NULL) { 396 + if (!cache_valid(&ipm->h)) { 397 + /* 398 + * The entry has been invalidated since it was 399 + * remembered, e.g. by a second mount from the 400 + * same IP address. 401 + */ 402 + xprt->xpt_auth_cache = NULL; 403 + spin_unlock(&xprt->xpt_lock); 404 + cache_put(&ipm->h, &ip_map_cache); 405 + return NULL; 406 + } 407 + cache_get(&ipm->h); 408 } 409 + spin_unlock(&xprt->xpt_lock); 410 } 411 return ipm; 412 } 413 414 static inline void 415 ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) 416 { 417 + struct svc_xprt *xprt = rqstp->rq_xprt; 418 419 + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { 420 + spin_lock(&xprt->xpt_lock); 421 + if (xprt->xpt_auth_cache == NULL) { 422 + /* newly cached, keep the reference */ 423 + xprt->xpt_auth_cache = ipm; 424 + ipm = NULL; 425 + } 426 + spin_unlock(&xprt->xpt_lock); 427 } 428 if (ipm) 429 cache_put(&ipm->h, &ip_map_cache); 430 }
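svcauth_unix.c now keeps the cached ip_map on the generic transport (xprt->xpt_auth_cache, protected by xpt_lock) instead of on the svc_sock, and only touches it when the transport has set XPT_CACHE_AUTH; svc_xprt_free() in svc_xprt.c releases the cached entry for such transports. A sketch of how a provider's xpo_create callback might opt in, reusing the hypothetical example_xprt_class from the registration sketch above and allocating a bare svc_xprt for brevity (real providers embed it in a larger per-connection structure):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/sunrpc/svc_xprt.h>

extern struct svc_xprt_class example_xprt_class;	/* from the registration sketch */

static struct svc_xprt *example_xpo_create(struct svc_serv *serv,
					   struct sockaddr *sa, int salen,
					   int flags)
{
	struct svc_xprt *xprt;

	/* sa/salen/flags would normally be used to bind the underlying endpoint */
	xprt = kzalloc(sizeof(*xprt), GFP_KERNEL);
	if (!xprt)
		return ERR_PTR(-ENOMEM);

	svc_xprt_init(&example_xprt_class, xprt, serv);

	/* Opt in to ip_map caching: ip_map_cached_get/put will now use xpt_auth_cache */
	set_bit(XPT_CACHE_AUTH, &xprt->xpt_flags);

	return xprt;
}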
+316 -995
net/sunrpc/svcsock.c
··· 5 * 6 * The server scheduling algorithm does not always distribute the load 7 * evenly when servicing a single client. May need to modify the 8 - * svc_sock_enqueue procedure... 9 * 10 * TCP support is largely untested and may be a little slow. The problem 11 * is that we currently do two separate recvfrom's, one for the 4-byte ··· 48 #include <linux/sunrpc/svcsock.h> 49 #include <linux/sunrpc/stats.h> 50 51 - /* SMP locking strategy: 52 - * 53 - * svc_pool->sp_lock protects most of the fields of that pool. 54 - * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. 55 - * when both need to be taken (rare), svc_serv->sv_lock is first. 56 - * BKL protects svc_serv->sv_nrthread. 57 - * svc_sock->sk_lock protects the svc_sock->sk_deferred list 58 - * and the ->sk_info_authunix cache. 59 - * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. 60 - * 61 - * Some flags can be set to certain values at any time 62 - * providing that certain rules are followed: 63 - * 64 - * SK_CONN, SK_DATA, can be set or cleared at any time. 65 - * after a set, svc_sock_enqueue must be called. 66 - * after a clear, the socket must be read/accepted 67 - * if this succeeds, it must be set again. 68 - * SK_CLOSE can set at any time. It is never cleared. 69 - * sk_inuse contains a bias of '1' until SK_DEAD is set. 70 - * so when sk_inuse hits zero, we know the socket is dead 71 - * and no-one is using it. 72 - * SK_DEAD can only be set while SK_BUSY is held which ensures 73 - * no other thread will be using the socket or will try to 74 - * set SK_DEAD. 75 - * 76 - */ 77 - 78 - #define RPCDBG_FACILITY RPCDBG_SVCSOCK 79 80 81 static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, 82 int *errp, int flags); 83 - static void svc_delete_socket(struct svc_sock *svsk); 84 static void svc_udp_data_ready(struct sock *, int); 85 static int svc_udp_recvfrom(struct svc_rqst *); 86 static int svc_udp_sendto(struct svc_rqst *); 87 - static void svc_close_socket(struct svc_sock *svsk); 88 89 - static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); 90 - static int svc_deferred_recv(struct svc_rqst *rqstp); 91 - static struct cache_deferred_req *svc_defer(struct cache_req *req); 92 - 93 - /* apparently the "standard" is that clients close 94 - * idle connections after 5 minutes, servers after 95 - * 6 minutes 96 - * http://www.connectathon.org/talks96/nfstcp.pdf 97 - */ 98 - static int svc_conn_age_period = 6*60; 99 - 100 #ifdef CONFIG_DEBUG_LOCK_ALLOC 101 static struct lock_class_key svc_key[2]; 102 static struct lock_class_key svc_slock_key[2]; 103 104 - static inline void svc_reclassify_socket(struct socket *sock) 105 { 106 struct sock *sk = sock->sk; 107 BUG_ON(sock_owned_by_user(sk)); 108 switch (sk->sk_family) { 109 case AF_INET: 110 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", 111 - &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]); 112 break; 113 114 case AF_INET6: 115 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", 116 - &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]); 117 break; 118 119 default: ··· 89 } 90 } 91 #else 92 - static inline void svc_reclassify_socket(struct socket *sock) 93 { 94 } 95 #endif 96 97 - static char *__svc_print_addr(struct sockaddr *addr, char *buf, size_t len) 98 - { 99 - switch (addr->sa_family) { 100 - case AF_INET: 101 - snprintf(buf, len, "%u.%u.%u.%u, port=%u", 102 - NIPQUAD(((struct sockaddr_in *) addr)->sin_addr), 103 - ntohs(((struct sockaddr_in *) addr)->sin_port)); 104 - 
break; 105 - 106 - case AF_INET6: 107 - snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u", 108 - NIP6(((struct sockaddr_in6 *) addr)->sin6_addr), 109 - ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); 110 - break; 111 - 112 - default: 113 - snprintf(buf, len, "unknown address type: %d", addr->sa_family); 114 - break; 115 - } 116 - return buf; 117 - } 118 - 119 - /** 120 - * svc_print_addr - Format rq_addr field for printing 121 - * @rqstp: svc_rqst struct containing address to print 122 - * @buf: target buffer for formatted address 123 - * @len: length of target buffer 124 - * 125 - */ 126 - char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) 127 - { 128 - return __svc_print_addr(svc_addr(rqstp), buf, len); 129 - } 130 - EXPORT_SYMBOL_GPL(svc_print_addr); 131 - 132 - /* 133 - * Queue up an idle server thread. Must have pool->sp_lock held. 134 - * Note: this is really a stack rather than a queue, so that we only 135 - * use as many different threads as we need, and the rest don't pollute 136 - * the cache. 137 - */ 138 - static inline void 139 - svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) 140 - { 141 - list_add(&rqstp->rq_list, &pool->sp_threads); 142 - } 143 - 144 - /* 145 - * Dequeue an nfsd thread. Must have pool->sp_lock held. 146 - */ 147 - static inline void 148 - svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) 149 - { 150 - list_del(&rqstp->rq_list); 151 - } 152 - 153 /* 154 * Release an skbuff after use 155 */ 156 - static inline void 157 - svc_release_skb(struct svc_rqst *rqstp) 158 { 159 - struct sk_buff *skb = rqstp->rq_skbuff; 160 struct svc_deferred_req *dr = rqstp->rq_deferred; 161 162 if (skb) { 163 - rqstp->rq_skbuff = NULL; 164 165 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); 166 - skb_free_datagram(rqstp->rq_sock->sk_sk, skb); 167 } 168 if (dr) { 169 rqstp->rq_deferred = NULL; 170 kfree(dr); 171 - } 172 - } 173 - 174 - /* 175 - * Any space to write? 176 - */ 177 - static inline unsigned long 178 - svc_sock_wspace(struct svc_sock *svsk) 179 - { 180 - int wspace; 181 - 182 - if (svsk->sk_sock->type == SOCK_STREAM) 183 - wspace = sk_stream_wspace(svsk->sk_sk); 184 - else 185 - wspace = sock_wspace(svsk->sk_sk); 186 - 187 - return wspace; 188 - } 189 - 190 - /* 191 - * Queue up a socket with data pending. If there are idle nfsd 192 - * processes, wake 'em up. 193 - * 194 - */ 195 - static void 196 - svc_sock_enqueue(struct svc_sock *svsk) 197 - { 198 - struct svc_serv *serv = svsk->sk_server; 199 - struct svc_pool *pool; 200 - struct svc_rqst *rqstp; 201 - int cpu; 202 - 203 - if (!(svsk->sk_flags & 204 - ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) 205 - return; 206 - if (test_bit(SK_DEAD, &svsk->sk_flags)) 207 - return; 208 - 209 - cpu = get_cpu(); 210 - pool = svc_pool_for_cpu(svsk->sk_server, cpu); 211 - put_cpu(); 212 - 213 - spin_lock_bh(&pool->sp_lock); 214 - 215 - if (!list_empty(&pool->sp_threads) && 216 - !list_empty(&pool->sp_sockets)) 217 - printk(KERN_ERR 218 - "svc_sock_enqueue: threads and sockets both waiting??\n"); 219 - 220 - if (test_bit(SK_DEAD, &svsk->sk_flags)) { 221 - /* Don't enqueue dead sockets */ 222 - dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); 223 - goto out_unlock; 224 - } 225 - 226 - /* Mark socket as busy. It will remain in this state until the 227 - * server has processed all pending data and put the socket back 228 - * on the idle list. 
We update SK_BUSY atomically because 229 - * it also guards against trying to enqueue the svc_sock twice. 230 - */ 231 - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { 232 - /* Don't enqueue socket while already enqueued */ 233 - dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); 234 - goto out_unlock; 235 - } 236 - BUG_ON(svsk->sk_pool != NULL); 237 - svsk->sk_pool = pool; 238 - 239 - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 240 - if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2 241 - > svc_sock_wspace(svsk)) 242 - && !test_bit(SK_CLOSE, &svsk->sk_flags) 243 - && !test_bit(SK_CONN, &svsk->sk_flags)) { 244 - /* Don't enqueue while not enough space for reply */ 245 - dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", 246 - svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg, 247 - svc_sock_wspace(svsk)); 248 - svsk->sk_pool = NULL; 249 - clear_bit(SK_BUSY, &svsk->sk_flags); 250 - goto out_unlock; 251 - } 252 - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 253 - 254 - 255 - if (!list_empty(&pool->sp_threads)) { 256 - rqstp = list_entry(pool->sp_threads.next, 257 - struct svc_rqst, 258 - rq_list); 259 - dprintk("svc: socket %p served by daemon %p\n", 260 - svsk->sk_sk, rqstp); 261 - svc_thread_dequeue(pool, rqstp); 262 - if (rqstp->rq_sock) 263 - printk(KERN_ERR 264 - "svc_sock_enqueue: server %p, rq_sock=%p!\n", 265 - rqstp, rqstp->rq_sock); 266 - rqstp->rq_sock = svsk; 267 - atomic_inc(&svsk->sk_inuse); 268 - rqstp->rq_reserved = serv->sv_max_mesg; 269 - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); 270 - BUG_ON(svsk->sk_pool != pool); 271 - wake_up(&rqstp->rq_wait); 272 - } else { 273 - dprintk("svc: socket %p put into queue\n", svsk->sk_sk); 274 - list_add_tail(&svsk->sk_ready, &pool->sp_sockets); 275 - BUG_ON(svsk->sk_pool != pool); 276 - } 277 - 278 - out_unlock: 279 - spin_unlock_bh(&pool->sp_lock); 280 - } 281 - 282 - /* 283 - * Dequeue the first socket. Must be called with the pool->sp_lock held. 284 - */ 285 - static inline struct svc_sock * 286 - svc_sock_dequeue(struct svc_pool *pool) 287 - { 288 - struct svc_sock *svsk; 289 - 290 - if (list_empty(&pool->sp_sockets)) 291 - return NULL; 292 - 293 - svsk = list_entry(pool->sp_sockets.next, 294 - struct svc_sock, sk_ready); 295 - list_del_init(&svsk->sk_ready); 296 - 297 - dprintk("svc: socket %p dequeued, inuse=%d\n", 298 - svsk->sk_sk, atomic_read(&svsk->sk_inuse)); 299 - 300 - return svsk; 301 - } 302 - 303 - /* 304 - * Having read something from a socket, check whether it 305 - * needs to be re-enqueued. 306 - * Note: SK_DATA only gets cleared when a read-attempt finds 307 - * no (or insufficient) data. 308 - */ 309 - static inline void 310 - svc_sock_received(struct svc_sock *svsk) 311 - { 312 - svsk->sk_pool = NULL; 313 - clear_bit(SK_BUSY, &svsk->sk_flags); 314 - svc_sock_enqueue(svsk); 315 - } 316 - 317 - 318 - /** 319 - * svc_reserve - change the space reserved for the reply to a request. 320 - * @rqstp: The request in question 321 - * @space: new max space to reserve 322 - * 323 - * Each request reserves some space on the output queue of the socket 324 - * to make sure the reply fits. This function reduces that reserved 325 - * space to be the amount of space used already, plus @space. 
326 - * 327 - */ 328 - void svc_reserve(struct svc_rqst *rqstp, int space) 329 - { 330 - space += rqstp->rq_res.head[0].iov_len; 331 - 332 - if (space < rqstp->rq_reserved) { 333 - struct svc_sock *svsk = rqstp->rq_sock; 334 - atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); 335 - rqstp->rq_reserved = space; 336 - 337 - svc_sock_enqueue(svsk); 338 - } 339 - } 340 - 341 - /* 342 - * Release a socket after use. 343 - */ 344 - static inline void 345 - svc_sock_put(struct svc_sock *svsk) 346 - { 347 - if (atomic_dec_and_test(&svsk->sk_inuse)) { 348 - BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags)); 349 - 350 - dprintk("svc: releasing dead socket\n"); 351 - if (svsk->sk_sock->file) 352 - sockfd_put(svsk->sk_sock); 353 - else 354 - sock_release(svsk->sk_sock); 355 - if (svsk->sk_info_authunix != NULL) 356 - svcauth_unix_info_release(svsk->sk_info_authunix); 357 - kfree(svsk); 358 - } 359 - } 360 - 361 - static void 362 - svc_sock_release(struct svc_rqst *rqstp) 363 - { 364 - struct svc_sock *svsk = rqstp->rq_sock; 365 - 366 - svc_release_skb(rqstp); 367 - 368 - svc_free_res_pages(rqstp); 369 - rqstp->rq_res.page_len = 0; 370 - rqstp->rq_res.page_base = 0; 371 - 372 - 373 - /* Reset response buffer and release 374 - * the reservation. 375 - * But first, check that enough space was reserved 376 - * for the reply, otherwise we have a bug! 377 - */ 378 - if ((rqstp->rq_res.len) > rqstp->rq_reserved) 379 - printk(KERN_ERR "RPC request reserved %d but used %d\n", 380 - rqstp->rq_reserved, 381 - rqstp->rq_res.len); 382 - 383 - rqstp->rq_res.head[0].iov_len = 0; 384 - svc_reserve(rqstp, 0); 385 - rqstp->rq_sock = NULL; 386 - 387 - svc_sock_put(svsk); 388 - } 389 - 390 - /* 391 - * External function to wake up a server waiting for data 392 - * This really only makes sense for services like lockd 393 - * which have exactly one thread anyway. 394 - */ 395 - void 396 - svc_wake_up(struct svc_serv *serv) 397 - { 398 - struct svc_rqst *rqstp; 399 - unsigned int i; 400 - struct svc_pool *pool; 401 - 402 - for (i = 0; i < serv->sv_nrpools; i++) { 403 - pool = &serv->sv_pools[i]; 404 - 405 - spin_lock_bh(&pool->sp_lock); 406 - if (!list_empty(&pool->sp_threads)) { 407 - rqstp = list_entry(pool->sp_threads.next, 408 - struct svc_rqst, 409 - rq_list); 410 - dprintk("svc: daemon %p woken up.\n", rqstp); 411 - /* 412 - svc_thread_dequeue(pool, rqstp); 413 - rqstp->rq_sock = NULL; 414 - */ 415 - wake_up(&rqstp->rq_wait); 416 - } 417 - spin_unlock_bh(&pool->sp_lock); 418 } 419 } 420 ··· 125 126 static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) 127 { 128 - switch (rqstp->rq_sock->sk_sk->sk_family) { 129 case AF_INET: { 130 struct in_pktinfo *pki = CMSG_DATA(cmh); 131 ··· 157 /* 158 * Generic sendto routine 159 */ 160 - static int 161 - svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) 162 { 163 - struct svc_sock *svsk = rqstp->rq_sock; 164 struct socket *sock = svsk->sk_sock; 165 int slen; 166 union { ··· 233 } 234 out: 235 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", 236 - rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, 237 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); 238 239 return len; ··· 270 if (!serv) 271 return 0; 272 spin_lock_bh(&serv->sv_lock); 273 - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { 274 int onelen = one_sock_name(buf+len, svsk); 275 if (toclose && strcmp(toclose, buf+len) == 0) 276 closesk = svsk; ··· 282 /* Should unregister with portmap, but you cannot 283 * unregister just one protocol... 
284 */ 285 - svc_close_socket(closesk); 286 else if (toclose) 287 return -ENOENT; 288 return len; ··· 292 /* 293 * Check input queue length 294 */ 295 - static int 296 - svc_recv_available(struct svc_sock *svsk) 297 { 298 struct socket *sock = svsk->sk_sock; 299 int avail, err; ··· 305 /* 306 * Generic recvfrom routine. 307 */ 308 - static int 309 - svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) 310 { 311 - struct svc_sock *svsk = rqstp->rq_sock; 312 struct msghdr msg = { 313 .msg_flags = MSG_DONTWAIT, 314 }; 315 - struct sockaddr *sin; 316 int len; 317 318 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, 319 msg.msg_flags); 320 321 - /* sock_recvmsg doesn't fill in the name/namelen, so we must.. 322 - */ 323 - memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen); 324 - rqstp->rq_addrlen = svsk->sk_remotelen; 325 - 326 - /* Destination address in request is needed for binding the 327 - * source address in RPC callbacks later. 328 - */ 329 - sin = (struct sockaddr *)&svsk->sk_local; 330 - switch (sin->sa_family) { 331 - case AF_INET: 332 - rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; 333 - break; 334 - case AF_INET6: 335 - rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; 336 - break; 337 - } 338 - 339 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 340 svsk, iov[0].iov_base, iov[0].iov_len, len); 341 - 342 return len; 343 } 344 345 /* 346 * Set socket snd and rcv buffer lengths 347 */ 348 - static inline void 349 - svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) 350 { 351 #if 0 352 mm_segment_t oldfs; ··· 354 /* 355 * INET callback when data has been received on the socket. 356 */ 357 - static void 358 - svc_udp_data_ready(struct sock *sk, int count) 359 { 360 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 361 362 if (svsk) { 363 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", 364 - svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); 365 - set_bit(SK_DATA, &svsk->sk_flags); 366 - svc_sock_enqueue(svsk); 367 } 368 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 369 wake_up_interruptible(sk->sk_sleep); ··· 372 /* 373 * INET callback when space is newly available on the socket. 374 */ 375 - static void 376 - svc_write_space(struct sock *sk) 377 { 378 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); 379 380 if (svsk) { 381 dprintk("svc: socket %p(inet %p), write_space busy=%d\n", 382 - svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); 383 - svc_sock_enqueue(svsk); 384 } 385 386 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { ··· 389 } 390 } 391 392 - static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, 393 - struct cmsghdr *cmh) 394 { 395 - switch (rqstp->rq_sock->sk_sk->sk_family) { 396 case AF_INET: { 397 struct in_pktinfo *pki = CMSG_DATA(cmh); 398 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; ··· 418 /* 419 * Receive a datagram from a UDP socket. 420 */ 421 - static int 422 - svc_udp_recvfrom(struct svc_rqst *rqstp) 423 { 424 - struct svc_sock *svsk = rqstp->rq_sock; 425 - struct svc_serv *serv = svsk->sk_server; 426 struct sk_buff *skb; 427 union { 428 struct cmsghdr hdr; ··· 437 .msg_flags = MSG_DONTWAIT, 438 }; 439 440 - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 441 /* udp sockets need large rcvbuf as all pending 442 * requests are still in that buffer. 
sndbuf must 443 * also be large enough that there is enough space ··· 450 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 451 (serv->sv_nrthreads+3) * serv->sv_max_mesg); 452 453 - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 454 - svc_sock_received(svsk); 455 - return svc_deferred_recv(rqstp); 456 - } 457 - 458 - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { 459 - svc_delete_socket(svsk); 460 - return 0; 461 - } 462 - 463 - clear_bit(SK_DATA, &svsk->sk_flags); 464 skb = NULL; 465 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 466 0, 0, MSG_PEEK | MSG_DONTWAIT); ··· 461 if (err != -EAGAIN) { 462 /* possibly an icmp error */ 463 dprintk("svc: recvfrom returned error %d\n", -err); 464 - set_bit(SK_DATA, &svsk->sk_flags); 465 } 466 - svc_sock_received(svsk); 467 return -EAGAIN; 468 } 469 - rqstp->rq_addrlen = sizeof(rqstp->rq_addr); 470 if (skb->tstamp.tv64 == 0) { 471 skb->tstamp = ktime_get_real(); 472 /* Don't enable netstamp, sunrpc doesn't 473 need that much accuracy */ 474 } 475 svsk->sk_sk->sk_stamp = skb->tstamp; 476 - set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ 477 478 /* 479 * Maybe more packets - kick another thread ASAP. 480 */ 481 - svc_sock_received(svsk); 482 483 len = skb->len - sizeof(struct udphdr); 484 rqstp->rq_arg.len = len; ··· 512 skb_free_datagram(svsk->sk_sk, skb); 513 } else { 514 /* we can use it in-place */ 515 - rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); 516 rqstp->rq_arg.head[0].iov_len = len; 517 if (skb_checksum_complete(skb)) { 518 skb_free_datagram(svsk->sk_sk, skb); 519 return 0; 520 } 521 - rqstp->rq_skbuff = skb; 522 } 523 524 rqstp->rq_arg.page_base = 0; ··· 552 return error; 553 } 554 555 - static void 556 - svc_udp_init(struct svc_sock *svsk) 557 { 558 int one = 1; 559 mm_segment_t oldfs; 560 561 svsk->sk_sk->sk_data_ready = svc_udp_data_ready; 562 svsk->sk_sk->sk_write_space = svc_write_space; 563 - svsk->sk_recvfrom = svc_udp_recvfrom; 564 - svsk->sk_sendto = svc_udp_sendto; 565 566 /* initialise setting must have enough space to 567 * receive and respond to one request. 568 * svc_udp_recvfrom will re-adjust if necessary 569 */ 570 svc_sock_setbufsize(svsk->sk_sock, 571 - 3 * svsk->sk_server->sv_max_mesg, 572 - 3 * svsk->sk_server->sv_max_mesg); 573 574 - set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ 575 - set_bit(SK_CHNGBUF, &svsk->sk_flags); 576 577 oldfs = get_fs(); 578 set_fs(KERNEL_DS); ··· 640 * A data_ready event on a listening socket means there's a connection 641 * pending. Do not use state_change as a substitute for it. 642 */ 643 - static void 644 - svc_tcp_listen_data_ready(struct sock *sk, int count_unused) 645 { 646 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 647 ··· 659 */ 660 if (sk->sk_state == TCP_LISTEN) { 661 if (svsk) { 662 - set_bit(SK_CONN, &svsk->sk_flags); 663 - svc_sock_enqueue(svsk); 664 } else 665 printk("svc: socket %p: no user data\n", sk); 666 } ··· 672 /* 673 * A state change on a connected socket means it's dying or dead. 
674 */ 675 - static void 676 - svc_tcp_state_change(struct sock *sk) 677 { 678 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 679 ··· 682 if (!svsk) 683 printk("svc: socket %p: no user data\n", sk); 684 else { 685 - set_bit(SK_CLOSE, &svsk->sk_flags); 686 - svc_sock_enqueue(svsk); 687 } 688 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 689 wake_up_interruptible_all(sk->sk_sleep); 690 } 691 692 - static void 693 - svc_tcp_data_ready(struct sock *sk, int count) 694 { 695 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 696 697 dprintk("svc: socket %p TCP data ready (svsk %p)\n", 698 sk, sk->sk_user_data); 699 if (svsk) { 700 - set_bit(SK_DATA, &svsk->sk_flags); 701 - svc_sock_enqueue(svsk); 702 } 703 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 704 wake_up_interruptible(sk->sk_sleep); 705 } 706 707 - static inline int svc_port_is_privileged(struct sockaddr *sin) 708 - { 709 - switch (sin->sa_family) { 710 - case AF_INET: 711 - return ntohs(((struct sockaddr_in *)sin)->sin_port) 712 - < PROT_SOCK; 713 - case AF_INET6: 714 - return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) 715 - < PROT_SOCK; 716 - default: 717 - return 0; 718 - } 719 - } 720 - 721 /* 722 * Accept a TCP connection 723 */ 724 - static void 725 - svc_tcp_accept(struct svc_sock *svsk) 726 { 727 struct sockaddr_storage addr; 728 struct sockaddr *sin = (struct sockaddr *) &addr; 729 - struct svc_serv *serv = svsk->sk_server; 730 struct socket *sock = svsk->sk_sock; 731 struct socket *newsock; 732 struct svc_sock *newsvsk; ··· 720 721 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); 722 if (!sock) 723 - return; 724 725 - clear_bit(SK_CONN, &svsk->sk_flags); 726 err = kernel_accept(sock, &newsock, O_NONBLOCK); 727 if (err < 0) { 728 if (err == -ENOMEM) ··· 731 else if (err != -EAGAIN && net_ratelimit()) 732 printk(KERN_WARNING "%s: accept failed (err %d)!\n", 733 serv->sv_name, -err); 734 - return; 735 } 736 - 737 - set_bit(SK_CONN, &svsk->sk_flags); 738 - svc_sock_enqueue(svsk); 739 740 err = kernel_getpeername(newsock, sin, &slen); 741 if (err < 0) { ··· 764 if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 765 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) 766 goto failed; 767 - memcpy(&newsvsk->sk_remote, sin, slen); 768 - newsvsk->sk_remotelen = slen; 769 err = kernel_getsockname(newsock, sin, &slen); 770 if (unlikely(err < 0)) { 771 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); 772 slen = offsetof(struct sockaddr, sa_data); 773 } 774 - memcpy(&newsvsk->sk_local, sin, slen); 775 - 776 - svc_sock_received(newsvsk); 777 - 778 - /* make sure that we don't have too many active connections. 779 - * If we have, something must be dropped. 780 - * 781 - * There's no point in trying to do random drop here for 782 - * DoS prevention. The NFS clients does 1 reconnect in 15 783 - * seconds. An attacker can easily beat that. 784 - * 785 - * The only somewhat efficient mechanism would be if drop 786 - * old connections from the same IP first. But right now 787 - * we don't even record the client IP in svc_sock. 
788 - */ 789 - if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { 790 - struct svc_sock *svsk = NULL; 791 - spin_lock_bh(&serv->sv_lock); 792 - if (!list_empty(&serv->sv_tempsocks)) { 793 - if (net_ratelimit()) { 794 - /* Try to help the admin */ 795 - printk(KERN_NOTICE "%s: too many open TCP " 796 - "sockets, consider increasing the " 797 - "number of nfsd threads\n", 798 - serv->sv_name); 799 - printk(KERN_NOTICE 800 - "%s: last TCP connect from %s\n", 801 - serv->sv_name, __svc_print_addr(sin, 802 - buf, sizeof(buf))); 803 - } 804 - /* 805 - * Always select the oldest socket. It's not fair, 806 - * but so is life 807 - */ 808 - svsk = list_entry(serv->sv_tempsocks.prev, 809 - struct svc_sock, 810 - sk_list); 811 - set_bit(SK_CLOSE, &svsk->sk_flags); 812 - atomic_inc(&svsk->sk_inuse); 813 - } 814 - spin_unlock_bh(&serv->sv_lock); 815 - 816 - if (svsk) { 817 - svc_sock_enqueue(svsk); 818 - svc_sock_put(svsk); 819 - } 820 - 821 - } 822 823 if (serv->sv_stats) 824 serv->sv_stats->nettcpconn++; 825 826 - return; 827 828 failed: 829 sock_release(newsock); 830 - return; 831 } 832 833 /* 834 * Receive data from a TCP socket. 835 */ 836 - static int 837 - svc_tcp_recvfrom(struct svc_rqst *rqstp) 838 { 839 - struct svc_sock *svsk = rqstp->rq_sock; 840 - struct svc_serv *serv = svsk->sk_server; 841 int len; 842 struct kvec *vec; 843 int pnum, vlen; 844 845 dprintk("svc: tcp_recv %p data %d conn %d close %d\n", 846 - svsk, test_bit(SK_DATA, &svsk->sk_flags), 847 - test_bit(SK_CONN, &svsk->sk_flags), 848 - test_bit(SK_CLOSE, &svsk->sk_flags)); 849 850 - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { 851 - svc_sock_received(svsk); 852 - return svc_deferred_recv(rqstp); 853 - } 854 - 855 - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { 856 - svc_delete_socket(svsk); 857 - return 0; 858 - } 859 - 860 - if (svsk->sk_sk->sk_state == TCP_LISTEN) { 861 - svc_tcp_accept(svsk); 862 - svc_sock_received(svsk); 863 - return 0; 864 - } 865 - 866 - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) 867 /* sndbuf needs to have room for one request 868 * per thread, otherwise we can stall even when the 869 * network isn't a bottleneck. ··· 816 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 817 3 * serv->sv_max_mesg); 818 819 - clear_bit(SK_DATA, &svsk->sk_flags); 820 821 /* Receive data. If we haven't got the record length yet, get 822 * the next four bytes. 
Otherwise try to gobble up as much as ··· 835 if (len < want) { 836 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", 837 len, want); 838 - svc_sock_received(svsk); 839 return -EAGAIN; /* record header not complete */ 840 } 841 ··· 871 if (len < svsk->sk_reclen) { 872 dprintk("svc: incomplete TCP record (%d of %d)\n", 873 len, svsk->sk_reclen); 874 - svc_sock_received(svsk); 875 return -EAGAIN; /* record not complete */ 876 } 877 len = svsk->sk_reclen; 878 - set_bit(SK_DATA, &svsk->sk_flags); 879 880 vec = rqstp->rq_vec; 881 vec[0] = rqstp->rq_arg.head[0]; ··· 904 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; 905 } 906 907 - rqstp->rq_skbuff = NULL; 908 rqstp->rq_prot = IPPROTO_TCP; 909 910 /* Reset TCP read info */ 911 svsk->sk_reclen = 0; 912 svsk->sk_tcplen = 0; 913 914 - svc_sock_received(svsk); 915 if (serv->sv_stats) 916 serv->sv_stats->nettcpcnt++; 917 918 return len; 919 920 err_delete: 921 - svc_delete_socket(svsk); 922 return -EAGAIN; 923 924 error: 925 if (len == -EAGAIN) { 926 dprintk("RPC: TCP recvfrom got EAGAIN\n"); 927 - svc_sock_received(svsk); 928 } else { 929 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", 930 - svsk->sk_server->sv_name, -len); 931 goto err_delete; 932 } 933 ··· 938 /* 939 * Send out data on TCP socket. 940 */ 941 - static int 942 - svc_tcp_sendto(struct svc_rqst *rqstp) 943 { 944 struct xdr_buf *xbufp = &rqstp->rq_res; 945 int sent; ··· 951 reclen = htonl(0x80000000|((xbufp->len ) - 4)); 952 memcpy(xbufp->head[0].iov_base, &reclen, 4); 953 954 - if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) 955 return -ENOTCONN; 956 957 sent = svc_sendto(rqstp, &rqstp->rq_res); 958 if (sent != xbufp->len) { 959 - printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", 960 - rqstp->rq_sock->sk_server->sv_name, 961 (sent<0)?"got error":"sent only", 962 sent, xbufp->len); 963 - set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); 964 - svc_sock_enqueue(rqstp->rq_sock); 965 sent = -EAGAIN; 966 } 967 return sent; 968 } 969 970 - static void 971 - svc_tcp_init(struct svc_sock *svsk) 972 { 973 struct sock *sk = svsk->sk_sk; 974 struct tcp_sock *tp = tcp_sk(sk); 975 976 - svsk->sk_recvfrom = svc_tcp_recvfrom; 977 - svsk->sk_sendto = svc_tcp_sendto; 978 - 979 if (sk->sk_state == TCP_LISTEN) { 980 dprintk("setting up TCP socket for listening\n"); 981 sk->sk_data_ready = svc_tcp_listen_data_ready; 982 - set_bit(SK_CONN, &svsk->sk_flags); 983 } else { 984 dprintk("setting up TCP socket for reading\n"); 985 sk->sk_state_change = svc_tcp_state_change; ··· 1070 * svc_tcp_recvfrom will re-adjust if necessary 1071 */ 1072 svc_sock_setbufsize(svsk->sk_sock, 1073 - 3 * svsk->sk_server->sv_max_mesg, 1074 - 3 * svsk->sk_server->sv_max_mesg); 1075 1076 - set_bit(SK_CHNGBUF, &svsk->sk_flags); 1077 - set_bit(SK_DATA, &svsk->sk_flags); 1078 if (sk->sk_state != TCP_ESTABLISHED) 1079 - set_bit(SK_CLOSE, &svsk->sk_flags); 1080 } 1081 } 1082 1083 - void 1084 - svc_sock_update_bufs(struct svc_serv *serv) 1085 { 1086 /* 1087 * The number of server threads has changed. 
Update ··· 1091 spin_lock_bh(&serv->sv_lock); 1092 list_for_each(le, &serv->sv_permsocks) { 1093 struct svc_sock *svsk = 1094 - list_entry(le, struct svc_sock, sk_list); 1095 - set_bit(SK_CHNGBUF, &svsk->sk_flags); 1096 } 1097 list_for_each(le, &serv->sv_tempsocks) { 1098 struct svc_sock *svsk = 1099 - list_entry(le, struct svc_sock, sk_list); 1100 - set_bit(SK_CHNGBUF, &svsk->sk_flags); 1101 } 1102 spin_unlock_bh(&serv->sv_lock); 1103 - } 1104 - 1105 - /* 1106 - * Receive the next request on any socket. This code is carefully 1107 - * organised not to touch any cachelines in the shared svc_serv 1108 - * structure, only cachelines in the local svc_pool. 1109 - */ 1110 - int 1111 - svc_recv(struct svc_rqst *rqstp, long timeout) 1112 - { 1113 - struct svc_sock *svsk = NULL; 1114 - struct svc_serv *serv = rqstp->rq_server; 1115 - struct svc_pool *pool = rqstp->rq_pool; 1116 - int len, i; 1117 - int pages; 1118 - struct xdr_buf *arg; 1119 - DECLARE_WAITQUEUE(wait, current); 1120 - 1121 - dprintk("svc: server %p waiting for data (to = %ld)\n", 1122 - rqstp, timeout); 1123 - 1124 - if (rqstp->rq_sock) 1125 - printk(KERN_ERR 1126 - "svc_recv: service %p, socket not NULL!\n", 1127 - rqstp); 1128 - if (waitqueue_active(&rqstp->rq_wait)) 1129 - printk(KERN_ERR 1130 - "svc_recv: service %p, wait queue active!\n", 1131 - rqstp); 1132 - 1133 - 1134 - /* now allocate needed pages. If we get a failure, sleep briefly */ 1135 - pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; 1136 - for (i=0; i < pages ; i++) 1137 - while (rqstp->rq_pages[i] == NULL) { 1138 - struct page *p = alloc_page(GFP_KERNEL); 1139 - if (!p) 1140 - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); 1141 - rqstp->rq_pages[i] = p; 1142 - } 1143 - rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ 1144 - BUG_ON(pages >= RPCSVC_MAXPAGES); 1145 - 1146 - /* Make arg->head point to first page and arg->pages point to rest */ 1147 - arg = &rqstp->rq_arg; 1148 - arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); 1149 - arg->head[0].iov_len = PAGE_SIZE; 1150 - arg->pages = rqstp->rq_pages + 1; 1151 - arg->page_base = 0; 1152 - /* save at least one page for response */ 1153 - arg->page_len = (pages-2)*PAGE_SIZE; 1154 - arg->len = (pages-1)*PAGE_SIZE; 1155 - arg->tail[0].iov_len = 0; 1156 - 1157 - try_to_freeze(); 1158 - cond_resched(); 1159 - if (signalled()) 1160 - return -EINTR; 1161 - 1162 - spin_lock_bh(&pool->sp_lock); 1163 - if ((svsk = svc_sock_dequeue(pool)) != NULL) { 1164 - rqstp->rq_sock = svsk; 1165 - atomic_inc(&svsk->sk_inuse); 1166 - rqstp->rq_reserved = serv->sv_max_mesg; 1167 - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); 1168 - } else { 1169 - /* No data pending. Go to sleep */ 1170 - svc_thread_enqueue(pool, rqstp); 1171 - 1172 - /* 1173 - * We have to be able to interrupt this wait 1174 - * to bring down the daemons ... 1175 - */ 1176 - set_current_state(TASK_INTERRUPTIBLE); 1177 - add_wait_queue(&rqstp->rq_wait, &wait); 1178 - spin_unlock_bh(&pool->sp_lock); 1179 - 1180 - schedule_timeout(timeout); 1181 - 1182 - try_to_freeze(); 1183 - 1184 - spin_lock_bh(&pool->sp_lock); 1185 - remove_wait_queue(&rqstp->rq_wait, &wait); 1186 - 1187 - if (!(svsk = rqstp->rq_sock)) { 1188 - svc_thread_dequeue(pool, rqstp); 1189 - spin_unlock_bh(&pool->sp_lock); 1190 - dprintk("svc: server %p, no data yet\n", rqstp); 1191 - return signalled()? 
-EINTR : -EAGAIN; 1192 - } 1193 - } 1194 - spin_unlock_bh(&pool->sp_lock); 1195 - 1196 - dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", 1197 - rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); 1198 - len = svsk->sk_recvfrom(rqstp); 1199 - dprintk("svc: got len=%d\n", len); 1200 - 1201 - /* No data, incomplete (TCP) read, or accept() */ 1202 - if (len == 0 || len == -EAGAIN) { 1203 - rqstp->rq_res.len = 0; 1204 - svc_sock_release(rqstp); 1205 - return -EAGAIN; 1206 - } 1207 - svsk->sk_lastrecv = get_seconds(); 1208 - clear_bit(SK_OLD, &svsk->sk_flags); 1209 - 1210 - rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); 1211 - rqstp->rq_chandle.defer = svc_defer; 1212 - 1213 - if (serv->sv_stats) 1214 - serv->sv_stats->netcnt++; 1215 - return len; 1216 - } 1217 - 1218 - /* 1219 - * Drop request 1220 - */ 1221 - void 1222 - svc_drop(struct svc_rqst *rqstp) 1223 - { 1224 - dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); 1225 - svc_sock_release(rqstp); 1226 - } 1227 - 1228 - /* 1229 - * Return reply to client. 1230 - */ 1231 - int 1232 - svc_send(struct svc_rqst *rqstp) 1233 - { 1234 - struct svc_sock *svsk; 1235 - int len; 1236 - struct xdr_buf *xb; 1237 - 1238 - if ((svsk = rqstp->rq_sock) == NULL) { 1239 - printk(KERN_WARNING "NULL socket pointer in %s:%d\n", 1240 - __FILE__, __LINE__); 1241 - return -EFAULT; 1242 - } 1243 - 1244 - /* release the receive skb before sending the reply */ 1245 - svc_release_skb(rqstp); 1246 - 1247 - /* calculate over-all length */ 1248 - xb = & rqstp->rq_res; 1249 - xb->len = xb->head[0].iov_len + 1250 - xb->page_len + 1251 - xb->tail[0].iov_len; 1252 - 1253 - /* Grab svsk->sk_mutex to serialize outgoing data. */ 1254 - mutex_lock(&svsk->sk_mutex); 1255 - if (test_bit(SK_DEAD, &svsk->sk_flags)) 1256 - len = -ENOTCONN; 1257 - else 1258 - len = svsk->sk_sendto(rqstp); 1259 - mutex_unlock(&svsk->sk_mutex); 1260 - svc_sock_release(rqstp); 1261 - 1262 - if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) 1263 - return 0; 1264 - return len; 1265 - } 1266 - 1267 - /* 1268 - * Timer function to close old temporary sockets, using 1269 - * a mark-and-sweep algorithm. 
1270 - */ 1271 - static void 1272 - svc_age_temp_sockets(unsigned long closure) 1273 - { 1274 - struct svc_serv *serv = (struct svc_serv *)closure; 1275 - struct svc_sock *svsk; 1276 - struct list_head *le, *next; 1277 - LIST_HEAD(to_be_aged); 1278 - 1279 - dprintk("svc_age_temp_sockets\n"); 1280 - 1281 - if (!spin_trylock_bh(&serv->sv_lock)) { 1282 - /* busy, try again 1 sec later */ 1283 - dprintk("svc_age_temp_sockets: busy\n"); 1284 - mod_timer(&serv->sv_temptimer, jiffies + HZ); 1285 - return; 1286 - } 1287 - 1288 - list_for_each_safe(le, next, &serv->sv_tempsocks) { 1289 - svsk = list_entry(le, struct svc_sock, sk_list); 1290 - 1291 - if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) 1292 - continue; 1293 - if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags)) 1294 - continue; 1295 - atomic_inc(&svsk->sk_inuse); 1296 - list_move(le, &to_be_aged); 1297 - set_bit(SK_CLOSE, &svsk->sk_flags); 1298 - set_bit(SK_DETACHED, &svsk->sk_flags); 1299 - } 1300 - spin_unlock_bh(&serv->sv_lock); 1301 - 1302 - while (!list_empty(&to_be_aged)) { 1303 - le = to_be_aged.next; 1304 - /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ 1305 - list_del_init(le); 1306 - svsk = list_entry(le, struct svc_sock, sk_list); 1307 - 1308 - dprintk("queuing svsk %p for closing, %lu seconds old\n", 1309 - svsk, get_seconds() - svsk->sk_lastrecv); 1310 - 1311 - /* a thread will dequeue and close it soon */ 1312 - svc_sock_enqueue(svsk); 1313 - svc_sock_put(svsk); 1314 - } 1315 - 1316 - mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); 1317 } 1318 1319 /* ··· 1113 struct svc_sock *svsk; 1114 struct sock *inet; 1115 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); 1116 - int is_temporary = flags & SVC_SOCK_TEMPORARY; 1117 1118 dprintk("svc: svc_setup_socket %p\n", sock); 1119 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { ··· 1132 return NULL; 1133 } 1134 1135 - set_bit(SK_BUSY, &svsk->sk_flags); 1136 inet->sk_user_data = svsk; 1137 svsk->sk_sock = sock; 1138 svsk->sk_sk = inet; 1139 svsk->sk_ostate = inet->sk_state_change; 1140 svsk->sk_odata = inet->sk_data_ready; 1141 svsk->sk_owspace = inet->sk_write_space; 1142 - svsk->sk_server = serv; 1143 - atomic_set(&svsk->sk_inuse, 1); 1144 - svsk->sk_lastrecv = get_seconds(); 1145 - spin_lock_init(&svsk->sk_lock); 1146 - INIT_LIST_HEAD(&svsk->sk_deferred); 1147 - INIT_LIST_HEAD(&svsk->sk_ready); 1148 - mutex_init(&svsk->sk_mutex); 1149 1150 /* Initialize the socket */ 1151 if (sock->type == SOCK_DGRAM) 1152 - svc_udp_init(svsk); 1153 else 1154 - svc_tcp_init(svsk); 1155 - 1156 - spin_lock_bh(&serv->sv_lock); 1157 - if (is_temporary) { 1158 - set_bit(SK_TEMP, &svsk->sk_flags); 1159 - list_add(&svsk->sk_list, &serv->sv_tempsocks); 1160 - serv->sv_tmpcnt++; 1161 - if (serv->sv_temptimer.function == NULL) { 1162 - /* setup timer to age temp sockets */ 1163 - setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, 1164 - (unsigned long)serv); 1165 - mod_timer(&serv->sv_temptimer, 1166 - jiffies + svc_conn_age_period * HZ); 1167 - } 1168 - } else { 1169 - clear_bit(SK_TEMP, &svsk->sk_flags); 1170 - list_add(&svsk->sk_list, &serv->sv_permsocks); 1171 - } 1172 - spin_unlock_bh(&serv->sv_lock); 1173 1174 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1175 svsk, svsk->sk_sk); ··· 1172 else { 1173 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); 1174 if (svsk) { 1175 - svc_sock_received(svsk); 1176 err = 0; 1177 } 1178 } ··· 1197 /* 1198 * Create socket for RPC service. 
1199 */ 1200 - static int svc_create_socket(struct svc_serv *serv, int protocol, 1201 - struct sockaddr *sin, int len, int flags) 1202 { 1203 struct svc_sock *svsk; 1204 struct socket *sock; 1205 int error; 1206 int type; 1207 char buf[RPC_MAX_ADDRBUFLEN]; 1208 1209 dprintk("svc: svc_create_socket(%s, %d, %s)\n", 1210 serv->sv_program->pg_name, protocol, ··· 1218 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { 1219 printk(KERN_WARNING "svc: only UDP and TCP " 1220 "sockets supported\n"); 1221 - return -EINVAL; 1222 } 1223 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; 1224 1225 error = sock_create_kern(sin->sa_family, type, protocol, &sock); 1226 if (error < 0) 1227 - return error; 1228 1229 svc_reclassify_socket(sock); 1230 ··· 1234 if (error < 0) 1235 goto bummer; 1236 1237 if (protocol == IPPROTO_TCP) { 1238 if ((error = kernel_listen(sock, 64)) < 0) 1239 goto bummer; 1240 } 1241 1242 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { 1243 - svc_sock_received(svsk); 1244 - return ntohs(inet_sk(svsk->sk_sk)->sport); 1245 } 1246 1247 bummer: 1248 dprintk("svc: svc_create_socket error = %d\n", -error); 1249 sock_release(sock); 1250 - return error; 1251 } 1252 1253 /* 1254 - * Remove a dead socket 1255 */ 1256 - static void 1257 - svc_delete_socket(struct svc_sock *svsk) 1258 { 1259 - struct svc_serv *serv; 1260 - struct sock *sk; 1261 1262 - dprintk("svc: svc_delete_socket(%p)\n", svsk); 1263 1264 - serv = svsk->sk_server; 1265 - sk = svsk->sk_sk; 1266 - 1267 sk->sk_state_change = svsk->sk_ostate; 1268 sk->sk_data_ready = svsk->sk_odata; 1269 sk->sk_write_space = svsk->sk_owspace; 1270 - 1271 - spin_lock_bh(&serv->sv_lock); 1272 - 1273 - if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) 1274 - list_del_init(&svsk->sk_list); 1275 - /* 1276 - * We used to delete the svc_sock from whichever list 1277 - * it's sk_ready node was on, but we don't actually 1278 - * need to. This is because the only time we're called 1279 - * while still attached to a queue, the queue itself 1280 - * is about to be destroyed (in svc_destroy). 
1281 - */ 1282 - if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) { 1283 - BUG_ON(atomic_read(&svsk->sk_inuse)<2); 1284 - atomic_dec(&svsk->sk_inuse); 1285 - if (test_bit(SK_TEMP, &svsk->sk_flags)) 1286 - serv->sv_tmpcnt--; 1287 - } 1288 - 1289 - spin_unlock_bh(&serv->sv_lock); 1290 - } 1291 - 1292 - static void svc_close_socket(struct svc_sock *svsk) 1293 - { 1294 - set_bit(SK_CLOSE, &svsk->sk_flags); 1295 - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) 1296 - /* someone else will have to effect the close */ 1297 - return; 1298 - 1299 - atomic_inc(&svsk->sk_inuse); 1300 - svc_delete_socket(svsk); 1301 - clear_bit(SK_BUSY, &svsk->sk_flags); 1302 - svc_sock_put(svsk); 1303 - } 1304 - 1305 - void svc_force_close_socket(struct svc_sock *svsk) 1306 - { 1307 - set_bit(SK_CLOSE, &svsk->sk_flags); 1308 - if (test_bit(SK_BUSY, &svsk->sk_flags)) { 1309 - /* Waiting to be processed, but no threads left, 1310 - * So just remove it from the waiting list 1311 - */ 1312 - list_del_init(&svsk->sk_ready); 1313 - clear_bit(SK_BUSY, &svsk->sk_flags); 1314 - } 1315 - svc_close_socket(svsk); 1316 - } 1317 - 1318 - /** 1319 - * svc_makesock - Make a socket for nfsd and lockd 1320 - * @serv: RPC server structure 1321 - * @protocol: transport protocol to use 1322 - * @port: port to use 1323 - * @flags: requested socket characteristics 1324 - * 1325 - */ 1326 - int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port, 1327 - int flags) 1328 - { 1329 - struct sockaddr_in sin = { 1330 - .sin_family = AF_INET, 1331 - .sin_addr.s_addr = INADDR_ANY, 1332 - .sin_port = htons(port), 1333 - }; 1334 - 1335 - dprintk("svc: creating socket proto = %d\n", protocol); 1336 - return svc_create_socket(serv, protocol, (struct sockaddr *) &sin, 1337 - sizeof(sin), flags); 1338 } 1339 1340 /* 1341 - * Handle defer and revisit of requests 1342 */ 1343 - 1344 - static void svc_revisit(struct cache_deferred_req *dreq, int too_many) 1345 { 1346 - struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); 1347 - struct svc_sock *svsk; 1348 1349 - if (too_many) { 1350 - svc_sock_put(dr->svsk); 1351 - kfree(dr); 1352 - return; 1353 - } 1354 - dprintk("revisit queued\n"); 1355 - svsk = dr->svsk; 1356 - dr->svsk = NULL; 1357 - spin_lock(&svsk->sk_lock); 1358 - list_add(&dr->handle.recent, &svsk->sk_deferred); 1359 - spin_unlock(&svsk->sk_lock); 1360 - set_bit(SK_DEFERRED, &svsk->sk_flags); 1361 - svc_sock_enqueue(svsk); 1362 - svc_sock_put(svsk); 1363 - } 1364 - 1365 - static struct cache_deferred_req * 1366 - svc_defer(struct cache_req *req) 1367 - { 1368 - struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); 1369 - int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); 1370 - struct svc_deferred_req *dr; 1371 - 1372 - if (rqstp->rq_arg.page_len) 1373 - return NULL; /* if more than a page, give up FIXME */ 1374 - if (rqstp->rq_deferred) { 1375 - dr = rqstp->rq_deferred; 1376 - rqstp->rq_deferred = NULL; 1377 - } else { 1378 - int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; 1379 - /* FIXME maybe discard if size too large */ 1380 - dr = kmalloc(size, GFP_KERNEL); 1381 - if (dr == NULL) 1382 - return NULL; 1383 - 1384 - dr->handle.owner = rqstp->rq_server; 1385 - dr->prot = rqstp->rq_prot; 1386 - memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); 1387 - dr->addrlen = rqstp->rq_addrlen; 1388 - dr->daddr = rqstp->rq_daddr; 1389 - dr->argslen = rqstp->rq_arg.len >> 2; 1390 - memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); 1391 - } 
1392 - atomic_inc(&rqstp->rq_sock->sk_inuse); 1393 - dr->svsk = rqstp->rq_sock; 1394 - 1395 - dr->handle.revisit = svc_revisit; 1396 - return &dr->handle; 1397 - } 1398 - 1399 - /* 1400 - * recv data from a deferred request into an active one 1401 - */ 1402 - static int svc_deferred_recv(struct svc_rqst *rqstp) 1403 - { 1404 - struct svc_deferred_req *dr = rqstp->rq_deferred; 1405 - 1406 - rqstp->rq_arg.head[0].iov_base = dr->args; 1407 - rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; 1408 - rqstp->rq_arg.page_len = 0; 1409 - rqstp->rq_arg.len = dr->argslen<<2; 1410 - rqstp->rq_prot = dr->prot; 1411 - memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); 1412 - rqstp->rq_addrlen = dr->addrlen; 1413 - rqstp->rq_daddr = dr->daddr; 1414 - rqstp->rq_respages = rqstp->rq_pages; 1415 - return dr->argslen<<2; 1416 - } 1417 - 1418 - 1419 - static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) 1420 - { 1421 - struct svc_deferred_req *dr = NULL; 1422 - 1423 - if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) 1424 - return NULL; 1425 - spin_lock(&svsk->sk_lock); 1426 - clear_bit(SK_DEFERRED, &svsk->sk_flags); 1427 - if (!list_empty(&svsk->sk_deferred)) { 1428 - dr = list_entry(svsk->sk_deferred.next, 1429 - struct svc_deferred_req, 1430 - handle.recent); 1431 - list_del_init(&dr->handle.recent); 1432 - set_bit(SK_DEFERRED, &svsk->sk_flags); 1433 - } 1434 - spin_unlock(&svsk->sk_lock); 1435 - return dr; 1436 }
··· 5 * 6 * The server scheduling algorithm does not always distribute the load 7 * evenly when servicing a single client. May need to modify the 8 + * svc_xprt_enqueue procedure... 9 * 10 * TCP support is largely untested and may be a little slow. The problem 11 * is that we currently do two separate recvfrom's, one for the 4-byte ··· 48 #include <linux/sunrpc/svcsock.h> 49 #include <linux/sunrpc/stats.h> 50 51 + #define RPCDBG_FACILITY RPCDBG_SVCXPRT 52 53 54 static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, 55 int *errp, int flags); 56 static void svc_udp_data_ready(struct sock *, int); 57 static int svc_udp_recvfrom(struct svc_rqst *); 58 static int svc_udp_sendto(struct svc_rqst *); 59 + static void svc_sock_detach(struct svc_xprt *); 60 + static void svc_sock_free(struct svc_xprt *); 61 62 + static struct svc_xprt *svc_create_socket(struct svc_serv *, int, 63 + struct sockaddr *, int, int); 64 #ifdef CONFIG_DEBUG_LOCK_ALLOC 65 static struct lock_class_key svc_key[2]; 66 static struct lock_class_key svc_slock_key[2]; 67 68 + static void svc_reclassify_socket(struct socket *sock) 69 { 70 struct sock *sk = sock->sk; 71 BUG_ON(sock_owned_by_user(sk)); 72 switch (sk->sk_family) { 73 case AF_INET: 74 sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", 75 + &svc_slock_key[0], 76 + "sk_xprt.xpt_lock-AF_INET-NFSD", 77 + &svc_key[0]); 78 break; 79 80 case AF_INET6: 81 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", 82 + &svc_slock_key[1], 83 + "sk_xprt.xpt_lock-AF_INET6-NFSD", 84 + &svc_key[1]); 85 break; 86 87 default: ··· 121 } 122 } 123 #else 124 + static void svc_reclassify_socket(struct socket *sock) 125 { 126 } 127 #endif 128 129 /* 130 * Release an skbuff after use 131 */ 132 + static void svc_release_skb(struct svc_rqst *rqstp) 133 { 134 + struct sk_buff *skb = rqstp->rq_xprt_ctxt; 135 struct svc_deferred_req *dr = rqstp->rq_deferred; 136 137 if (skb) { 138 + struct svc_sock *svsk = 139 + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); 140 + rqstp->rq_xprt_ctxt = NULL; 141 142 dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); 143 + skb_free_datagram(svsk->sk_sk, skb); 144 } 145 if (dr) { 146 rqstp->rq_deferred = NULL; 147 kfree(dr); 148 } 149 } 150 ··· 459 460 static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) 461 { 462 + struct svc_sock *svsk = 463 + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); 464 + switch (svsk->sk_sk->sk_family) { 465 case AF_INET: { 466 struct in_pktinfo *pki = CMSG_DATA(cmh); 467 ··· 489 /* 490 * Generic sendto routine 491 */ 492 + static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) 493 { 494 + struct svc_sock *svsk = 495 + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); 496 struct socket *sock = svsk->sk_sock; 497 int slen; 498 union { ··· 565 } 566 out: 567 dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n", 568 + svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, 569 xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); 570 571 return len; ··· 602 if (!serv) 603 return 0; 604 spin_lock_bh(&serv->sv_lock); 605 + list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) { 606 int onelen = one_sock_name(buf+len, svsk); 607 if (toclose && strcmp(toclose, buf+len) == 0) 608 closesk = svsk; ··· 614 /* Should unregister with portmap, but you cannot 615 * unregister just one protocol... 
616 */ 617 + svc_close_xprt(&closesk->sk_xprt); 618 else if (toclose) 619 return -ENOENT; 620 return len; ··· 624 /* 625 * Check input queue length 626 */ 627 + static int svc_recv_available(struct svc_sock *svsk) 628 { 629 struct socket *sock = svsk->sk_sock; 630 int avail, err; ··· 638 /* 639 * Generic recvfrom routine. 640 */ 641 + static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, 642 + int buflen) 643 { 644 + struct svc_sock *svsk = 645 + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); 646 struct msghdr msg = { 647 .msg_flags = MSG_DONTWAIT, 648 }; 649 int len; 650 + 651 + rqstp->rq_xprt_hlen = 0; 652 653 len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, 654 msg.msg_flags); 655 656 dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", 657 svsk, iov[0].iov_base, iov[0].iov_len, len); 658 return len; 659 } 660 661 /* 662 * Set socket snd and rcv buffer lengths 663 */ 664 + static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, 665 + unsigned int rcv) 666 { 667 #if 0 668 mm_segment_t oldfs; ··· 704 /* 705 * INET callback when data has been received on the socket. 706 */ 707 + static void svc_udp_data_ready(struct sock *sk, int count) 708 { 709 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 710 711 if (svsk) { 712 dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", 713 + svsk, sk, count, 714 + test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); 715 + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 716 + svc_xprt_enqueue(&svsk->sk_xprt); 717 } 718 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 719 wake_up_interruptible(sk->sk_sleep); ··· 722 /* 723 * INET callback when space is newly available on the socket. 724 */ 725 + static void svc_write_space(struct sock *sk) 726 { 727 struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); 728 729 if (svsk) { 730 dprintk("svc: socket %p(inet %p), write_space busy=%d\n", 731 + svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); 732 + svc_xprt_enqueue(&svsk->sk_xprt); 733 } 734 735 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { ··· 740 } 741 } 742 743 + /* 744 + * Copy the UDP datagram's destination address to the rqstp structure. 745 + * The 'destination' address in this case is the address to which the 746 + * peer sent the datagram, i.e. our local address. For multihomed 747 + * hosts, this can change from msg to msg. Note that only the IP 748 + * address changes, the port number should remain the same. 749 + */ 750 + static void svc_udp_get_dest_address(struct svc_rqst *rqstp, 751 + struct cmsghdr *cmh) 752 { 753 + struct svc_sock *svsk = 754 + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); 755 + switch (svsk->sk_sk->sk_family) { 756 case AF_INET: { 757 struct in_pktinfo *pki = CMSG_DATA(cmh); 758 rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; ··· 760 /* 761 * Receive a datagram from a UDP socket. 762 */ 763 + static int svc_udp_recvfrom(struct svc_rqst *rqstp) 764 { 765 + struct svc_sock *svsk = 766 + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); 767 + struct svc_serv *serv = svsk->sk_xprt.xpt_server; 768 struct sk_buff *skb; 769 union { 770 struct cmsghdr hdr; ··· 779 .msg_flags = MSG_DONTWAIT, 780 }; 781 782 + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) 783 /* udp sockets need large rcvbuf as all pending 784 * requests are still in that buffer. 
sndbuf must 785 * also be large enough that there is enough space ··· 792 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 793 (serv->sv_nrthreads+3) * serv->sv_max_mesg); 794 795 + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 796 skb = NULL; 797 err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 798 0, 0, MSG_PEEK | MSG_DONTWAIT); ··· 813 if (err != -EAGAIN) { 814 /* possibly an icmp error */ 815 dprintk("svc: recvfrom returned error %d\n", -err); 816 + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 817 } 818 + svc_xprt_received(&svsk->sk_xprt); 819 return -EAGAIN; 820 } 821 + len = svc_addr_len(svc_addr(rqstp)); 822 + if (len < 0) 823 + return len; 824 + rqstp->rq_addrlen = len; 825 if (skb->tstamp.tv64 == 0) { 826 skb->tstamp = ktime_get_real(); 827 /* Don't enable netstamp, sunrpc doesn't 828 need that much accuracy */ 829 } 830 svsk->sk_sk->sk_stamp = skb->tstamp; 831 + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ 832 833 /* 834 * Maybe more packets - kick another thread ASAP. 835 */ 836 + svc_xprt_received(&svsk->sk_xprt); 837 838 len = skb->len - sizeof(struct udphdr); 839 rqstp->rq_arg.len = len; ··· 861 skb_free_datagram(svsk->sk_sk, skb); 862 } else { 863 /* we can use it in-place */ 864 + rqstp->rq_arg.head[0].iov_base = skb->data + 865 + sizeof(struct udphdr); 866 rqstp->rq_arg.head[0].iov_len = len; 867 if (skb_checksum_complete(skb)) { 868 skb_free_datagram(svsk->sk_sk, skb); 869 return 0; 870 } 871 + rqstp->rq_xprt_ctxt = skb; 872 } 873 874 rqstp->rq_arg.page_base = 0; ··· 900 return error; 901 } 902 903 + static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) 904 + { 905 + } 906 + 907 + static int svc_udp_has_wspace(struct svc_xprt *xprt) 908 + { 909 + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); 910 + struct svc_serv *serv = xprt->xpt_server; 911 + unsigned long required; 912 + 913 + /* 914 + * Set the SOCK_NOSPACE flag before checking the available 915 + * sock space. 
916 + */ 917 + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 918 + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; 919 + if (required*2 > sock_wspace(svsk->sk_sk)) 920 + return 0; 921 + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 922 + return 1; 923 + } 924 + 925 + static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt) 926 + { 927 + BUG(); 928 + return NULL; 929 + } 930 + 931 + static struct svc_xprt *svc_udp_create(struct svc_serv *serv, 932 + struct sockaddr *sa, int salen, 933 + int flags) 934 + { 935 + return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags); 936 + } 937 + 938 + static struct svc_xprt_ops svc_udp_ops = { 939 + .xpo_create = svc_udp_create, 940 + .xpo_recvfrom = svc_udp_recvfrom, 941 + .xpo_sendto = svc_udp_sendto, 942 + .xpo_release_rqst = svc_release_skb, 943 + .xpo_detach = svc_sock_detach, 944 + .xpo_free = svc_sock_free, 945 + .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, 946 + .xpo_has_wspace = svc_udp_has_wspace, 947 + .xpo_accept = svc_udp_accept, 948 + }; 949 + 950 + static struct svc_xprt_class svc_udp_class = { 951 + .xcl_name = "udp", 952 + .xcl_owner = THIS_MODULE, 953 + .xcl_ops = &svc_udp_ops, 954 + .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, 955 + }; 956 + 957 + static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) 958 { 959 int one = 1; 960 mm_segment_t oldfs; 961 962 + svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); 963 + clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); 964 svsk->sk_sk->sk_data_ready = svc_udp_data_ready; 965 svsk->sk_sk->sk_write_space = svc_write_space; 966 967 /* initialise setting must have enough space to 968 * receive and respond to one request. 969 * svc_udp_recvfrom will re-adjust if necessary 970 */ 971 svc_sock_setbufsize(svsk->sk_sock, 972 + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, 973 + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); 974 975 + /* data might have come in before data_ready set up */ 976 + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 977 + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); 978 979 oldfs = get_fs(); 980 set_fs(KERNEL_DS); ··· 934 * A data_ready event on a listening socket means there's a connection 935 * pending. Do not use state_change as a substitute for it. 936 */ 937 + static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused) 938 { 939 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 940 ··· 954 */ 955 if (sk->sk_state == TCP_LISTEN) { 956 if (svsk) { 957 + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); 958 + svc_xprt_enqueue(&svsk->sk_xprt); 959 } else 960 printk("svc: socket %p: no user data\n", sk); 961 } ··· 967 /* 968 * A state change on a connected socket means it's dying or dead. 
969 */ 970 + static void svc_tcp_state_change(struct sock *sk) 971 { 972 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 973 ··· 978 if (!svsk) 979 printk("svc: socket %p: no user data\n", sk); 980 else { 981 + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); 982 + svc_xprt_enqueue(&svsk->sk_xprt); 983 } 984 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 985 wake_up_interruptible_all(sk->sk_sleep); 986 } 987 988 + static void svc_tcp_data_ready(struct sock *sk, int count) 989 { 990 struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; 991 992 dprintk("svc: socket %p TCP data ready (svsk %p)\n", 993 sk, sk->sk_user_data); 994 if (svsk) { 995 + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 996 + svc_xprt_enqueue(&svsk->sk_xprt); 997 } 998 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) 999 wake_up_interruptible(sk->sk_sleep); 1000 } 1001 1002 /* 1003 * Accept a TCP connection 1004 */ 1005 + static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) 1006 { 1007 + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); 1008 struct sockaddr_storage addr; 1009 struct sockaddr *sin = (struct sockaddr *) &addr; 1010 + struct svc_serv *serv = svsk->sk_xprt.xpt_server; 1011 struct socket *sock = svsk->sk_sock; 1012 struct socket *newsock; 1013 struct svc_sock *newsvsk; ··· 1031 1032 dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); 1033 if (!sock) 1034 + return NULL; 1035 1036 + clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); 1037 err = kernel_accept(sock, &newsock, O_NONBLOCK); 1038 if (err < 0) { 1039 if (err == -ENOMEM) ··· 1042 else if (err != -EAGAIN && net_ratelimit()) 1043 printk(KERN_WARNING "%s: accept failed (err %d)!\n", 1044 serv->sv_name, -err); 1045 + return NULL; 1046 } 1047 + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); 1048 1049 err = kernel_getpeername(newsock, sin, &slen); 1050 if (err < 0) { ··· 1077 if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 1078 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) 1079 goto failed; 1080 + svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); 1081 err = kernel_getsockname(newsock, sin, &slen); 1082 if (unlikely(err < 0)) { 1083 dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); 1084 slen = offsetof(struct sockaddr, sa_data); 1085 } 1086 + svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); 1087 1088 if (serv->sv_stats) 1089 serv->sv_stats->nettcpconn++; 1090 1091 + return &newsvsk->sk_xprt; 1092 1093 failed: 1094 sock_release(newsock); 1095 + return NULL; 1096 } 1097 1098 /* 1099 * Receive data from a TCP socket. 1100 */ 1101 + static int svc_tcp_recvfrom(struct svc_rqst *rqstp) 1102 { 1103 + struct svc_sock *svsk = 1104 + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); 1105 + struct svc_serv *serv = svsk->sk_xprt.xpt_server; 1106 int len; 1107 struct kvec *vec; 1108 int pnum, vlen; 1109 1110 dprintk("svc: tcp_recv %p data %d conn %d close %d\n", 1111 + svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), 1112 + test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), 1113 + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); 1114 1115 + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) 1116 /* sndbuf needs to have room for one request 1117 * per thread, otherwise we can stall even when the 1118 * network isn't a bottleneck. ··· 1193 (serv->sv_nrthreads+3) * serv->sv_max_mesg, 1194 3 * serv->sv_max_mesg); 1195 1196 + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 1197 1198 /* Receive data. If we haven't got the record length yet, get 1199 * the next four bytes. 
Otherwise try to gobble up as much as ··· 1212 if (len < want) { 1213 dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", 1214 len, want); 1215 + svc_xprt_received(&svsk->sk_xprt); 1216 return -EAGAIN; /* record header not complete */ 1217 } 1218 ··· 1248 if (len < svsk->sk_reclen) { 1249 dprintk("svc: incomplete TCP record (%d of %d)\n", 1250 len, svsk->sk_reclen); 1251 + svc_xprt_received(&svsk->sk_xprt); 1252 return -EAGAIN; /* record not complete */ 1253 } 1254 len = svsk->sk_reclen; 1255 + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 1256 1257 vec = rqstp->rq_vec; 1258 vec[0] = rqstp->rq_arg.head[0]; ··· 1281 rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; 1282 } 1283 1284 + rqstp->rq_xprt_ctxt = NULL; 1285 rqstp->rq_prot = IPPROTO_TCP; 1286 1287 /* Reset TCP read info */ 1288 svsk->sk_reclen = 0; 1289 svsk->sk_tcplen = 0; 1290 1291 + svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); 1292 + svc_xprt_received(&svsk->sk_xprt); 1293 if (serv->sv_stats) 1294 serv->sv_stats->nettcpcnt++; 1295 1296 return len; 1297 1298 err_delete: 1299 + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); 1300 return -EAGAIN; 1301 1302 error: 1303 if (len == -EAGAIN) { 1304 dprintk("RPC: TCP recvfrom got EAGAIN\n"); 1305 + svc_xprt_received(&svsk->sk_xprt); 1306 } else { 1307 printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", 1308 + svsk->sk_xprt.xpt_server->sv_name, -len); 1309 goto err_delete; 1310 } 1311 ··· 1314 /* 1315 * Send out data on TCP socket. 1316 */ 1317 + static int svc_tcp_sendto(struct svc_rqst *rqstp) 1318 { 1319 struct xdr_buf *xbufp = &rqstp->rq_res; 1320 int sent; ··· 1328 reclen = htonl(0x80000000|((xbufp->len ) - 4)); 1329 memcpy(xbufp->head[0].iov_base, &reclen, 4); 1330 1331 + if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags)) 1332 return -ENOTCONN; 1333 1334 sent = svc_sendto(rqstp, &rqstp->rq_res); 1335 if (sent != xbufp->len) { 1336 + printk(KERN_NOTICE 1337 + "rpc-srv/tcp: %s: %s %d when sending %d bytes " 1338 + "- shutting down socket\n", 1339 + rqstp->rq_xprt->xpt_server->sv_name, 1340 (sent<0)?"got error":"sent only", 1341 sent, xbufp->len); 1342 + set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); 1343 + svc_xprt_enqueue(rqstp->rq_xprt); 1344 sent = -EAGAIN; 1345 } 1346 return sent; 1347 } 1348 1349 + /* 1350 + * Setup response header. TCP has a 4B record length field. 1351 + */ 1352 + static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) 1353 + { 1354 + struct kvec *resv = &rqstp->rq_res.head[0]; 1355 + 1356 + /* tcp needs a space for the record length... */ 1357 + svc_putnl(resv, 0); 1358 + } 1359 + 1360 + static int svc_tcp_has_wspace(struct svc_xprt *xprt) 1361 + { 1362 + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); 1363 + struct svc_serv *serv = svsk->sk_xprt.xpt_server; 1364 + int required; 1365 + int wspace; 1366 + 1367 + /* 1368 + * Set the SOCK_NOSPACE flag before checking the available 1369 + * sock space. 
1370 + */ 1371 + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 1372 + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; 1373 + wspace = sk_stream_wspace(svsk->sk_sk); 1374 + 1375 + if (wspace < sk_stream_min_wspace(svsk->sk_sk)) 1376 + return 0; 1377 + if (required * 2 > wspace) 1378 + return 0; 1379 + 1380 + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); 1381 + return 1; 1382 + } 1383 + 1384 + static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, 1385 + struct sockaddr *sa, int salen, 1386 + int flags) 1387 + { 1388 + return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags); 1389 + } 1390 + 1391 + static struct svc_xprt_ops svc_tcp_ops = { 1392 + .xpo_create = svc_tcp_create, 1393 + .xpo_recvfrom = svc_tcp_recvfrom, 1394 + .xpo_sendto = svc_tcp_sendto, 1395 + .xpo_release_rqst = svc_release_skb, 1396 + .xpo_detach = svc_sock_detach, 1397 + .xpo_free = svc_sock_free, 1398 + .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, 1399 + .xpo_has_wspace = svc_tcp_has_wspace, 1400 + .xpo_accept = svc_tcp_accept, 1401 + }; 1402 + 1403 + static struct svc_xprt_class svc_tcp_class = { 1404 + .xcl_name = "tcp", 1405 + .xcl_owner = THIS_MODULE, 1406 + .xcl_ops = &svc_tcp_ops, 1407 + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, 1408 + }; 1409 + 1410 + void svc_init_xprt_sock(void) 1411 + { 1412 + svc_reg_xprt_class(&svc_tcp_class); 1413 + svc_reg_xprt_class(&svc_udp_class); 1414 + } 1415 + 1416 + void svc_cleanup_xprt_sock(void) 1417 + { 1418 + svc_unreg_xprt_class(&svc_tcp_class); 1419 + svc_unreg_xprt_class(&svc_udp_class); 1420 + } 1421 + 1422 + static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) 1423 { 1424 struct sock *sk = svsk->sk_sk; 1425 struct tcp_sock *tp = tcp_sk(sk); 1426 1427 + svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv); 1428 + set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); 1429 if (sk->sk_state == TCP_LISTEN) { 1430 dprintk("setting up TCP socket for listening\n"); 1431 + set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); 1432 sk->sk_data_ready = svc_tcp_listen_data_ready; 1433 + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); 1434 } else { 1435 dprintk("setting up TCP socket for reading\n"); 1436 sk->sk_state_change = svc_tcp_state_change; ··· 1373 * svc_tcp_recvfrom will re-adjust if necessary 1374 */ 1375 svc_sock_setbufsize(svsk->sk_sock, 1376 + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, 1377 + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); 1378 1379 + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); 1380 + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); 1381 if (sk->sk_state != TCP_ESTABLISHED) 1382 + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); 1383 } 1384 } 1385 1386 + void svc_sock_update_bufs(struct svc_serv *serv) 1387 { 1388 /* 1389 * The number of server threads has changed. 
Update ··· 1395 spin_lock_bh(&serv->sv_lock); 1396 list_for_each(le, &serv->sv_permsocks) { 1397 struct svc_sock *svsk = 1398 + list_entry(le, struct svc_sock, sk_xprt.xpt_list); 1399 + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); 1400 } 1401 list_for_each(le, &serv->sv_tempsocks) { 1402 struct svc_sock *svsk = 1403 + list_entry(le, struct svc_sock, sk_xprt.xpt_list); 1404 + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); 1405 } 1406 spin_unlock_bh(&serv->sv_lock); 1407 } 1408 1409 /* ··· 1631 struct svc_sock *svsk; 1632 struct sock *inet; 1633 int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); 1634 1635 dprintk("svc: svc_setup_socket %p\n", sock); 1636 if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { ··· 1651 return NULL; 1652 } 1653 1654 inet->sk_user_data = svsk; 1655 svsk->sk_sock = sock; 1656 svsk->sk_sk = inet; 1657 svsk->sk_ostate = inet->sk_state_change; 1658 svsk->sk_odata = inet->sk_data_ready; 1659 svsk->sk_owspace = inet->sk_write_space; 1660 1661 /* Initialize the socket */ 1662 if (sock->type == SOCK_DGRAM) 1663 + svc_udp_init(svsk, serv); 1664 else 1665 + svc_tcp_init(svsk, serv); 1666 1667 dprintk("svc: svc_setup_socket created %p (inet %p)\n", 1668 svsk, svsk->sk_sk); ··· 1717 else { 1718 svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); 1719 if (svsk) { 1720 + struct sockaddr_storage addr; 1721 + struct sockaddr *sin = (struct sockaddr *)&addr; 1722 + int salen; 1723 + if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) 1724 + svc_xprt_set_local(&svsk->sk_xprt, sin, salen); 1725 + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); 1726 + spin_lock_bh(&serv->sv_lock); 1727 + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); 1728 + spin_unlock_bh(&serv->sv_lock); 1729 + svc_xprt_received(&svsk->sk_xprt); 1730 err = 0; 1731 } 1732 } ··· 1733 /* 1734 * Create socket for RPC service. 1735 */ 1736 + static struct svc_xprt *svc_create_socket(struct svc_serv *serv, 1737 + int protocol, 1738 + struct sockaddr *sin, int len, 1739 + int flags) 1740 { 1741 struct svc_sock *svsk; 1742 struct socket *sock; 1743 int error; 1744 int type; 1745 char buf[RPC_MAX_ADDRBUFLEN]; 1746 + struct sockaddr_storage addr; 1747 + struct sockaddr *newsin = (struct sockaddr *)&addr; 1748 + int newlen; 1749 1750 dprintk("svc: svc_create_socket(%s, %d, %s)\n", 1751 serv->sv_program->pg_name, protocol, ··· 1749 if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { 1750 printk(KERN_WARNING "svc: only UDP and TCP " 1751 "sockets supported\n"); 1752 + return ERR_PTR(-EINVAL); 1753 } 1754 type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; 1755 1756 error = sock_create_kern(sin->sa_family, type, protocol, &sock); 1757 if (error < 0) 1758 + return ERR_PTR(error); 1759 1760 svc_reclassify_socket(sock); 1761 ··· 1765 if (error < 0) 1766 goto bummer; 1767 1768 + newlen = len; 1769 + error = kernel_getsockname(sock, newsin, &newlen); 1770 + if (error < 0) 1771 + goto bummer; 1772 + 1773 if (protocol == IPPROTO_TCP) { 1774 if ((error = kernel_listen(sock, 64)) < 0) 1775 goto bummer; 1776 } 1777 1778 if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { 1779 + svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); 1780 + return (struct svc_xprt *)svsk; 1781 } 1782 1783 bummer: 1784 dprintk("svc: svc_create_socket error = %d\n", -error); 1785 sock_release(sock); 1786 + return ERR_PTR(error); 1787 } 1788 1789 /* 1790 + * Detach the svc_sock from the socket so that no 1791 + * more callbacks occur. 
1792 */ 1793 + static void svc_sock_detach(struct svc_xprt *xprt) 1794 { 1795 + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); 1796 + struct sock *sk = svsk->sk_sk; 1797 1798 + dprintk("svc: svc_sock_detach(%p)\n", svsk); 1799 1800 + /* put back the old socket callbacks */ 1801 sk->sk_state_change = svsk->sk_ostate; 1802 sk->sk_data_ready = svsk->sk_odata; 1803 sk->sk_write_space = svsk->sk_owspace; 1804 } 1805 1806 /* 1807 + * Free the svc_sock's socket resources and the svc_sock itself. 1808 */ 1809 + static void svc_sock_free(struct svc_xprt *xprt) 1810 { 1811 + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); 1812 + dprintk("svc: svc_sock_free(%p)\n", svsk); 1813 1814 + if (svsk->sk_sock->file) 1815 + sockfd_put(svsk->sk_sock); 1816 + else 1817 + sock_release(svsk->sk_sock); 1818 + kfree(svsk); 1819 }
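The xpo_has_wspace check added above reports write space only when the socket can absorb roughly twice the bytes already reserved for pending replies plus one maximum-sized message. A minimal user-space sketch of that arithmetic, with invented inputs (the helper name and numbers are illustrative, not kernel state):

/* Stand-alone illustration of the write-space heuristic: report space
 * only when the socket can take at least twice the bytes already
 * reserved plus one maximum-sized message. All inputs are made up. */
#include <stdbool.h>
#include <stdio.h>

static bool has_wspace(long reserved, long max_mesg, long wspace, long min_wspace)
{
	long required = reserved + max_mesg;	/* worst case for the next reply */

	if (wspace < min_wspace)		/* below the stream low-water mark */
		return false;
	if (required * 2 > wspace)		/* keep a 2x safety margin */
		return false;
	return true;
}

int main(void)
{
	/* hypothetical: 32 KiB reserved, 64 KiB max message, 256 KiB free */
	printf("%s\n", has_wspace(32 << 10, 64 << 10, 256 << 10, 4096) ? "ok" : "no space");
	return 0;
}

The 2x margin errs on the side of leaving a request queued rather than risking a stalled send once the reply is being written.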
+31
net/sunrpc/sysctl.c
··· 18 #include <linux/sunrpc/types.h> 19 #include <linux/sunrpc/sched.h> 20 #include <linux/sunrpc/stats.h> 21 22 /* 23 * Declare the debug flags here ··· 54 unregister_sysctl_table(sunrpc_table_header); 55 sunrpc_table_header = NULL; 56 } 57 } 58 59 static int ··· 171 .maxlen = sizeof(int), 172 .mode = 0644, 173 .proc_handler = &proc_dodebug 174 }, 175 { .ctl_name = 0 } 176 };
··· 18 #include <linux/sunrpc/types.h> 19 #include <linux/sunrpc/sched.h> 20 #include <linux/sunrpc/stats.h> 21 + #include <linux/sunrpc/svc_xprt.h> 22 23 /* 24 * Declare the debug flags here ··· 53 unregister_sysctl_table(sunrpc_table_header); 54 sunrpc_table_header = NULL; 55 } 56 + } 57 + 58 + static int proc_do_xprt(ctl_table *table, int write, struct file *file, 59 + void __user *buffer, size_t *lenp, loff_t *ppos) 60 + { 61 + char tmpbuf[256]; 62 + int len; 63 + if ((*ppos && !write) || !*lenp) { 64 + *lenp = 0; 65 + return 0; 66 + } 67 + if (write) 68 + return -EINVAL; 69 + else { 70 + len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); 71 + if (!access_ok(VERIFY_WRITE, buffer, len)) 72 + return -EFAULT; 73 + 74 + if (__copy_to_user(buffer, tmpbuf, len)) 75 + return -EFAULT; 76 + } 77 + *lenp -= len; 78 + *ppos += len; 79 + return 0; 80 } 81 82 static int ··· 146 .maxlen = sizeof(int), 147 .mode = 0644, 148 .proc_handler = &proc_dodebug 149 + }, 150 + { 151 + .procname = "transports", 152 + .maxlen = 256, 153 + .mode = 0444, 154 + .proc_handler = &proc_do_xprt, 155 }, 156 { .ctl_name = 0 } 157 };
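The new read-only /proc/sys/sunrpc/transports file above exposes the registered transport classes through svc_print_xprts(), formatted into a 256-byte buffer. A user-space sketch that reads it; the exact output (presumably one "name max_payload" pair per line) depends on which classes the running kernel has registered and is an assumption here:

#include <stdio.h>

int main(void)
{
	char buf[256];		/* proc_do_xprt itself formats into a 256-byte buffer */
	FILE *f = fopen("/proc/sys/sunrpc/transports", "r");
	size_t n;

	if (!f) {
		perror("/proc/sys/sunrpc/transports");
		return 1;
	}
	n = fread(buf, 1, sizeof(buf) - 1, f);
	buf[n] = '\0';
	fputs(buf, stdout);
	fclose(f);
	return 0;
}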
+5 -3
net/sunrpc/xdr.c
··· 96 EXPORT_SYMBOL(xdr_encode_string); 97 98 __be32 * 99 - xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen) 100 { 101 - unsigned int len; 102 103 - if ((len = ntohl(*p++)) > maxlen) 104 return NULL; 105 *lenp = len; 106 *sp = (char *) p;
··· 96 EXPORT_SYMBOL(xdr_encode_string); 97 98 __be32 * 99 + xdr_decode_string_inplace(__be32 *p, char **sp, 100 + unsigned int *lenp, unsigned int maxlen) 101 { 102 + u32 len; 103 104 + len = ntohl(*p++); 105 + if (len > maxlen) 106 return NULL; 107 *lenp = len; 108 *sp = (char *) p;
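Switching xdr_decode_string_inplace() to unsigned lengths means an oversized on-the-wire length can no longer slip past the maxlen comparison as a negative signed value. A stand-alone sketch of the same check on a hand-built XDR string (the helper name and buffer are illustrative, not the kernel routine):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Decode an XDR string: 4-byte big-endian length, then the bytes.
 * Treating the length as unsigned rejects absurd values outright. */
static const unsigned char *decode_string(const unsigned char *p,
					  const char **sp, uint32_t *lenp,
					  uint32_t maxlen)
{
	uint32_t len;

	memcpy(&len, p, 4);
	len = ntohl(len);
	if (len > maxlen)		/* unsigned compare: 0xffffffff fails here */
		return NULL;
	*lenp = len;
	*sp = (const char *)(p + 4);
	return p + 4 + len;
}

int main(void)
{
	/* "hi" on the wire: length 2 followed by the bytes (padding omitted) */
	unsigned char wire[] = { 0, 0, 0, 2, 'h', 'i' };
	const char *s;
	uint32_t len;

	if (decode_string(wire, &s, &len, 16))
		printf("%.*s (%u bytes)\n", (int)len, s, len);
	return 0;
}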
+5
net/sunrpc/xprtrdma/Makefile
··· 1 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o 2 3 xprtrdma-y := transport.o rpc_rdma.o verbs.o
··· 1 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o 2 3 xprtrdma-y := transport.o rpc_rdma.o verbs.o 4 + 5 + obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o 6 + 7 + svcrdma-y := svc_rdma.o svc_rdma_transport.o \ 8 + svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
+266
net/sunrpc/xprtrdma/svc_rdma.c
···
··· 1 + /* 2 + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + * 39 + * Author: Tom Tucker <tom@opengridcomputing.com> 40 + */ 41 + #include <linux/module.h> 42 + #include <linux/init.h> 43 + #include <linux/fs.h> 44 + #include <linux/sysctl.h> 45 + #include <linux/sunrpc/clnt.h> 46 + #include <linux/sunrpc/sched.h> 47 + #include <linux/sunrpc/svc_rdma.h> 48 + 49 + #define RPCDBG_FACILITY RPCDBG_SVCXPRT 50 + 51 + /* RPC/RDMA parameters */ 52 + unsigned int svcrdma_ord = RPCRDMA_ORD; 53 + static unsigned int min_ord = 1; 54 + static unsigned int max_ord = 4096; 55 + unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; 56 + static unsigned int min_max_requests = 4; 57 + static unsigned int max_max_requests = 16384; 58 + unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; 59 + static unsigned int min_max_inline = 4096; 60 + static unsigned int max_max_inline = 65536; 61 + 62 + atomic_t rdma_stat_recv; 63 + atomic_t rdma_stat_read; 64 + atomic_t rdma_stat_write; 65 + atomic_t rdma_stat_sq_starve; 66 + atomic_t rdma_stat_rq_starve; 67 + atomic_t rdma_stat_rq_poll; 68 + atomic_t rdma_stat_rq_prod; 69 + atomic_t rdma_stat_sq_poll; 70 + atomic_t rdma_stat_sq_prod; 71 + 72 + /* 73 + * This function implements reading and resetting an atomic_t stat 74 + * variable through read/write to a proc file. Any write to the file 75 + * resets the associated statistic to zero. Any read returns it's 76 + * current value. 
77 + */ 78 + static int read_reset_stat(ctl_table *table, int write, 79 + struct file *filp, void __user *buffer, size_t *lenp, 80 + loff_t *ppos) 81 + { 82 + atomic_t *stat = (atomic_t *)table->data; 83 + 84 + if (!stat) 85 + return -EINVAL; 86 + 87 + if (write) 88 + atomic_set(stat, 0); 89 + else { 90 + char str_buf[32]; 91 + char *data; 92 + int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); 93 + if (len >= 32) 94 + return -EFAULT; 95 + len = strlen(str_buf); 96 + if (*ppos > len) { 97 + *lenp = 0; 98 + return 0; 99 + } 100 + data = &str_buf[*ppos]; 101 + len -= *ppos; 102 + if (len > *lenp) 103 + len = *lenp; 104 + if (len && copy_to_user(buffer, str_buf, len)) 105 + return -EFAULT; 106 + *lenp = len; 107 + *ppos += len; 108 + } 109 + return 0; 110 + } 111 + 112 + static struct ctl_table_header *svcrdma_table_header; 113 + static ctl_table svcrdma_parm_table[] = { 114 + { 115 + .procname = "max_requests", 116 + .data = &svcrdma_max_requests, 117 + .maxlen = sizeof(unsigned int), 118 + .mode = 0644, 119 + .proc_handler = &proc_dointvec_minmax, 120 + .strategy = &sysctl_intvec, 121 + .extra1 = &min_max_requests, 122 + .extra2 = &max_max_requests 123 + }, 124 + { 125 + .procname = "max_req_size", 126 + .data = &svcrdma_max_req_size, 127 + .maxlen = sizeof(unsigned int), 128 + .mode = 0644, 129 + .proc_handler = &proc_dointvec_minmax, 130 + .strategy = &sysctl_intvec, 131 + .extra1 = &min_max_inline, 132 + .extra2 = &max_max_inline 133 + }, 134 + { 135 + .procname = "max_outbound_read_requests", 136 + .data = &svcrdma_ord, 137 + .maxlen = sizeof(unsigned int), 138 + .mode = 0644, 139 + .proc_handler = &proc_dointvec_minmax, 140 + .strategy = &sysctl_intvec, 141 + .extra1 = &min_ord, 142 + .extra2 = &max_ord, 143 + }, 144 + 145 + { 146 + .procname = "rdma_stat_read", 147 + .data = &rdma_stat_read, 148 + .maxlen = sizeof(atomic_t), 149 + .mode = 0644, 150 + .proc_handler = &read_reset_stat, 151 + }, 152 + { 153 + .procname = "rdma_stat_recv", 154 + .data = &rdma_stat_recv, 155 + .maxlen = sizeof(atomic_t), 156 + .mode = 0644, 157 + .proc_handler = &read_reset_stat, 158 + }, 159 + { 160 + .procname = "rdma_stat_write", 161 + .data = &rdma_stat_write, 162 + .maxlen = sizeof(atomic_t), 163 + .mode = 0644, 164 + .proc_handler = &read_reset_stat, 165 + }, 166 + { 167 + .procname = "rdma_stat_sq_starve", 168 + .data = &rdma_stat_sq_starve, 169 + .maxlen = sizeof(atomic_t), 170 + .mode = 0644, 171 + .proc_handler = &read_reset_stat, 172 + }, 173 + { 174 + .procname = "rdma_stat_rq_starve", 175 + .data = &rdma_stat_rq_starve, 176 + .maxlen = sizeof(atomic_t), 177 + .mode = 0644, 178 + .proc_handler = &read_reset_stat, 179 + }, 180 + { 181 + .procname = "rdma_stat_rq_poll", 182 + .data = &rdma_stat_rq_poll, 183 + .maxlen = sizeof(atomic_t), 184 + .mode = 0644, 185 + .proc_handler = &read_reset_stat, 186 + }, 187 + { 188 + .procname = "rdma_stat_rq_prod", 189 + .data = &rdma_stat_rq_prod, 190 + .maxlen = sizeof(atomic_t), 191 + .mode = 0644, 192 + .proc_handler = &read_reset_stat, 193 + }, 194 + { 195 + .procname = "rdma_stat_sq_poll", 196 + .data = &rdma_stat_sq_poll, 197 + .maxlen = sizeof(atomic_t), 198 + .mode = 0644, 199 + .proc_handler = &read_reset_stat, 200 + }, 201 + { 202 + .procname = "rdma_stat_sq_prod", 203 + .data = &rdma_stat_sq_prod, 204 + .maxlen = sizeof(atomic_t), 205 + .mode = 0644, 206 + .proc_handler = &read_reset_stat, 207 + }, 208 + { 209 + .ctl_name = 0, 210 + }, 211 + }; 212 + 213 + static ctl_table svcrdma_table[] = { 214 + { 215 + .procname = "svc_rdma", 216 + 
.mode = 0555, 217 + .child = svcrdma_parm_table 218 + }, 219 + { 220 + .ctl_name = 0, 221 + }, 222 + }; 223 + 224 + static ctl_table svcrdma_root_table[] = { 225 + { 226 + .ctl_name = CTL_SUNRPC, 227 + .procname = "sunrpc", 228 + .mode = 0555, 229 + .child = svcrdma_table 230 + }, 231 + { 232 + .ctl_name = 0, 233 + }, 234 + }; 235 + 236 + void svc_rdma_cleanup(void) 237 + { 238 + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); 239 + if (svcrdma_table_header) { 240 + unregister_sysctl_table(svcrdma_table_header); 241 + svcrdma_table_header = NULL; 242 + } 243 + svc_unreg_xprt_class(&svc_rdma_class); 244 + } 245 + 246 + int svc_rdma_init(void) 247 + { 248 + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); 249 + dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); 250 + dprintk("\tmax_requests : %d\n", svcrdma_max_requests); 251 + dprintk("\tsq_depth : %d\n", 252 + svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); 253 + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); 254 + if (!svcrdma_table_header) 255 + svcrdma_table_header = 256 + register_sysctl_table(svcrdma_root_table); 257 + 258 + /* Register RDMA with the SVC transport switch */ 259 + svc_reg_xprt_class(&svc_rdma_class); 260 + return 0; 261 + } 262 + MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); 263 + MODULE_DESCRIPTION("SVC RDMA Transport"); 264 + MODULE_LICENSE("Dual BSD/GPL"); 265 + module_init(svc_rdma_init); 266 + module_exit(svc_rdma_cleanup);
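Per the read_reset_stat() handler above, each statistic file under /proc/sys/sunrpc/svc_rdma/ returns the current counter on read and resets it to zero on any write. A user-space sketch of that interaction, assuming the svcrdma module is loaded so the files exist and the caller has write permission:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/sunrpc/svc_rdma/rdma_stat_read";
	char line[64];
	FILE *f;

	f = fopen(path, "r");			/* read: current value */
	if (f && fgets(line, sizeof(line), f))
		printf("rdma_stat_read = %s", line);
	if (f)
		fclose(f);

	f = fopen(path, "w");			/* any write resets the counter */
	if (f) {
		fputs("0\n", f);		/* the value written is ignored */
		fclose(f);
	}
	return 0;
}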
+412
net/sunrpc/xprtrdma/svc_rdma_marshal.c
···
··· 1 + /* 2 + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + * 39 + * Author: Tom Tucker <tom@opengridcomputing.com> 40 + */ 41 + 42 + #include <linux/sunrpc/xdr.h> 43 + #include <linux/sunrpc/debug.h> 44 + #include <asm/unaligned.h> 45 + #include <linux/sunrpc/rpc_rdma.h> 46 + #include <linux/sunrpc/svc_rdma.h> 47 + 48 + #define RPCDBG_FACILITY RPCDBG_SVCXPRT 49 + 50 + /* 51 + * Decodes a read chunk list. The expected format is as follows: 52 + * descrim : xdr_one 53 + * position : u32 offset into XDR stream 54 + * handle : u32 RKEY 55 + * . . . 56 + * end-of-list: xdr_zero 57 + */ 58 + static u32 *decode_read_list(u32 *va, u32 *vaend) 59 + { 60 + struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; 61 + 62 + while (ch->rc_discrim != xdr_zero) { 63 + u64 ch_offset; 64 + 65 + if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > 66 + (unsigned long)vaend) { 67 + dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); 68 + return NULL; 69 + } 70 + 71 + ch->rc_discrim = ntohl(ch->rc_discrim); 72 + ch->rc_position = ntohl(ch->rc_position); 73 + ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle); 74 + ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length); 75 + va = (u32 *)&ch->rc_target.rs_offset; 76 + xdr_decode_hyper(va, &ch_offset); 77 + put_unaligned(ch_offset, (u64 *)va); 78 + ch++; 79 + } 80 + return (u32 *)&ch->rc_position; 81 + } 82 + 83 + /* 84 + * Determine number of chunks and total bytes in chunk list. The chunk 85 + * list has already been verified to fit within the RPCRDMA header. 
86 + */ 87 + void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, 88 + int *ch_count, int *byte_count) 89 + { 90 + /* compute the number of bytes represented by read chunks */ 91 + *byte_count = 0; 92 + *ch_count = 0; 93 + for (; ch->rc_discrim != 0; ch++) { 94 + *byte_count = *byte_count + ch->rc_target.rs_length; 95 + *ch_count = *ch_count + 1; 96 + } 97 + } 98 + 99 + /* 100 + * Decodes a write chunk list. The expected format is as follows: 101 + * descrim : xdr_one 102 + * nchunks : <count> 103 + * handle : u32 RKEY ---+ 104 + * length : u32 <len of segment> | 105 + * offset : remove va + <count> 106 + * . . . | 107 + * ---+ 108 + */ 109 + static u32 *decode_write_list(u32 *va, u32 *vaend) 110 + { 111 + int ch_no; 112 + struct rpcrdma_write_array *ary = 113 + (struct rpcrdma_write_array *)va; 114 + 115 + /* Check for not write-array */ 116 + if (ary->wc_discrim == xdr_zero) 117 + return (u32 *)&ary->wc_nchunks; 118 + 119 + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > 120 + (unsigned long)vaend) { 121 + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); 122 + return NULL; 123 + } 124 + ary->wc_discrim = ntohl(ary->wc_discrim); 125 + ary->wc_nchunks = ntohl(ary->wc_nchunks); 126 + if (((unsigned long)&ary->wc_array[0] + 127 + (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > 128 + (unsigned long)vaend) { 129 + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", 130 + ary, ary->wc_nchunks, vaend); 131 + return NULL; 132 + } 133 + for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { 134 + u64 ch_offset; 135 + 136 + ary->wc_array[ch_no].wc_target.rs_handle = 137 + ntohl(ary->wc_array[ch_no].wc_target.rs_handle); 138 + ary->wc_array[ch_no].wc_target.rs_length = 139 + ntohl(ary->wc_array[ch_no].wc_target.rs_length); 140 + va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; 141 + xdr_decode_hyper(va, &ch_offset); 142 + put_unaligned(ch_offset, (u64 *)va); 143 + } 144 + 145 + /* 146 + * rs_length is the 2nd 4B field in wc_target and taking its 147 + * address skips the list terminator 148 + */ 149 + return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length; 150 + } 151 + 152 + static u32 *decode_reply_array(u32 *va, u32 *vaend) 153 + { 154 + int ch_no; 155 + struct rpcrdma_write_array *ary = 156 + (struct rpcrdma_write_array *)va; 157 + 158 + /* Check for no reply-array */ 159 + if (ary->wc_discrim == xdr_zero) 160 + return (u32 *)&ary->wc_nchunks; 161 + 162 + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > 163 + (unsigned long)vaend) { 164 + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); 165 + return NULL; 166 + } 167 + ary->wc_discrim = ntohl(ary->wc_discrim); 168 + ary->wc_nchunks = ntohl(ary->wc_nchunks); 169 + if (((unsigned long)&ary->wc_array[0] + 170 + (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > 171 + (unsigned long)vaend) { 172 + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", 173 + ary, ary->wc_nchunks, vaend); 174 + return NULL; 175 + } 176 + for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { 177 + u64 ch_offset; 178 + 179 + ary->wc_array[ch_no].wc_target.rs_handle = 180 + ntohl(ary->wc_array[ch_no].wc_target.rs_handle); 181 + ary->wc_array[ch_no].wc_target.rs_length = 182 + ntohl(ary->wc_array[ch_no].wc_target.rs_length); 183 + va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; 184 + xdr_decode_hyper(va, &ch_offset); 185 + put_unaligned(ch_offset, (u64 *)va); 186 + } 187 + 188 + return (u32 *)&ary->wc_array[ch_no]; 189 + } 190 + 191 + int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, 192 + 
struct svc_rqst *rqstp) 193 + { 194 + struct rpcrdma_msg *rmsgp = NULL; 195 + u32 *va; 196 + u32 *vaend; 197 + u32 hdr_len; 198 + 199 + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; 200 + 201 + /* Verify that there's enough bytes for header + something */ 202 + if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) { 203 + dprintk("svcrdma: header too short = %d\n", 204 + rqstp->rq_arg.len); 205 + return -EINVAL; 206 + } 207 + 208 + /* Decode the header */ 209 + rmsgp->rm_xid = ntohl(rmsgp->rm_xid); 210 + rmsgp->rm_vers = ntohl(rmsgp->rm_vers); 211 + rmsgp->rm_credit = ntohl(rmsgp->rm_credit); 212 + rmsgp->rm_type = ntohl(rmsgp->rm_type); 213 + 214 + if (rmsgp->rm_vers != RPCRDMA_VERSION) 215 + return -ENOSYS; 216 + 217 + /* Pull in the extra for the padded case and bump our pointer */ 218 + if (rmsgp->rm_type == RDMA_MSGP) { 219 + int hdrlen; 220 + rmsgp->rm_body.rm_padded.rm_align = 221 + ntohl(rmsgp->rm_body.rm_padded.rm_align); 222 + rmsgp->rm_body.rm_padded.rm_thresh = 223 + ntohl(rmsgp->rm_body.rm_padded.rm_thresh); 224 + 225 + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; 226 + rqstp->rq_arg.head[0].iov_base = va; 227 + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); 228 + rqstp->rq_arg.head[0].iov_len -= hdrlen; 229 + if (hdrlen > rqstp->rq_arg.len) 230 + return -EINVAL; 231 + return hdrlen; 232 + } 233 + 234 + /* The chunk list may contain either a read chunk list or a write 235 + * chunk list and a reply chunk list. 236 + */ 237 + va = &rmsgp->rm_body.rm_chunks[0]; 238 + vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); 239 + va = decode_read_list(va, vaend); 240 + if (!va) 241 + return -EINVAL; 242 + va = decode_write_list(va, vaend); 243 + if (!va) 244 + return -EINVAL; 245 + va = decode_reply_array(va, vaend); 246 + if (!va) 247 + return -EINVAL; 248 + 249 + rqstp->rq_arg.head[0].iov_base = va; 250 + hdr_len = (unsigned long)va - (unsigned long)rmsgp; 251 + rqstp->rq_arg.head[0].iov_len -= hdr_len; 252 + 253 + *rdma_req = rmsgp; 254 + return hdr_len; 255 + } 256 + 257 + int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp) 258 + { 259 + struct rpcrdma_msg *rmsgp = NULL; 260 + struct rpcrdma_read_chunk *ch; 261 + struct rpcrdma_write_array *ary; 262 + u32 *va; 263 + u32 hdrlen; 264 + 265 + dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n", 266 + rqstp); 267 + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; 268 + 269 + /* Pull in the extra for the padded case and bump our pointer */ 270 + if (rmsgp->rm_type == RDMA_MSGP) { 271 + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; 272 + rqstp->rq_arg.head[0].iov_base = va; 273 + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); 274 + rqstp->rq_arg.head[0].iov_len -= hdrlen; 275 + return hdrlen; 276 + } 277 + 278 + /* 279 + * Skip all chunks to find RPC msg. 
These were previously processed 280 + */ 281 + va = &rmsgp->rm_body.rm_chunks[0]; 282 + 283 + /* Skip read-list */ 284 + for (ch = (struct rpcrdma_read_chunk *)va; 285 + ch->rc_discrim != xdr_zero; ch++); 286 + va = (u32 *)&ch->rc_position; 287 + 288 + /* Skip write-list */ 289 + ary = (struct rpcrdma_write_array *)va; 290 + if (ary->wc_discrim == xdr_zero) 291 + va = (u32 *)&ary->wc_nchunks; 292 + else 293 + /* 294 + * rs_length is the 2nd 4B field in wc_target and taking its 295 + * address skips the list terminator 296 + */ 297 + va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length; 298 + 299 + /* Skip reply-array */ 300 + ary = (struct rpcrdma_write_array *)va; 301 + if (ary->wc_discrim == xdr_zero) 302 + va = (u32 *)&ary->wc_nchunks; 303 + else 304 + va = (u32 *)&ary->wc_array[ary->wc_nchunks]; 305 + 306 + rqstp->rq_arg.head[0].iov_base = va; 307 + hdrlen = (unsigned long)va - (unsigned long)rmsgp; 308 + rqstp->rq_arg.head[0].iov_len -= hdrlen; 309 + 310 + return hdrlen; 311 + } 312 + 313 + int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, 314 + struct rpcrdma_msg *rmsgp, 315 + enum rpcrdma_errcode err, u32 *va) 316 + { 317 + u32 *startp = va; 318 + 319 + *va++ = htonl(rmsgp->rm_xid); 320 + *va++ = htonl(rmsgp->rm_vers); 321 + *va++ = htonl(xprt->sc_max_requests); 322 + *va++ = htonl(RDMA_ERROR); 323 + *va++ = htonl(err); 324 + if (err == ERR_VERS) { 325 + *va++ = htonl(RPCRDMA_VERSION); 326 + *va++ = htonl(RPCRDMA_VERSION); 327 + } 328 + 329 + return (int)((unsigned long)va - (unsigned long)startp); 330 + } 331 + 332 + int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) 333 + { 334 + struct rpcrdma_write_array *wr_ary; 335 + 336 + /* There is no read-list in a reply */ 337 + 338 + /* skip write list */ 339 + wr_ary = (struct rpcrdma_write_array *) 340 + &rmsgp->rm_body.rm_chunks[1]; 341 + if (wr_ary->wc_discrim) 342 + wr_ary = (struct rpcrdma_write_array *) 343 + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]. 
344 + wc_target.rs_length; 345 + else 346 + wr_ary = (struct rpcrdma_write_array *) 347 + &wr_ary->wc_nchunks; 348 + 349 + /* skip reply array */ 350 + if (wr_ary->wc_discrim) 351 + wr_ary = (struct rpcrdma_write_array *) 352 + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]; 353 + else 354 + wr_ary = (struct rpcrdma_write_array *) 355 + &wr_ary->wc_nchunks; 356 + 357 + return (unsigned long) wr_ary - (unsigned long) rmsgp; 358 + } 359 + 360 + void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) 361 + { 362 + struct rpcrdma_write_array *ary; 363 + 364 + /* no read-list */ 365 + rmsgp->rm_body.rm_chunks[0] = xdr_zero; 366 + 367 + /* write-array discrim */ 368 + ary = (struct rpcrdma_write_array *) 369 + &rmsgp->rm_body.rm_chunks[1]; 370 + ary->wc_discrim = xdr_one; 371 + ary->wc_nchunks = htonl(chunks); 372 + 373 + /* write-list terminator */ 374 + ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; 375 + 376 + /* reply-array discriminator */ 377 + ary->wc_array[chunks].wc_target.rs_length = xdr_zero; 378 + } 379 + 380 + void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, 381 + int chunks) 382 + { 383 + ary->wc_discrim = xdr_one; 384 + ary->wc_nchunks = htonl(chunks); 385 + } 386 + 387 + void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, 388 + int chunk_no, 389 + u32 rs_handle, u64 rs_offset, 390 + u32 write_len) 391 + { 392 + struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; 393 + seg->rs_handle = htonl(rs_handle); 394 + seg->rs_length = htonl(write_len); 395 + xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset); 396 + } 397 + 398 + void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, 399 + struct rpcrdma_msg *rdma_argp, 400 + struct rpcrdma_msg *rdma_resp, 401 + enum rpcrdma_proc rdma_type) 402 + { 403 + rdma_resp->rm_xid = htonl(rdma_argp->rm_xid); 404 + rdma_resp->rm_vers = htonl(rdma_argp->rm_vers); 405 + rdma_resp->rm_credit = htonl(xprt->sc_max_requests); 406 + rdma_resp->rm_type = htonl(rdma_type); 407 + 408 + /* Encode <nul> chunks lists */ 409 + rdma_resp->rm_body.rm_chunks[0] = xdr_zero; 410 + rdma_resp->rm_body.rm_chunks[1] = xdr_zero; 411 + rdma_resp->rm_body.rm_chunks[2] = xdr_zero; 412 + }
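The read chunk list decoded above is a sequence of six 32-bit big-endian words per entry (discriminator, position, handle, length, and a 64-bit offset split across two words), terminated by a zero discriminator. A stand-alone sketch that walks such a list the way decode_read_list() and svc_rdma_rcl_chunk_counts() do, over a made-up two-entry buffer:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Two read chunks followed by the xdr_zero terminator; every value
	 * here is invented purely to exercise the walk. */
	uint32_t wire[] = {
		htonl(1), htonl(0), htonl(0x1234), htonl(4096), htonl(0), htonl(0x8000),
		htonl(1), htonl(0), htonl(0x5678), htonl(1024), htonl(0), htonl(0x9000),
		htonl(0),
	};
	uint32_t *p = wire;
	int chunks = 0;
	uint64_t bytes = 0;

	while (ntohl(p[0]) != 0) {		/* rc_discrim */
		uint32_t handle = ntohl(p[2]);	/* rc_target.rs_handle */
		uint32_t length = ntohl(p[3]);	/* rc_target.rs_length */
		uint64_t offset = ((uint64_t)ntohl(p[4]) << 32) | ntohl(p[5]);

		printf("chunk %d: handle=0x%x len=%u offset=0x%llx\n",
		       chunks, handle, length, (unsigned long long)offset);
		bytes += length;
		chunks++;
		p += 6;				/* one read-chunk entry = 6 words */
	}
	printf("%d chunks, %llu bytes in all\n", chunks, (unsigned long long)bytes);
	return 0;
}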
+586
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
···
··· 1 + /* 2 + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + * 39 + * Author: Tom Tucker <tom@opengridcomputing.com> 40 + */ 41 + 42 + #include <linux/sunrpc/debug.h> 43 + #include <linux/sunrpc/rpc_rdma.h> 44 + #include <linux/spinlock.h> 45 + #include <asm/unaligned.h> 46 + #include <rdma/ib_verbs.h> 47 + #include <rdma/rdma_cm.h> 48 + #include <linux/sunrpc/svc_rdma.h> 49 + 50 + #define RPCDBG_FACILITY RPCDBG_SVCXPRT 51 + 52 + /* 53 + * Replace the pages in the rq_argpages array with the pages from the SGE in 54 + * the RDMA_RECV completion. The SGL should contain full pages up until the 55 + * last one. 
56 + */ 57 + static void rdma_build_arg_xdr(struct svc_rqst *rqstp, 58 + struct svc_rdma_op_ctxt *ctxt, 59 + u32 byte_count) 60 + { 61 + struct page *page; 62 + u32 bc; 63 + int sge_no; 64 + 65 + /* Swap the page in the SGE with the page in argpages */ 66 + page = ctxt->pages[0]; 67 + put_page(rqstp->rq_pages[0]); 68 + rqstp->rq_pages[0] = page; 69 + 70 + /* Set up the XDR head */ 71 + rqstp->rq_arg.head[0].iov_base = page_address(page); 72 + rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); 73 + rqstp->rq_arg.len = byte_count; 74 + rqstp->rq_arg.buflen = byte_count; 75 + 76 + /* Compute bytes past head in the SGL */ 77 + bc = byte_count - rqstp->rq_arg.head[0].iov_len; 78 + 79 + /* If data remains, store it in the pagelist */ 80 + rqstp->rq_arg.page_len = bc; 81 + rqstp->rq_arg.page_base = 0; 82 + rqstp->rq_arg.pages = &rqstp->rq_pages[1]; 83 + sge_no = 1; 84 + while (bc && sge_no < ctxt->count) { 85 + page = ctxt->pages[sge_no]; 86 + put_page(rqstp->rq_pages[sge_no]); 87 + rqstp->rq_pages[sge_no] = page; 88 + bc -= min(bc, ctxt->sge[sge_no].length); 89 + rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; 90 + sge_no++; 91 + } 92 + rqstp->rq_respages = &rqstp->rq_pages[sge_no]; 93 + 94 + /* We should never run out of SGE because the limit is defined to 95 + * support the max allowed RPC data length 96 + */ 97 + BUG_ON(bc && (sge_no == ctxt->count)); 98 + BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) 99 + != byte_count); 100 + BUG_ON(rqstp->rq_arg.len != byte_count); 101 + 102 + /* If not all pages were used from the SGL, free the remaining ones */ 103 + bc = sge_no; 104 + while (sge_no < ctxt->count) { 105 + page = ctxt->pages[sge_no++]; 106 + put_page(page); 107 + } 108 + ctxt->count = bc; 109 + 110 + /* Set up tail */ 111 + rqstp->rq_arg.tail[0].iov_base = NULL; 112 + rqstp->rq_arg.tail[0].iov_len = 0; 113 + } 114 + 115 + struct chunk_sge { 116 + int start; /* sge no for this chunk */ 117 + int count; /* sge count for this chunk */ 118 + }; 119 + 120 + /* Encode a read-chunk-list as an array of IB SGE 121 + * 122 + * Assumptions: 123 + * - chunk[0]->position points to pages[0] at an offset of 0 124 + * - pages[] is not physically or virtually contigous and consists of 125 + * PAGE_SIZE elements. 126 + * 127 + * Output: 128 + * - sge array pointing into pages[] array. 
129 + * - chunk_sge array specifying sge index and count for each 130 + * chunk in the read list 131 + * 132 + */ 133 + static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, 134 + struct svc_rqst *rqstp, 135 + struct svc_rdma_op_ctxt *head, 136 + struct rpcrdma_msg *rmsgp, 137 + struct ib_sge *sge, 138 + struct chunk_sge *ch_sge_ary, 139 + int ch_count, 140 + int byte_count) 141 + { 142 + int sge_no; 143 + int sge_bytes; 144 + int page_off; 145 + int page_no; 146 + int ch_bytes; 147 + int ch_no; 148 + struct rpcrdma_read_chunk *ch; 149 + 150 + sge_no = 0; 151 + page_no = 0; 152 + page_off = 0; 153 + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; 154 + ch_no = 0; 155 + ch_bytes = ch->rc_target.rs_length; 156 + head->arg.head[0] = rqstp->rq_arg.head[0]; 157 + head->arg.tail[0] = rqstp->rq_arg.tail[0]; 158 + head->arg.pages = &head->pages[head->count]; 159 + head->sge[0].length = head->count; /* save count of hdr pages */ 160 + head->arg.page_base = 0; 161 + head->arg.page_len = ch_bytes; 162 + head->arg.len = rqstp->rq_arg.len + ch_bytes; 163 + head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; 164 + head->count++; 165 + ch_sge_ary[0].start = 0; 166 + while (byte_count) { 167 + sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); 168 + sge[sge_no].addr = 169 + ib_dma_map_page(xprt->sc_cm_id->device, 170 + rqstp->rq_arg.pages[page_no], 171 + page_off, sge_bytes, 172 + DMA_FROM_DEVICE); 173 + sge[sge_no].length = sge_bytes; 174 + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; 175 + /* 176 + * Don't bump head->count here because the same page 177 + * may be used by multiple SGE. 178 + */ 179 + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; 180 + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; 181 + 182 + byte_count -= sge_bytes; 183 + ch_bytes -= sge_bytes; 184 + sge_no++; 185 + /* 186 + * If all bytes for this chunk have been mapped to an 187 + * SGE, move to the next SGE 188 + */ 189 + if (ch_bytes == 0) { 190 + ch_sge_ary[ch_no].count = 191 + sge_no - ch_sge_ary[ch_no].start; 192 + ch_no++; 193 + ch++; 194 + ch_sge_ary[ch_no].start = sge_no; 195 + ch_bytes = ch->rc_target.rs_length; 196 + /* If bytes remaining account for next chunk */ 197 + if (byte_count) { 198 + head->arg.page_len += ch_bytes; 199 + head->arg.len += ch_bytes; 200 + head->arg.buflen += ch_bytes; 201 + } 202 + } 203 + /* 204 + * If this SGE consumed all of the page, move to the 205 + * next page 206 + */ 207 + if ((sge_bytes + page_off) == PAGE_SIZE) { 208 + page_no++; 209 + page_off = 0; 210 + /* 211 + * If there are still bytes left to map, bump 212 + * the page count 213 + */ 214 + if (byte_count) 215 + head->count++; 216 + } else 217 + page_off += sge_bytes; 218 + } 219 + BUG_ON(byte_count != 0); 220 + return sge_no; 221 + } 222 + 223 + static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, 224 + struct ib_sge *sge, 225 + u64 *sgl_offset, 226 + int count) 227 + { 228 + int i; 229 + 230 + ctxt->count = count; 231 + for (i = 0; i < count; i++) { 232 + ctxt->sge[i].addr = sge[i].addr; 233 + ctxt->sge[i].length = sge[i].length; 234 + *sgl_offset = *sgl_offset + sge[i].length; 235 + } 236 + } 237 + 238 + static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) 239 + { 240 + #ifdef RDMA_TRANSPORT_IWARP 241 + if ((RDMA_TRANSPORT_IWARP == 242 + rdma_node_get_transport(xprt->sc_cm_id-> 243 + device->node_type)) 244 + && sge_count > 1) 245 + return 1; 246 + else 247 + #endif 248 + return min_t(int, sge_count, xprt->sc_max_sge); 249 + } 250 + 251 + /* 252 + * Use RDMA_READ to read 
data from the advertised client buffer into the 253 + * XDR stream starting at rq_arg.head[0].iov_base. 254 + * Each chunk in the array 255 + * contains the following fields: 256 + * discrim - '1', This isn't used for data placement 257 + * position - The xdr stream offset (the same for every chunk) 258 + * handle - RMR for client memory region 259 + * length - data transfer length 260 + * offset - 64 bit tagged offset in remote memory region 261 + * 262 + * On our side, we need to read into a pagelist. The first page immediately 263 + * follows the RPC header. 264 + * 265 + * This function returns 1 to indicate success. The data is not yet in 266 + * the pagelist and therefore the RPC request must be deferred. The 267 + * I/O completion will enqueue the transport again and 268 + * svc_rdma_recvfrom will complete the request. 269 + * 270 + * NOTE: The ctxt must not be touched after the last WR has been posted 271 + * because the I/O completion processing may occur on another 272 + * processor and free / modify the context. Ne touche pas! 273 + */ 274 + static int rdma_read_xdr(struct svcxprt_rdma *xprt, 275 + struct rpcrdma_msg *rmsgp, 276 + struct svc_rqst *rqstp, 277 + struct svc_rdma_op_ctxt *hdr_ctxt) 278 + { 279 + struct ib_send_wr read_wr; 280 + int err = 0; 281 + int ch_no; 282 + struct ib_sge *sge; 283 + int ch_count; 284 + int byte_count; 285 + int sge_count; 286 + u64 sgl_offset; 287 + struct rpcrdma_read_chunk *ch; 288 + struct svc_rdma_op_ctxt *ctxt = NULL; 289 + struct svc_rdma_op_ctxt *head; 290 + struct svc_rdma_op_ctxt *tmp_sge_ctxt; 291 + struct svc_rdma_op_ctxt *tmp_ch_ctxt; 292 + struct chunk_sge *ch_sge_ary; 293 + 294 + /* If no read list is present, return 0 */ 295 + ch = svc_rdma_get_read_chunk(rmsgp); 296 + if (!ch) 297 + return 0; 298 + 299 + /* Allocate temporary contexts to keep SGE */ 300 + BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge)); 301 + tmp_sge_ctxt = svc_rdma_get_context(xprt); 302 + sge = tmp_sge_ctxt->sge; 303 + tmp_ch_ctxt = svc_rdma_get_context(xprt); 304 + ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; 305 + 306 + svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); 307 + sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, 308 + sge, ch_sge_ary, 309 + ch_count, byte_count); 310 + head = svc_rdma_get_context(xprt); 311 + sgl_offset = 0; 312 + ch_no = 0; 313 + 314 + for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; 315 + ch->rc_discrim != 0; ch++, ch_no++) { 316 + next_sge: 317 + if (!ctxt) 318 + ctxt = head; 319 + else { 320 + ctxt->next = svc_rdma_get_context(xprt); 321 + ctxt = ctxt->next; 322 + } 323 + ctxt->next = NULL; 324 + ctxt->direction = DMA_FROM_DEVICE; 325 + clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); 326 + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); 327 + if ((ch+1)->rc_discrim == 0) { 328 + /* 329 + * Checked in sq_cq_reap to see if we need to 330 + * be enqueued 331 + */ 332 + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); 333 + ctxt->next = hdr_ctxt; 334 + hdr_ctxt->next = head; 335 + } 336 + 337 + /* Prepare READ WR */ 338 + memset(&read_wr, 0, sizeof read_wr); 339 + ctxt->wr_op = IB_WR_RDMA_READ; 340 + read_wr.wr_id = (unsigned long)ctxt; 341 + read_wr.opcode = IB_WR_RDMA_READ; 342 + read_wr.send_flags = IB_SEND_SIGNALED; 343 + read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; 344 + read_wr.wr.rdma.remote_addr = 345 + get_unaligned(&(ch->rc_target.rs_offset)) + 346 + sgl_offset; 347 + read_wr.sg_list = &sge[ch_sge_ary[ch_no].start]; 348 + read_wr.num_sge = 349 + rdma_read_max_sge(xprt, 
ch_sge_ary[ch_no].count); 350 + rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start], 351 + &sgl_offset, 352 + read_wr.num_sge); 353 + 354 + /* Post the read */ 355 + err = svc_rdma_send(xprt, &read_wr); 356 + if (err) { 357 + printk(KERN_ERR "svcrdma: Error posting send = %d\n", 358 + err); 359 + /* 360 + * Break the circular list so free knows when 361 + * to stop if the error happened to occur on 362 + * the last read 363 + */ 364 + ctxt->next = NULL; 365 + goto out; 366 + } 367 + atomic_inc(&rdma_stat_read); 368 + 369 + if (read_wr.num_sge < ch_sge_ary[ch_no].count) { 370 + ch_sge_ary[ch_no].count -= read_wr.num_sge; 371 + ch_sge_ary[ch_no].start += read_wr.num_sge; 372 + goto next_sge; 373 + } 374 + sgl_offset = 0; 375 + err = 0; 376 + } 377 + 378 + out: 379 + svc_rdma_put_context(tmp_sge_ctxt, 0); 380 + svc_rdma_put_context(tmp_ch_ctxt, 0); 381 + 382 + /* Detach arg pages. svc_recv will replenish them */ 383 + for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) 384 + rqstp->rq_pages[ch_no] = NULL; 385 + 386 + /* 387 + * Detach res pages. svc_release must see a resused count of 388 + * zero or it will attempt to put them. 389 + */ 390 + while (rqstp->rq_resused) 391 + rqstp->rq_respages[--rqstp->rq_resused] = NULL; 392 + 393 + if (err) { 394 + printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err); 395 + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 396 + /* Free the linked list of read contexts */ 397 + while (head != NULL) { 398 + ctxt = head->next; 399 + svc_rdma_put_context(head, 1); 400 + head = ctxt; 401 + } 402 + return 0; 403 + } 404 + 405 + return 1; 406 + } 407 + 408 + static int rdma_read_complete(struct svc_rqst *rqstp, 409 + struct svc_rdma_op_ctxt *data) 410 + { 411 + struct svc_rdma_op_ctxt *head = data->next; 412 + int page_no; 413 + int ret; 414 + 415 + BUG_ON(!head); 416 + 417 + /* Copy RPC pages */ 418 + for (page_no = 0; page_no < head->count; page_no++) { 419 + put_page(rqstp->rq_pages[page_no]); 420 + rqstp->rq_pages[page_no] = head->pages[page_no]; 421 + } 422 + /* Point rq_arg.pages past header */ 423 + rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length]; 424 + rqstp->rq_arg.page_len = head->arg.page_len; 425 + rqstp->rq_arg.page_base = head->arg.page_base; 426 + 427 + /* rq_respages starts after the last arg page */ 428 + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; 429 + rqstp->rq_resused = 0; 430 + 431 + /* Rebuild rq_arg head and tail. */ 432 + rqstp->rq_arg.head[0] = head->arg.head[0]; 433 + rqstp->rq_arg.tail[0] = head->arg.tail[0]; 434 + rqstp->rq_arg.len = head->arg.len; 435 + rqstp->rq_arg.buflen = head->arg.buflen; 436 + 437 + /* XXX: What should this be? */ 438 + rqstp->rq_prot = IPPROTO_MAX; 439 + 440 + /* 441 + * Free the contexts we used to build the RDMA_READ. 
We have 442 + * to be careful here because the context list uses the same 443 + * next pointer used to chain the contexts associated with the 444 + * RDMA_READ 445 + */ 446 + data->next = NULL; /* terminate circular list */ 447 + do { 448 + data = head->next; 449 + svc_rdma_put_context(head, 0); 450 + head = data; 451 + } while (head != NULL); 452 + 453 + ret = rqstp->rq_arg.head[0].iov_len 454 + + rqstp->rq_arg.page_len 455 + + rqstp->rq_arg.tail[0].iov_len; 456 + dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, " 457 + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", 458 + ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, 459 + rqstp->rq_arg.head[0].iov_len); 460 + 461 + /* Indicate that we've consumed an RQ credit */ 462 + rqstp->rq_xprt_ctxt = rqstp->rq_xprt; 463 + svc_xprt_received(rqstp->rq_xprt); 464 + return ret; 465 + } 466 + 467 + /* 468 + * Set up the rqstp thread context to point to the RQ buffer. If 469 + * necessary, pull additional data from the client with an RDMA_READ 470 + * request. 471 + */ 472 + int svc_rdma_recvfrom(struct svc_rqst *rqstp) 473 + { 474 + struct svc_xprt *xprt = rqstp->rq_xprt; 475 + struct svcxprt_rdma *rdma_xprt = 476 + container_of(xprt, struct svcxprt_rdma, sc_xprt); 477 + struct svc_rdma_op_ctxt *ctxt = NULL; 478 + struct rpcrdma_msg *rmsgp; 479 + int ret = 0; 480 + int len; 481 + 482 + dprintk("svcrdma: rqstp=%p\n", rqstp); 483 + 484 + /* 485 + * The rq_xprt_ctxt indicates if we've consumed an RQ credit 486 + * or not. It is used in the rdma xpo_release_rqst function to 487 + * determine whether or not to return an RQ WQE to the RQ. 488 + */ 489 + rqstp->rq_xprt_ctxt = NULL; 490 + 491 + spin_lock_bh(&rdma_xprt->sc_read_complete_lock); 492 + if (!list_empty(&rdma_xprt->sc_read_complete_q)) { 493 + ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, 494 + struct svc_rdma_op_ctxt, 495 + dto_q); 496 + list_del_init(&ctxt->dto_q); 497 + } 498 + spin_unlock_bh(&rdma_xprt->sc_read_complete_lock); 499 + if (ctxt) 500 + return rdma_read_complete(rqstp, ctxt); 501 + 502 + spin_lock_bh(&rdma_xprt->sc_rq_dto_lock); 503 + if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { 504 + ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, 505 + struct svc_rdma_op_ctxt, 506 + dto_q); 507 + list_del_init(&ctxt->dto_q); 508 + } else { 509 + atomic_inc(&rdma_stat_rq_starve); 510 + clear_bit(XPT_DATA, &xprt->xpt_flags); 511 + ctxt = NULL; 512 + } 513 + spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); 514 + if (!ctxt) { 515 + /* This is the EAGAIN path. The svc_recv routine will 516 + * return -EAGAIN, the nfsd thread will go to call into 517 + * svc_recv again and we shouldn't be on the active 518 + * transport list 519 + */ 520 + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) 521 + goto close_out; 522 + 523 + BUG_ON(ret); 524 + goto out; 525 + } 526 + dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", 527 + ctxt, rdma_xprt, rqstp, ctxt->wc_status); 528 + BUG_ON(ctxt->wc_status != IB_WC_SUCCESS); 529 + atomic_inc(&rdma_stat_recv); 530 + 531 + /* Build up the XDR from the receive buffers. */ 532 + rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); 533 + 534 + /* Decode the RDMA header. */ 535 + len = svc_rdma_xdr_decode_req(&rmsgp, rqstp); 536 + rqstp->rq_xprt_hlen = len; 537 + 538 + /* If the request is invalid, reply with an error */ 539 + if (len < 0) { 540 + if (len == -ENOSYS) 541 + (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); 542 + goto close_out; 543 + } 544 + 545 + /* Read read-list data. If we would need to wait, defer 546 + * it. 
Not that in this case, we don't return the RQ credit 547 + * until after the read completes. 548 + */ 549 + if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) { 550 + svc_xprt_received(xprt); 551 + return 0; 552 + } 553 + 554 + /* Indicate we've consumed an RQ credit */ 555 + rqstp->rq_xprt_ctxt = rqstp->rq_xprt; 556 + 557 + ret = rqstp->rq_arg.head[0].iov_len 558 + + rqstp->rq_arg.page_len 559 + + rqstp->rq_arg.tail[0].iov_len; 560 + svc_rdma_put_context(ctxt, 0); 561 + out: 562 + dprintk("svcrdma: ret = %d, rq_arg.len =%d, " 563 + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", 564 + ret, rqstp->rq_arg.len, 565 + rqstp->rq_arg.head[0].iov_base, 566 + rqstp->rq_arg.head[0].iov_len); 567 + rqstp->rq_prot = IPPROTO_MAX; 568 + svc_xprt_copy_addrs(rqstp, xprt); 569 + svc_xprt_received(xprt); 570 + return ret; 571 + 572 + close_out: 573 + if (ctxt) { 574 + svc_rdma_put_context(ctxt, 1); 575 + /* Indicate we've consumed an RQ credit */ 576 + rqstp->rq_xprt_ctxt = rqstp->rq_xprt; 577 + } 578 + dprintk("svcrdma: transport %p is closing\n", xprt); 579 + /* 580 + * Set the close bit and enqueue it. svc_recv will see the 581 + * close bit and call svc_xprt_delete 582 + */ 583 + set_bit(XPT_CLOSE, &xprt->xpt_flags); 584 + svc_xprt_received(xprt); 585 + return 0; 586 + }
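rdma_build_arg_xdr() above splits the received byte count so that the XDR head holds at most the first SGE and the remainder becomes page_len spread over the following page-sized SGEs. A stand-alone sketch of that split with invented SGE lengths (the accounting only, not the page swapping):

#include <stdio.h>

int main(void)
{
	unsigned int sge_len[] = { 4096, 4096, 4096 };	/* hypothetical RECV SGEs */
	unsigned int byte_count = 6000;			/* bytes actually received */
	unsigned int head_len, page_len, bc, i;

	head_len = byte_count < sge_len[0] ? byte_count : sge_len[0];
	page_len = byte_count - head_len;		/* everything past the head */
	printf("head=%u page_len=%u\n", head_len, page_len);

	/* walk the remaining SGEs the way the while loop in the patch does */
	for (bc = page_len, i = 1; bc && i < 3; i++) {
		unsigned int take = bc < sge_len[i] ? bc : sge_len[i];

		printf("page %u carries %u bytes\n", i, take);
		bc -= take;
	}
	return 0;
}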
+520
net/sunrpc/xprtrdma/svc_rdma_sendto.c
···
··· 1 + /* 2 + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + * 39 + * Author: Tom Tucker <tom@opengridcomputing.com> 40 + */ 41 + 42 + #include <linux/sunrpc/debug.h> 43 + #include <linux/sunrpc/rpc_rdma.h> 44 + #include <linux/spinlock.h> 45 + #include <asm/unaligned.h> 46 + #include <rdma/ib_verbs.h> 47 + #include <rdma/rdma_cm.h> 48 + #include <linux/sunrpc/svc_rdma.h> 49 + 50 + #define RPCDBG_FACILITY RPCDBG_SVCXPRT 51 + 52 + /* Encode an XDR as an array of IB SGE 53 + * 54 + * Assumptions: 55 + * - head[0] is physically contiguous. 56 + * - tail[0] is physically contiguous. 57 + * - pages[] is not physically or virtually contigous and consists of 58 + * PAGE_SIZE elements. 59 + * 60 + * Output: 61 + * SGE[0] reserved for RCPRDMA header 62 + * SGE[1] data from xdr->head[] 63 + * SGE[2..sge_count-2] data from xdr->pages[] 64 + * SGE[sge_count-1] data from xdr->tail. 
65 + * 66 + */ 67 + static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, 68 + struct xdr_buf *xdr, 69 + struct ib_sge *sge, 70 + int *sge_count) 71 + { 72 + /* Max we need is the length of the XDR / pagesize + one for 73 + * head + one for tail + one for RPCRDMA header 74 + */ 75 + int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; 76 + int sge_no; 77 + u32 byte_count = xdr->len; 78 + u32 sge_bytes; 79 + u32 page_bytes; 80 + int page_off; 81 + int page_no; 82 + 83 + /* Skip the first sge, this is for the RPCRDMA header */ 84 + sge_no = 1; 85 + 86 + /* Head SGE */ 87 + sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device, 88 + xdr->head[0].iov_base, 89 + xdr->head[0].iov_len, 90 + DMA_TO_DEVICE); 91 + sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len); 92 + byte_count -= sge_bytes; 93 + sge[sge_no].length = sge_bytes; 94 + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; 95 + sge_no++; 96 + 97 + /* pages SGE */ 98 + page_no = 0; 99 + page_bytes = xdr->page_len; 100 + page_off = xdr->page_base; 101 + while (byte_count && page_bytes) { 102 + sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off)); 103 + sge[sge_no].addr = 104 + ib_dma_map_page(xprt->sc_cm_id->device, 105 + xdr->pages[page_no], page_off, 106 + sge_bytes, DMA_TO_DEVICE); 107 + sge_bytes = min(sge_bytes, page_bytes); 108 + byte_count -= sge_bytes; 109 + page_bytes -= sge_bytes; 110 + sge[sge_no].length = sge_bytes; 111 + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; 112 + 113 + sge_no++; 114 + page_no++; 115 + page_off = 0; /* reset for next time through loop */ 116 + } 117 + 118 + /* Tail SGE */ 119 + if (byte_count && xdr->tail[0].iov_len) { 120 + sge[sge_no].addr = 121 + ib_dma_map_single(xprt->sc_cm_id->device, 122 + xdr->tail[0].iov_base, 123 + xdr->tail[0].iov_len, 124 + DMA_TO_DEVICE); 125 + sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len); 126 + byte_count -= sge_bytes; 127 + sge[sge_no].length = sge_bytes; 128 + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; 129 + sge_no++; 130 + } 131 + 132 + BUG_ON(sge_no > sge_max); 133 + BUG_ON(byte_count != 0); 134 + 135 + *sge_count = sge_no; 136 + return sge; 137 + } 138 + 139 + 140 + /* Assumptions: 141 + * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE 142 + */ 143 + static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, 144 + u32 rmr, u64 to, 145 + u32 xdr_off, int write_len, 146 + struct ib_sge *xdr_sge, int sge_count) 147 + { 148 + struct svc_rdma_op_ctxt *tmp_sge_ctxt; 149 + struct ib_send_wr write_wr; 150 + struct ib_sge *sge; 151 + int xdr_sge_no; 152 + int sge_no; 153 + int sge_bytes; 154 + int sge_off; 155 + int bc; 156 + struct svc_rdma_op_ctxt *ctxt; 157 + int ret = 0; 158 + 159 + BUG_ON(sge_count >= 32); 160 + dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " 161 + "write_len=%d, xdr_sge=%p, sge_count=%d\n", 162 + rmr, to, xdr_off, write_len, xdr_sge, sge_count); 163 + 164 + ctxt = svc_rdma_get_context(xprt); 165 + ctxt->count = 0; 166 + tmp_sge_ctxt = svc_rdma_get_context(xprt); 167 + sge = tmp_sge_ctxt->sge; 168 + 169 + /* Find the SGE associated with xdr_off */ 170 + for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count; 171 + xdr_sge_no++) { 172 + if (xdr_sge[xdr_sge_no].length > bc) 173 + break; 174 + bc -= xdr_sge[xdr_sge_no].length; 175 + } 176 + 177 + sge_off = bc; 178 + bc = write_len; 179 + sge_no = 0; 180 + 181 + /* Copy the remaining SGE */ 182 + while (bc != 0 && xdr_sge_no < sge_count) { 183 + sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off; 184 + sge[sge_no].lkey = 
xdr_sge[xdr_sge_no].lkey; 185 + sge_bytes = min((size_t)bc, 186 + (size_t)(xdr_sge[xdr_sge_no].length-sge_off)); 187 + sge[sge_no].length = sge_bytes; 188 + 189 + sge_off = 0; 190 + sge_no++; 191 + xdr_sge_no++; 192 + bc -= sge_bytes; 193 + } 194 + 195 + BUG_ON(bc != 0); 196 + BUG_ON(xdr_sge_no > sge_count); 197 + 198 + /* Prepare WRITE WR */ 199 + memset(&write_wr, 0, sizeof write_wr); 200 + ctxt->wr_op = IB_WR_RDMA_WRITE; 201 + write_wr.wr_id = (unsigned long)ctxt; 202 + write_wr.sg_list = &sge[0]; 203 + write_wr.num_sge = sge_no; 204 + write_wr.opcode = IB_WR_RDMA_WRITE; 205 + write_wr.send_flags = IB_SEND_SIGNALED; 206 + write_wr.wr.rdma.rkey = rmr; 207 + write_wr.wr.rdma.remote_addr = to; 208 + 209 + /* Post It */ 210 + atomic_inc(&rdma_stat_write); 211 + if (svc_rdma_send(xprt, &write_wr)) { 212 + svc_rdma_put_context(ctxt, 1); 213 + /* Fatal error, close transport */ 214 + ret = -EIO; 215 + } 216 + svc_rdma_put_context(tmp_sge_ctxt, 0); 217 + return ret; 218 + } 219 + 220 + static int send_write_chunks(struct svcxprt_rdma *xprt, 221 + struct rpcrdma_msg *rdma_argp, 222 + struct rpcrdma_msg *rdma_resp, 223 + struct svc_rqst *rqstp, 224 + struct ib_sge *sge, 225 + int sge_count) 226 + { 227 + u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; 228 + int write_len; 229 + int max_write; 230 + u32 xdr_off; 231 + int chunk_off; 232 + int chunk_no; 233 + struct rpcrdma_write_array *arg_ary; 234 + struct rpcrdma_write_array *res_ary; 235 + int ret; 236 + 237 + arg_ary = svc_rdma_get_write_array(rdma_argp); 238 + if (!arg_ary) 239 + return 0; 240 + res_ary = (struct rpcrdma_write_array *) 241 + &rdma_resp->rm_body.rm_chunks[1]; 242 + 243 + max_write = xprt->sc_max_sge * PAGE_SIZE; 244 + 245 + /* Write chunks start at the pagelist */ 246 + for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; 247 + xfer_len && chunk_no < arg_ary->wc_nchunks; 248 + chunk_no++) { 249 + struct rpcrdma_segment *arg_ch; 250 + u64 rs_offset; 251 + 252 + arg_ch = &arg_ary->wc_array[chunk_no].wc_target; 253 + write_len = min(xfer_len, arg_ch->rs_length); 254 + 255 + /* Prepare the response chunk given the length actually 256 + * written */ 257 + rs_offset = get_unaligned(&(arg_ch->rs_offset)); 258 + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, 259 + arg_ch->rs_handle, 260 + rs_offset, 261 + write_len); 262 + chunk_off = 0; 263 + while (write_len) { 264 + int this_write; 265 + this_write = min(write_len, max_write); 266 + ret = send_write(xprt, rqstp, 267 + arg_ch->rs_handle, 268 + rs_offset + chunk_off, 269 + xdr_off, 270 + this_write, 271 + sge, 272 + sge_count); 273 + if (ret) { 274 + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", 275 + ret); 276 + return -EIO; 277 + } 278 + chunk_off += this_write; 279 + xdr_off += this_write; 280 + xfer_len -= this_write; 281 + write_len -= this_write; 282 + } 283 + } 284 + /* Update the req with the number of chunks actually used */ 285 + svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); 286 + 287 + return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; 288 + } 289 + 290 + static int send_reply_chunks(struct svcxprt_rdma *xprt, 291 + struct rpcrdma_msg *rdma_argp, 292 + struct rpcrdma_msg *rdma_resp, 293 + struct svc_rqst *rqstp, 294 + struct ib_sge *sge, 295 + int sge_count) 296 + { 297 + u32 xfer_len = rqstp->rq_res.len; 298 + int write_len; 299 + int max_write; 300 + u32 xdr_off; 301 + int chunk_no; 302 + int chunk_off; 303 + struct rpcrdma_segment *ch; 304 + struct rpcrdma_write_array *arg_ary; 305 + struct rpcrdma_write_array 
*res_ary; 306 + int ret; 307 + 308 + arg_ary = svc_rdma_get_reply_array(rdma_argp); 309 + if (!arg_ary) 310 + return 0; 311 + /* XXX: need to fix when reply lists occur with read-list and or 312 + * write-list */ 313 + res_ary = (struct rpcrdma_write_array *) 314 + &rdma_resp->rm_body.rm_chunks[2]; 315 + 316 + max_write = xprt->sc_max_sge * PAGE_SIZE; 317 + 318 + /* xdr offset starts at RPC message */ 319 + for (xdr_off = 0, chunk_no = 0; 320 + xfer_len && chunk_no < arg_ary->wc_nchunks; 321 + chunk_no++) { 322 + u64 rs_offset; 323 + ch = &arg_ary->wc_array[chunk_no].wc_target; 324 + write_len = min(xfer_len, ch->rs_length); 325 + 326 + 327 + /* Prepare the reply chunk given the length actually 328 + * written */ 329 + rs_offset = get_unaligned(&(ch->rs_offset)); 330 + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, 331 + ch->rs_handle, rs_offset, 332 + write_len); 333 + chunk_off = 0; 334 + while (write_len) { 335 + int this_write; 336 + 337 + this_write = min(write_len, max_write); 338 + ret = send_write(xprt, rqstp, 339 + ch->rs_handle, 340 + rs_offset + chunk_off, 341 + xdr_off, 342 + this_write, 343 + sge, 344 + sge_count); 345 + if (ret) { 346 + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", 347 + ret); 348 + return -EIO; 349 + } 350 + chunk_off += this_write; 351 + xdr_off += this_write; 352 + xfer_len -= this_write; 353 + write_len -= this_write; 354 + } 355 + } 356 + /* Update the req with the number of chunks actually used */ 357 + svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); 358 + 359 + return rqstp->rq_res.len; 360 + } 361 + 362 + /* This function prepares the portion of the RPCRDMA message to be 363 + * sent in the RDMA_SEND. This function is called after data sent via 364 + * RDMA has already been transmitted. There are three cases: 365 + * - The RPCRDMA header, RPC header, and payload are all sent in a 366 + * single RDMA_SEND. This is the "inline" case. 367 + * - The RPCRDMA header and some portion of the RPC header and data 368 + * are sent via this RDMA_SEND and another portion of the data is 369 + * sent via RDMA. 370 + * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC 371 + * header and data are all transmitted via RDMA. 372 + * In all three cases, this function prepares the RPCRDMA header in 373 + * sge[0], the 'type' parameter indicates the type to place in the 374 + * RPCRDMA header, and the 'byte_count' field indicates how much of 375 + * the XDR to include in this RDMA_SEND. 
376 + */ 377 + static int send_reply(struct svcxprt_rdma *rdma, 378 + struct svc_rqst *rqstp, 379 + struct page *page, 380 + struct rpcrdma_msg *rdma_resp, 381 + struct svc_rdma_op_ctxt *ctxt, 382 + int sge_count, 383 + int byte_count) 384 + { 385 + struct ib_send_wr send_wr; 386 + int sge_no; 387 + int sge_bytes; 388 + int page_no; 389 + int ret; 390 + 391 + /* Prepare the context */ 392 + ctxt->pages[0] = page; 393 + ctxt->count = 1; 394 + 395 + /* Prepare the SGE for the RPCRDMA Header */ 396 + ctxt->sge[0].addr = 397 + ib_dma_map_page(rdma->sc_cm_id->device, 398 + page, 0, PAGE_SIZE, DMA_TO_DEVICE); 399 + ctxt->direction = DMA_TO_DEVICE; 400 + ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); 401 + ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; 402 + 403 + /* Determine how many of our SGE are to be transmitted */ 404 + for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) { 405 + sge_bytes = min((size_t)ctxt->sge[sge_no].length, 406 + (size_t)byte_count); 407 + byte_count -= sge_bytes; 408 + } 409 + BUG_ON(byte_count != 0); 410 + 411 + /* Save all respages in the ctxt and remove them from the 412 + * respages array. They are our pages until the I/O 413 + * completes. 414 + */ 415 + for (page_no = 0; page_no < rqstp->rq_resused; page_no++) { 416 + ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; 417 + ctxt->count++; 418 + rqstp->rq_respages[page_no] = NULL; 419 + } 420 + 421 + BUG_ON(sge_no > rdma->sc_max_sge); 422 + memset(&send_wr, 0, sizeof send_wr); 423 + ctxt->wr_op = IB_WR_SEND; 424 + send_wr.wr_id = (unsigned long)ctxt; 425 + send_wr.sg_list = ctxt->sge; 426 + send_wr.num_sge = sge_no; 427 + send_wr.opcode = IB_WR_SEND; 428 + send_wr.send_flags = IB_SEND_SIGNALED; 429 + 430 + ret = svc_rdma_send(rdma, &send_wr); 431 + if (ret) 432 + svc_rdma_put_context(ctxt, 1); 433 + 434 + return ret; 435 + } 436 + 437 + void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) 438 + { 439 + } 440 + 441 + /* 442 + * Return the start of an xdr buffer. 443 + */ 444 + static void *xdr_start(struct xdr_buf *xdr) 445 + { 446 + return xdr->head[0].iov_base - 447 + (xdr->len - 448 + xdr->page_len - 449 + xdr->tail[0].iov_len - 450 + xdr->head[0].iov_len); 451 + } 452 + 453 + int svc_rdma_sendto(struct svc_rqst *rqstp) 454 + { 455 + struct svc_xprt *xprt = rqstp->rq_xprt; 456 + struct svcxprt_rdma *rdma = 457 + container_of(xprt, struct svcxprt_rdma, sc_xprt); 458 + struct rpcrdma_msg *rdma_argp; 459 + struct rpcrdma_msg *rdma_resp; 460 + struct rpcrdma_write_array *reply_ary; 461 + enum rpcrdma_proc reply_type; 462 + int ret; 463 + int inline_bytes; 464 + struct ib_sge *sge; 465 + int sge_count = 0; 466 + struct page *res_page; 467 + struct svc_rdma_op_ctxt *ctxt; 468 + 469 + dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); 470 + 471 + /* Get the RDMA request header. 
*/ 472 + rdma_argp = xdr_start(&rqstp->rq_arg); 473 + 474 + /* Build an SGE for the XDR */ 475 + ctxt = svc_rdma_get_context(rdma); 476 + ctxt->direction = DMA_TO_DEVICE; 477 + sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count); 478 + 479 + inline_bytes = rqstp->rq_res.len; 480 + 481 + /* Create the RDMA response header */ 482 + res_page = svc_rdma_get_page(); 483 + rdma_resp = page_address(res_page); 484 + reply_ary = svc_rdma_get_reply_array(rdma_argp); 485 + if (reply_ary) 486 + reply_type = RDMA_NOMSG; 487 + else 488 + reply_type = RDMA_MSG; 489 + svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, 490 + rdma_resp, reply_type); 491 + 492 + /* Send any write-chunk data and build resp write-list */ 493 + ret = send_write_chunks(rdma, rdma_argp, rdma_resp, 494 + rqstp, sge, sge_count); 495 + if (ret < 0) { 496 + printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", 497 + ret); 498 + goto error; 499 + } 500 + inline_bytes -= ret; 501 + 502 + /* Send any reply-list data and update resp reply-list */ 503 + ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, 504 + rqstp, sge, sge_count); 505 + if (ret < 0) { 506 + printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", 507 + ret); 508 + goto error; 509 + } 510 + inline_bytes -= ret; 511 + 512 + ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count, 513 + inline_bytes); 514 + dprintk("svcrdma: send_reply returns %d\n", ret); 515 + return ret; 516 + error: 517 + svc_rdma_put_context(ctxt, 0); 518 + put_page(res_page); 519 + return ret; 520 + }
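(Editorial note, not part of the patch: the chunking logic in send_write_chunks()/send_reply_chunks() above reduces to a nested loop — walk the client-advertised chunk list, and split each chunk into RDMA_WRITEs no larger than the local limit, sc_max_sge * PAGE_SIZE. The userspace sketch below illustrates only that arithmetic; every name in it is hypothetical and it is not kernel code.)

/*
 * Illustrative sketch of the chunking arithmetic used by the
 * svcrdma write/reply chunk code: a reply payload is walked chunk by
 * chunk (bounded by what the client advertised) and each chunk is
 * further split into RDMA_WRITEs no larger than max_write.
 * All names here are hypothetical.
 */
#include <stdio.h>

struct chunk { unsigned int length; };	/* client-advertised chunk sizes */

static void plan_writes(unsigned int xfer_len, const struct chunk *ary,
			int nchunks, unsigned int max_write)
{
	unsigned int xdr_off = 0;
	int chunk_no;

	for (chunk_no = 0; xfer_len && chunk_no < nchunks; chunk_no++) {
		unsigned int write_len =
			xfer_len < ary[chunk_no].length ?
			xfer_len : ary[chunk_no].length;
		unsigned int chunk_off = 0;

		while (write_len) {
			unsigned int this_write =
				write_len < max_write ? write_len : max_write;

			printf("chunk %d: RDMA_WRITE %u bytes, "
			       "remote off %u, xdr off %u\n",
			       chunk_no, this_write, chunk_off, xdr_off);
			chunk_off += this_write;
			xdr_off   += this_write;
			xfer_len  -= this_write;
			write_len -= this_write;
		}
	}
}

int main(void)
{
	struct chunk ary[] = { { 32768 }, { 32768 } };

	/* e.g. a 40000-byte payload with 16KB per RDMA_WRITE */
	plan_writes(40000, ary, 2, 16384);
	return 0;
}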
+1080
net/sunrpc/xprtrdma/svc_rdma_transport.c
···
··· 1 + /* 2 + * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
38 + * 39 + * Author: Tom Tucker <tom@opengridcomputing.com> 40 + */ 41 + 42 + #include <linux/sunrpc/svc_xprt.h> 43 + #include <linux/sunrpc/debug.h> 44 + #include <linux/sunrpc/rpc_rdma.h> 45 + #include <linux/spinlock.h> 46 + #include <rdma/ib_verbs.h> 47 + #include <rdma/rdma_cm.h> 48 + #include <linux/sunrpc/svc_rdma.h> 49 + 50 + #define RPCDBG_FACILITY RPCDBG_SVCXPRT 51 + 52 + static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, 53 + struct sockaddr *sa, int salen, 54 + int flags); 55 + static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); 56 + static void svc_rdma_release_rqst(struct svc_rqst *); 57 + static void rdma_destroy_xprt(struct svcxprt_rdma *xprt); 58 + static void dto_tasklet_func(unsigned long data); 59 + static void svc_rdma_detach(struct svc_xprt *xprt); 60 + static void svc_rdma_free(struct svc_xprt *xprt); 61 + static int svc_rdma_has_wspace(struct svc_xprt *xprt); 62 + static void rq_cq_reap(struct svcxprt_rdma *xprt); 63 + static void sq_cq_reap(struct svcxprt_rdma *xprt); 64 + 65 + DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL); 66 + static DEFINE_SPINLOCK(dto_lock); 67 + static LIST_HEAD(dto_xprt_q); 68 + 69 + static struct svc_xprt_ops svc_rdma_ops = { 70 + .xpo_create = svc_rdma_create, 71 + .xpo_recvfrom = svc_rdma_recvfrom, 72 + .xpo_sendto = svc_rdma_sendto, 73 + .xpo_release_rqst = svc_rdma_release_rqst, 74 + .xpo_detach = svc_rdma_detach, 75 + .xpo_free = svc_rdma_free, 76 + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, 77 + .xpo_has_wspace = svc_rdma_has_wspace, 78 + .xpo_accept = svc_rdma_accept, 79 + }; 80 + 81 + struct svc_xprt_class svc_rdma_class = { 82 + .xcl_name = "rdma", 83 + .xcl_owner = THIS_MODULE, 84 + .xcl_ops = &svc_rdma_ops, 85 + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, 86 + }; 87 + 88 + static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) 89 + { 90 + int target; 91 + int at_least_one = 0; 92 + struct svc_rdma_op_ctxt *ctxt; 93 + 94 + target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump, 95 + xprt->sc_ctxt_max); 96 + 97 + spin_lock_bh(&xprt->sc_ctxt_lock); 98 + while (xprt->sc_ctxt_cnt < target) { 99 + xprt->sc_ctxt_cnt++; 100 + spin_unlock_bh(&xprt->sc_ctxt_lock); 101 + 102 + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); 103 + 104 + spin_lock_bh(&xprt->sc_ctxt_lock); 105 + if (ctxt) { 106 + at_least_one = 1; 107 + ctxt->next = xprt->sc_ctxt_head; 108 + xprt->sc_ctxt_head = ctxt; 109 + } else { 110 + /* kmalloc failed...give up for now */ 111 + xprt->sc_ctxt_cnt--; 112 + break; 113 + } 114 + } 115 + spin_unlock_bh(&xprt->sc_ctxt_lock); 116 + dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n", 117 + xprt->sc_ctxt_max, xprt->sc_ctxt_cnt); 118 + return at_least_one; 119 + } 120 + 121 + struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) 122 + { 123 + struct svc_rdma_op_ctxt *ctxt; 124 + 125 + while (1) { 126 + spin_lock_bh(&xprt->sc_ctxt_lock); 127 + if (unlikely(xprt->sc_ctxt_head == NULL)) { 128 + /* Try to bump my cache. 
*/ 129 + spin_unlock_bh(&xprt->sc_ctxt_lock); 130 + 131 + if (rdma_bump_context_cache(xprt)) 132 + continue; 133 + 134 + printk(KERN_INFO "svcrdma: sleeping waiting for " 135 + "context memory on xprt=%p\n", 136 + xprt); 137 + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); 138 + continue; 139 + } 140 + ctxt = xprt->sc_ctxt_head; 141 + xprt->sc_ctxt_head = ctxt->next; 142 + spin_unlock_bh(&xprt->sc_ctxt_lock); 143 + ctxt->xprt = xprt; 144 + INIT_LIST_HEAD(&ctxt->dto_q); 145 + ctxt->count = 0; 146 + break; 147 + } 148 + return ctxt; 149 + } 150 + 151 + void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) 152 + { 153 + struct svcxprt_rdma *xprt; 154 + int i; 155 + 156 + BUG_ON(!ctxt); 157 + xprt = ctxt->xprt; 158 + if (free_pages) 159 + for (i = 0; i < ctxt->count; i++) 160 + put_page(ctxt->pages[i]); 161 + 162 + for (i = 0; i < ctxt->count; i++) 163 + dma_unmap_single(xprt->sc_cm_id->device->dma_device, 164 + ctxt->sge[i].addr, 165 + ctxt->sge[i].length, 166 + ctxt->direction); 167 + spin_lock_bh(&xprt->sc_ctxt_lock); 168 + ctxt->next = xprt->sc_ctxt_head; 169 + xprt->sc_ctxt_head = ctxt; 170 + spin_unlock_bh(&xprt->sc_ctxt_lock); 171 + } 172 + 173 + /* ib_cq event handler */ 174 + static void cq_event_handler(struct ib_event *event, void *context) 175 + { 176 + struct svc_xprt *xprt = context; 177 + dprintk("svcrdma: received CQ event id=%d, context=%p\n", 178 + event->event, context); 179 + set_bit(XPT_CLOSE, &xprt->xpt_flags); 180 + } 181 + 182 + /* QP event handler */ 183 + static void qp_event_handler(struct ib_event *event, void *context) 184 + { 185 + struct svc_xprt *xprt = context; 186 + 187 + switch (event->event) { 188 + /* These are considered benign events */ 189 + case IB_EVENT_PATH_MIG: 190 + case IB_EVENT_COMM_EST: 191 + case IB_EVENT_SQ_DRAINED: 192 + case IB_EVENT_QP_LAST_WQE_REACHED: 193 + dprintk("svcrdma: QP event %d received for QP=%p\n", 194 + event->event, event->element.qp); 195 + break; 196 + /* These are considered fatal events */ 197 + case IB_EVENT_PATH_MIG_ERR: 198 + case IB_EVENT_QP_FATAL: 199 + case IB_EVENT_QP_REQ_ERR: 200 + case IB_EVENT_QP_ACCESS_ERR: 201 + case IB_EVENT_DEVICE_FATAL: 202 + default: 203 + dprintk("svcrdma: QP ERROR event %d received for QP=%p, " 204 + "closing transport\n", 205 + event->event, event->element.qp); 206 + set_bit(XPT_CLOSE, &xprt->xpt_flags); 207 + break; 208 + } 209 + } 210 + 211 + /* 212 + * Data Transfer Operation Tasklet 213 + * 214 + * Walks a list of transports with I/O pending, removing entries as 215 + * they are added to the server's I/O pending list. Two bits indicate 216 + * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave 217 + * spinlock that serializes access to the transport list with the RQ 218 + * and SQ interrupt handlers. 219 + */ 220 + static void dto_tasklet_func(unsigned long data) 221 + { 222 + struct svcxprt_rdma *xprt; 223 + unsigned long flags; 224 + 225 + spin_lock_irqsave(&dto_lock, flags); 226 + while (!list_empty(&dto_xprt_q)) { 227 + xprt = list_entry(dto_xprt_q.next, 228 + struct svcxprt_rdma, sc_dto_q); 229 + list_del_init(&xprt->sc_dto_q); 230 + spin_unlock_irqrestore(&dto_lock, flags); 231 + 232 + if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) { 233 + ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); 234 + rq_cq_reap(xprt); 235 + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 236 + /* 237 + * If data arrived before established event, 238 + * don't enqueue. This defers RPC I/O until the 239 + * RDMA connection is complete. 
240 + */ 241 + if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) 242 + svc_xprt_enqueue(&xprt->sc_xprt); 243 + } 244 + 245 + if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) { 246 + ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); 247 + sq_cq_reap(xprt); 248 + } 249 + 250 + spin_lock_irqsave(&dto_lock, flags); 251 + } 252 + spin_unlock_irqrestore(&dto_lock, flags); 253 + } 254 + 255 + /* 256 + * Receive Queue Completion Handler 257 + * 258 + * Since an RQ completion handler is called on interrupt context, we 259 + * need to defer the handling of the I/O to a tasklet 260 + */ 261 + static void rq_comp_handler(struct ib_cq *cq, void *cq_context) 262 + { 263 + struct svcxprt_rdma *xprt = cq_context; 264 + unsigned long flags; 265 + 266 + /* 267 + * Set the bit regardless of whether or not it's on the list 268 + * because it may be on the list already due to an SQ 269 + * completion. 270 + */ 271 + set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); 272 + 273 + /* 274 + * If this transport is not already on the DTO transport queue, 275 + * add it 276 + */ 277 + spin_lock_irqsave(&dto_lock, flags); 278 + if (list_empty(&xprt->sc_dto_q)) 279 + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); 280 + spin_unlock_irqrestore(&dto_lock, flags); 281 + 282 + /* Tasklet does all the work to avoid irqsave locks. */ 283 + tasklet_schedule(&dto_tasklet); 284 + } 285 + 286 + /* 287 + * rq_cq_reap - Process the RQ CQ. 288 + * 289 + * Take all completing WC off the CQE and enqueue the associated DTO 290 + * context on the dto_q for the transport. 291 + */ 292 + static void rq_cq_reap(struct svcxprt_rdma *xprt) 293 + { 294 + int ret; 295 + struct ib_wc wc; 296 + struct svc_rdma_op_ctxt *ctxt = NULL; 297 + 298 + atomic_inc(&rdma_stat_rq_poll); 299 + 300 + spin_lock_bh(&xprt->sc_rq_dto_lock); 301 + while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { 302 + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; 303 + ctxt->wc_status = wc.status; 304 + ctxt->byte_len = wc.byte_len; 305 + if (wc.status != IB_WC_SUCCESS) { 306 + /* Close the transport */ 307 + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 308 + svc_rdma_put_context(ctxt, 1); 309 + continue; 310 + } 311 + list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); 312 + } 313 + spin_unlock_bh(&xprt->sc_rq_dto_lock); 314 + 315 + if (ctxt) 316 + atomic_inc(&rdma_stat_rq_prod); 317 + } 318 + 319 + /* 320 + * Send Queue Completion Handler - potentially called on interrupt context. 
321 + */ 322 + static void sq_cq_reap(struct svcxprt_rdma *xprt) 323 + { 324 + struct svc_rdma_op_ctxt *ctxt = NULL; 325 + struct ib_wc wc; 326 + struct ib_cq *cq = xprt->sc_sq_cq; 327 + int ret; 328 + 329 + atomic_inc(&rdma_stat_sq_poll); 330 + while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { 331 + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; 332 + xprt = ctxt->xprt; 333 + 334 + if (wc.status != IB_WC_SUCCESS) 335 + /* Close the transport */ 336 + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 337 + 338 + /* Decrement used SQ WR count */ 339 + atomic_dec(&xprt->sc_sq_count); 340 + wake_up(&xprt->sc_send_wait); 341 + 342 + switch (ctxt->wr_op) { 343 + case IB_WR_SEND: 344 + case IB_WR_RDMA_WRITE: 345 + svc_rdma_put_context(ctxt, 1); 346 + break; 347 + 348 + case IB_WR_RDMA_READ: 349 + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { 350 + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 351 + set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); 352 + spin_lock_bh(&xprt->sc_read_complete_lock); 353 + list_add_tail(&ctxt->dto_q, 354 + &xprt->sc_read_complete_q); 355 + spin_unlock_bh(&xprt->sc_read_complete_lock); 356 + svc_xprt_enqueue(&xprt->sc_xprt); 357 + } 358 + break; 359 + 360 + default: 361 + printk(KERN_ERR "svcrdma: unexpected completion type, " 362 + "opcode=%d, status=%d\n", 363 + wc.opcode, wc.status); 364 + break; 365 + } 366 + } 367 + 368 + if (ctxt) 369 + atomic_inc(&rdma_stat_sq_prod); 370 + } 371 + 372 + static void sq_comp_handler(struct ib_cq *cq, void *cq_context) 373 + { 374 + struct svcxprt_rdma *xprt = cq_context; 375 + unsigned long flags; 376 + 377 + /* 378 + * Set the bit regardless of whether or not it's on the list 379 + * because it may be on the list already due to an RQ 380 + * completion. 381 + */ 382 + set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); 383 + 384 + /* 385 + * If this transport is not already on the DTO transport queue, 386 + * add it 387 + */ 388 + spin_lock_irqsave(&dto_lock, flags); 389 + if (list_empty(&xprt->sc_dto_q)) 390 + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); 391 + spin_unlock_irqrestore(&dto_lock, flags); 392 + 393 + /* Tasklet does all the work to avoid irqsave locks. 
*/ 394 + tasklet_schedule(&dto_tasklet); 395 + } 396 + 397 + static void create_context_cache(struct svcxprt_rdma *xprt, 398 + int ctxt_count, int ctxt_bump, int ctxt_max) 399 + { 400 + struct svc_rdma_op_ctxt *ctxt; 401 + int i; 402 + 403 + xprt->sc_ctxt_max = ctxt_max; 404 + xprt->sc_ctxt_bump = ctxt_bump; 405 + xprt->sc_ctxt_cnt = 0; 406 + xprt->sc_ctxt_head = NULL; 407 + for (i = 0; i < ctxt_count; i++) { 408 + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); 409 + if (ctxt) { 410 + ctxt->next = xprt->sc_ctxt_head; 411 + xprt->sc_ctxt_head = ctxt; 412 + xprt->sc_ctxt_cnt++; 413 + } 414 + } 415 + } 416 + 417 + static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt) 418 + { 419 + struct svc_rdma_op_ctxt *next; 420 + if (!ctxt) 421 + return; 422 + 423 + do { 424 + next = ctxt->next; 425 + kfree(ctxt); 426 + ctxt = next; 427 + } while (next); 428 + } 429 + 430 + static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, 431 + int listener) 432 + { 433 + struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); 434 + 435 + if (!cma_xprt) 436 + return NULL; 437 + svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv); 438 + INIT_LIST_HEAD(&cma_xprt->sc_accept_q); 439 + INIT_LIST_HEAD(&cma_xprt->sc_dto_q); 440 + INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); 441 + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); 442 + init_waitqueue_head(&cma_xprt->sc_send_wait); 443 + 444 + spin_lock_init(&cma_xprt->sc_lock); 445 + spin_lock_init(&cma_xprt->sc_read_complete_lock); 446 + spin_lock_init(&cma_xprt->sc_ctxt_lock); 447 + spin_lock_init(&cma_xprt->sc_rq_dto_lock); 448 + 449 + cma_xprt->sc_ord = svcrdma_ord; 450 + 451 + cma_xprt->sc_max_req_size = svcrdma_max_req_size; 452 + cma_xprt->sc_max_requests = svcrdma_max_requests; 453 + cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; 454 + atomic_set(&cma_xprt->sc_sq_count, 0); 455 + 456 + if (!listener) { 457 + int reqs = cma_xprt->sc_max_requests; 458 + create_context_cache(cma_xprt, 459 + reqs << 1, /* starting size */ 460 + reqs, /* bump amount */ 461 + reqs + 462 + cma_xprt->sc_sq_depth + 463 + RPCRDMA_MAX_THREADS + 1); /* max */ 464 + if (!cma_xprt->sc_ctxt_head) { 465 + kfree(cma_xprt); 466 + return NULL; 467 + } 468 + clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); 469 + } else 470 + set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); 471 + 472 + return cma_xprt; 473 + } 474 + 475 + struct page *svc_rdma_get_page(void) 476 + { 477 + struct page *page; 478 + 479 + while ((page = alloc_page(GFP_KERNEL)) == NULL) { 480 + /* If we can't get memory, wait a bit and try again */ 481 + printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " 482 + "jiffies.\n"); 483 + schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); 484 + } 485 + return page; 486 + } 487 + 488 + int svc_rdma_post_recv(struct svcxprt_rdma *xprt) 489 + { 490 + struct ib_recv_wr recv_wr, *bad_recv_wr; 491 + struct svc_rdma_op_ctxt *ctxt; 492 + struct page *page; 493 + unsigned long pa; 494 + int sge_no; 495 + int buflen; 496 + int ret; 497 + 498 + ctxt = svc_rdma_get_context(xprt); 499 + buflen = 0; 500 + ctxt->direction = DMA_FROM_DEVICE; 501 + for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { 502 + BUG_ON(sge_no >= xprt->sc_max_sge); 503 + page = svc_rdma_get_page(); 504 + ctxt->pages[sge_no] = page; 505 + pa = ib_dma_map_page(xprt->sc_cm_id->device, 506 + page, 0, PAGE_SIZE, 507 + DMA_FROM_DEVICE); 508 + ctxt->sge[sge_no].addr = pa; 509 + ctxt->sge[sge_no].length = PAGE_SIZE; 510 + ctxt->sge[sge_no].lkey = 
xprt->sc_phys_mr->lkey; 511 + buflen += PAGE_SIZE; 512 + } 513 + ctxt->count = sge_no; 514 + recv_wr.next = NULL; 515 + recv_wr.sg_list = &ctxt->sge[0]; 516 + recv_wr.num_sge = ctxt->count; 517 + recv_wr.wr_id = (u64)(unsigned long)ctxt; 518 + 519 + ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); 520 + return ret; 521 + } 522 + 523 + /* 524 + * This function handles the CONNECT_REQUEST event on a listening 525 + * endpoint. It is passed the cma_id for the _new_ connection. The context in 526 + * this cma_id is inherited from the listening cma_id and is the svc_xprt 527 + * structure for the listening endpoint. 528 + * 529 + * This function creates a new xprt for the new connection and enqueues it on 530 + * the accept queue for the listen xprt. When the listen thread is kicked, it 531 + * will call the accept method on the listen xprt which will accept the new 532 + * connection. 533 + */ 534 + static void handle_connect_req(struct rdma_cm_id *new_cma_id) 535 + { 536 + struct svcxprt_rdma *listen_xprt = new_cma_id->context; 537 + struct svcxprt_rdma *newxprt; 538 + 539 + /* Create a new transport */ 540 + newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); 541 + if (!newxprt) { 542 + dprintk("svcrdma: failed to create new transport\n"); 543 + return; 544 + } 545 + newxprt->sc_cm_id = new_cma_id; 546 + new_cma_id->context = newxprt; 547 + dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", 548 + newxprt, newxprt->sc_cm_id, listen_xprt); 549 + 550 + /* 551 + * Enqueue the new transport on the accept queue of the listening 552 + * transport 553 + */ 554 + spin_lock_bh(&listen_xprt->sc_lock); 555 + list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); 556 + spin_unlock_bh(&listen_xprt->sc_lock); 557 + 558 + /* 559 + * Can't use svc_xprt_received here because we are not on a 560 + * rqstp thread 561 + */ 562 + set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); 563 + svc_xprt_enqueue(&listen_xprt->sc_xprt); 564 + } 565 + 566 + /* 567 + * Handles events generated on the listening endpoint. These events will 568 + * either be incoming connect requests or adapter removal events.
569 + */ 570 + static int rdma_listen_handler(struct rdma_cm_id *cma_id, 571 + struct rdma_cm_event *event) 572 + { 573 + struct svcxprt_rdma *xprt = cma_id->context; 574 + int ret = 0; 575 + 576 + switch (event->event) { 577 + case RDMA_CM_EVENT_CONNECT_REQUEST: 578 + dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " 579 + "event=%d\n", cma_id, cma_id->context, event->event); 580 + handle_connect_req(cma_id); 581 + break; 582 + 583 + case RDMA_CM_EVENT_ESTABLISHED: 584 + /* Accept complete */ 585 + dprintk("svcrdma: Connection completed on LISTEN xprt=%p, " 586 + "cm_id=%p\n", xprt, cma_id); 587 + break; 588 + 589 + case RDMA_CM_EVENT_DEVICE_REMOVAL: 590 + dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", 591 + xprt, cma_id); 592 + if (xprt) 593 + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); 594 + break; 595 + 596 + default: 597 + dprintk("svcrdma: Unexpected event on listening endpoint %p, " 598 + "event=%d\n", cma_id, event->event); 599 + break; 600 + } 601 + 602 + return ret; 603 + } 604 + 605 + static int rdma_cma_handler(struct rdma_cm_id *cma_id, 606 + struct rdma_cm_event *event) 607 + { 608 + struct svc_xprt *xprt = cma_id->context; 609 + struct svcxprt_rdma *rdma = 610 + container_of(xprt, struct svcxprt_rdma, sc_xprt); 611 + switch (event->event) { 612 + case RDMA_CM_EVENT_ESTABLISHED: 613 + /* Accept complete */ 614 + dprintk("svcrdma: Connection completed on DTO xprt=%p, " 615 + "cm_id=%p\n", xprt, cma_id); 616 + clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); 617 + svc_xprt_enqueue(xprt); 618 + break; 619 + case RDMA_CM_EVENT_DISCONNECTED: 620 + dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", 621 + xprt, cma_id); 622 + if (xprt) { 623 + set_bit(XPT_CLOSE, &xprt->xpt_flags); 624 + svc_xprt_enqueue(xprt); 625 + } 626 + break; 627 + case RDMA_CM_EVENT_DEVICE_REMOVAL: 628 + dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " 629 + "event=%d\n", cma_id, xprt, event->event); 630 + if (xprt) { 631 + set_bit(XPT_CLOSE, &xprt->xpt_flags); 632 + svc_xprt_enqueue(xprt); 633 + } 634 + break; 635 + default: 636 + dprintk("svcrdma: Unexpected event on DTO endpoint %p, " 637 + "event=%d\n", cma_id, event->event); 638 + break; 639 + } 640 + return 0; 641 + } 642 + 643 + /* 644 + * Create a listening RDMA service endpoint. 
645 + */ 646 + static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, 647 + struct sockaddr *sa, int salen, 648 + int flags) 649 + { 650 + struct rdma_cm_id *listen_id; 651 + struct svcxprt_rdma *cma_xprt; 652 + struct svc_xprt *xprt; 653 + int ret; 654 + 655 + dprintk("svcrdma: Creating RDMA socket\n"); 656 + 657 + cma_xprt = rdma_create_xprt(serv, 1); 658 + if (!cma_xprt) 659 + return ERR_PTR(-ENOMEM); 660 + xprt = &cma_xprt->sc_xprt; 661 + 662 + listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); 663 + if (IS_ERR(listen_id)) { 664 + rdma_destroy_xprt(cma_xprt); 665 + dprintk("svcrdma: rdma_create_id failed = %ld\n", 666 + PTR_ERR(listen_id)); 667 + return (void *)listen_id; 668 + } 669 + ret = rdma_bind_addr(listen_id, sa); 670 + if (ret) { 671 + rdma_destroy_xprt(cma_xprt); 672 + rdma_destroy_id(listen_id); 673 + dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); 674 + return ERR_PTR(ret); 675 + } 676 + cma_xprt->sc_cm_id = listen_id; 677 + 678 + ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); 679 + if (ret) { 680 + rdma_destroy_id(listen_id); 681 + rdma_destroy_xprt(cma_xprt); 682 + dprintk("svcrdma: rdma_listen failed = %d\n", ret); 683 + } 684 + 685 + /* 686 + * We need to use the address from the cm_id in case the 687 + * caller specified 0 for the port number. 688 + */ 689 + sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr; 690 + svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); 691 + 692 + return &cma_xprt->sc_xprt; 693 + } 694 + 695 + /* 696 + * This is the xpo_accept function for listening endpoints. Its 697 + * purpose is to accept incoming connections. The CMA callback handler 698 + * has already created a new transport and attached it to the new CMA 699 + * ID. 700 + * 701 + * There is a queue of pending connections hung on the listening 702 + * transport. This queue contains the new svc_xprt structure. This 703 + * function takes svc_xprt structures off the accept_q and completes 704 + * the connection.
705 + */ 706 + static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) 707 + { 708 + struct svcxprt_rdma *listen_rdma; 709 + struct svcxprt_rdma *newxprt = NULL; 710 + struct rdma_conn_param conn_param; 711 + struct ib_qp_init_attr qp_attr; 712 + struct ib_device_attr devattr; 713 + struct sockaddr *sa; 714 + int ret; 715 + int i; 716 + 717 + listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); 718 + clear_bit(XPT_CONN, &xprt->xpt_flags); 719 + /* Get the next entry off the accept list */ 720 + spin_lock_bh(&listen_rdma->sc_lock); 721 + if (!list_empty(&listen_rdma->sc_accept_q)) { 722 + newxprt = list_entry(listen_rdma->sc_accept_q.next, 723 + struct svcxprt_rdma, sc_accept_q); 724 + list_del_init(&newxprt->sc_accept_q); 725 + } 726 + if (!list_empty(&listen_rdma->sc_accept_q)) 727 + set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags); 728 + spin_unlock_bh(&listen_rdma->sc_lock); 729 + if (!newxprt) 730 + return NULL; 731 + 732 + dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", 733 + newxprt, newxprt->sc_cm_id); 734 + 735 + ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); 736 + if (ret) { 737 + dprintk("svcrdma: could not query device attributes on " 738 + "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); 739 + goto errout; 740 + } 741 + 742 + /* Qualify the transport resource defaults with the 743 + * capabilities of this particular device */ 744 + newxprt->sc_max_sge = min((size_t)devattr.max_sge, 745 + (size_t)RPCSVC_MAXPAGES); 746 + newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, 747 + (size_t)svcrdma_max_requests); 748 + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; 749 + 750 + newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom, 751 + (size_t)svcrdma_ord); 752 + 753 + newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); 754 + if (IS_ERR(newxprt->sc_pd)) { 755 + dprintk("svcrdma: error creating PD for connect request\n"); 756 + goto errout; 757 + } 758 + newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, 759 + sq_comp_handler, 760 + cq_event_handler, 761 + newxprt, 762 + newxprt->sc_sq_depth, 763 + 0); 764 + if (IS_ERR(newxprt->sc_sq_cq)) { 765 + dprintk("svcrdma: error creating SQ CQ for connect request\n"); 766 + goto errout; 767 + } 768 + newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, 769 + rq_comp_handler, 770 + cq_event_handler, 771 + newxprt, 772 + newxprt->sc_max_requests, 773 + 0); 774 + if (IS_ERR(newxprt->sc_rq_cq)) { 775 + dprintk("svcrdma: error creating RQ CQ for connect request\n"); 776 + goto errout; 777 + } 778 + 779 + memset(&qp_attr, 0, sizeof qp_attr); 780 + qp_attr.event_handler = qp_event_handler; 781 + qp_attr.qp_context = &newxprt->sc_xprt; 782 + qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; 783 + qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; 784 + qp_attr.cap.max_send_sge = newxprt->sc_max_sge; 785 + qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; 786 + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 787 + qp_attr.qp_type = IB_QPT_RC; 788 + qp_attr.send_cq = newxprt->sc_sq_cq; 789 + qp_attr.recv_cq = newxprt->sc_rq_cq; 790 + dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n" 791 + " cm_id->device=%p, sc_pd->device=%p\n" 792 + " cap.max_send_wr = %d\n" 793 + " cap.max_recv_wr = %d\n" 794 + " cap.max_send_sge = %d\n" 795 + " cap.max_recv_sge = %d\n", 796 + newxprt->sc_cm_id, newxprt->sc_pd, 797 + newxprt->sc_cm_id->device, newxprt->sc_pd->device, 798 + qp_attr.cap.max_send_wr, 799 + qp_attr.cap.max_recv_wr, 800 + qp_attr.cap.max_send_sge, 801 
+ qp_attr.cap.max_recv_sge); 802 + 803 + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); 804 + if (ret) { 805 + /* 806 + * XXX: This is a hack. We need a xx_request_qp interface 807 + * that will adjust the qp_attr's with a best-effort 808 + * number 809 + */ 810 + qp_attr.cap.max_send_sge -= 2; 811 + qp_attr.cap.max_recv_sge -= 2; 812 + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, 813 + &qp_attr); 814 + if (ret) { 815 + dprintk("svcrdma: failed to create QP, ret=%d\n", ret); 816 + goto errout; 817 + } 818 + newxprt->sc_max_sge = qp_attr.cap.max_send_sge; 819 + newxprt->sc_max_sge = qp_attr.cap.max_recv_sge; 820 + newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; 821 + newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; 822 + } 823 + newxprt->sc_qp = newxprt->sc_cm_id->qp; 824 + 825 + /* Register all of physical memory */ 826 + newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, 827 + IB_ACCESS_LOCAL_WRITE | 828 + IB_ACCESS_REMOTE_WRITE); 829 + if (IS_ERR(newxprt->sc_phys_mr)) { 830 + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); 831 + goto errout; 832 + } 833 + 834 + /* Post receive buffers */ 835 + for (i = 0; i < newxprt->sc_max_requests; i++) { 836 + ret = svc_rdma_post_recv(newxprt); 837 + if (ret) { 838 + dprintk("svcrdma: failure posting receive buffers\n"); 839 + goto errout; 840 + } 841 + } 842 + 843 + /* Swap out the handler */ 844 + newxprt->sc_cm_id->event_handler = rdma_cma_handler; 845 + 846 + /* Accept Connection */ 847 + set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); 848 + memset(&conn_param, 0, sizeof conn_param); 849 + conn_param.responder_resources = 0; 850 + conn_param.initiator_depth = newxprt->sc_ord; 851 + ret = rdma_accept(newxprt->sc_cm_id, &conn_param); 852 + if (ret) { 853 + dprintk("svcrdma: failed to accept new connection, ret=%d\n", 854 + ret); 855 + goto errout; 856 + } 857 + 858 + dprintk("svcrdma: new connection %p accepted with the following " 859 + "attributes:\n" 860 + " local_ip : %d.%d.%d.%d\n" 861 + " local_port : %d\n" 862 + " remote_ip : %d.%d.%d.%d\n" 863 + " remote_port : %d\n" 864 + " max_sge : %d\n" 865 + " sq_depth : %d\n" 866 + " max_requests : %d\n" 867 + " ord : %d\n", 868 + newxprt, 869 + NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id-> 870 + route.addr.src_addr)->sin_addr.s_addr), 871 + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> 872 + route.addr.src_addr)->sin_port), 873 + NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id-> 874 + route.addr.dst_addr)->sin_addr.s_addr), 875 + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> 876 + route.addr.dst_addr)->sin_port), 877 + newxprt->sc_max_sge, 878 + newxprt->sc_sq_depth, 879 + newxprt->sc_max_requests, 880 + newxprt->sc_ord); 881 + 882 + /* Set the local and remote addresses in the transport */ 883 + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; 884 + svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); 885 + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; 886 + svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); 887 + 888 + ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); 889 + ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); 890 + return &newxprt->sc_xprt; 891 + 892 + errout: 893 + dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); 894 + rdma_destroy_id(newxprt->sc_cm_id); 895 + rdma_destroy_xprt(newxprt); 896 + return NULL; 897 + } 898 + 899 + /* 900 + * Post an RQ WQE to the RQ when the rqst is being released. 
This 901 + * effectively returns an RQ credit to the client. The rq_xprt_ctxt 902 + * will be null if the request is deferred due to an RDMA_READ or the 903 + * transport had no data ready (EAGAIN). Note that an RPC deferred in 904 + * svc_process will still return the credit, this is because the data 905 + * is copied and no longer consume a WQE/WC. 906 + */ 907 + static void svc_rdma_release_rqst(struct svc_rqst *rqstp) 908 + { 909 + int err; 910 + struct svcxprt_rdma *rdma = 911 + container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); 912 + if (rqstp->rq_xprt_ctxt) { 913 + BUG_ON(rqstp->rq_xprt_ctxt != rdma); 914 + err = svc_rdma_post_recv(rdma); 915 + if (err) 916 + dprintk("svcrdma: failed to post an RQ WQE error=%d\n", 917 + err); 918 + } 919 + rqstp->rq_xprt_ctxt = NULL; 920 + } 921 + 922 + /* Disable data ready events for this connection */ 923 + static void svc_rdma_detach(struct svc_xprt *xprt) 924 + { 925 + struct svcxprt_rdma *rdma = 926 + container_of(xprt, struct svcxprt_rdma, sc_xprt); 927 + unsigned long flags; 928 + 929 + dprintk("svc: svc_rdma_detach(%p)\n", xprt); 930 + /* 931 + * Shutdown the connection. This will ensure we don't get any 932 + * more events from the provider. 933 + */ 934 + rdma_disconnect(rdma->sc_cm_id); 935 + rdma_destroy_id(rdma->sc_cm_id); 936 + 937 + /* We may already be on the DTO list */ 938 + spin_lock_irqsave(&dto_lock, flags); 939 + if (!list_empty(&rdma->sc_dto_q)) 940 + list_del_init(&rdma->sc_dto_q); 941 + spin_unlock_irqrestore(&dto_lock, flags); 942 + } 943 + 944 + static void svc_rdma_free(struct svc_xprt *xprt) 945 + { 946 + struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt; 947 + dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); 948 + rdma_destroy_xprt(rdma); 949 + kfree(rdma); 950 + } 951 + 952 + static void rdma_destroy_xprt(struct svcxprt_rdma *xprt) 953 + { 954 + if (xprt->sc_qp && !IS_ERR(xprt->sc_qp)) 955 + ib_destroy_qp(xprt->sc_qp); 956 + 957 + if (xprt->sc_sq_cq && !IS_ERR(xprt->sc_sq_cq)) 958 + ib_destroy_cq(xprt->sc_sq_cq); 959 + 960 + if (xprt->sc_rq_cq && !IS_ERR(xprt->sc_rq_cq)) 961 + ib_destroy_cq(xprt->sc_rq_cq); 962 + 963 + if (xprt->sc_phys_mr && !IS_ERR(xprt->sc_phys_mr)) 964 + ib_dereg_mr(xprt->sc_phys_mr); 965 + 966 + if (xprt->sc_pd && !IS_ERR(xprt->sc_pd)) 967 + ib_dealloc_pd(xprt->sc_pd); 968 + 969 + destroy_context_cache(xprt->sc_ctxt_head); 970 + } 971 + 972 + static int svc_rdma_has_wspace(struct svc_xprt *xprt) 973 + { 974 + struct svcxprt_rdma *rdma = 975 + container_of(xprt, struct svcxprt_rdma, sc_xprt); 976 + 977 + /* 978 + * If there are fewer SQ WR available than required to send a 979 + * simple response, return false. 980 + */ 981 + if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) 982 + return 0; 983 + 984 + /* 985 + * ...or there are already waiters on the SQ, 986 + * return false. 987 + */ 988 + if (waitqueue_active(&rdma->sc_send_wait)) 989 + return 0; 990 + 991 + /* Otherwise return true. 
*/ 992 + return 1; 993 + } 994 + 995 + int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) 996 + { 997 + struct ib_send_wr *bad_wr; 998 + int ret; 999 + 1000 + if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) 1001 + return 0; 1002 + 1003 + BUG_ON(wr->send_flags != IB_SEND_SIGNALED); 1004 + BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != 1005 + wr->opcode); 1006 + /* If the SQ is full, wait until an SQ entry is available */ 1007 + while (1) { 1008 + spin_lock_bh(&xprt->sc_lock); 1009 + if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { 1010 + spin_unlock_bh(&xprt->sc_lock); 1011 + atomic_inc(&rdma_stat_sq_starve); 1012 + /* See if we can reap some SQ WR */ 1013 + sq_cq_reap(xprt); 1014 + 1015 + /* Wait until SQ WR available if SQ still full */ 1016 + wait_event(xprt->sc_send_wait, 1017 + atomic_read(&xprt->sc_sq_count) < 1018 + xprt->sc_sq_depth); 1019 + continue; 1020 + } 1021 + /* Bump the used SQ WR count and post */ 1022 + ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); 1023 + if (!ret) 1024 + atomic_inc(&xprt->sc_sq_count); 1025 + else 1026 + dprintk("svcrdma: failed to post SQ WR rc=%d, " 1027 + "sc_sq_count=%d, sc_sq_depth=%d\n", 1028 + ret, atomic_read(&xprt->sc_sq_count), 1029 + xprt->sc_sq_depth); 1030 + spin_unlock_bh(&xprt->sc_lock); 1031 + break; 1032 + } 1033 + return ret; 1034 + } 1035 + 1036 + int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, 1037 + enum rpcrdma_errcode err) 1038 + { 1039 + struct ib_send_wr err_wr; 1040 + struct ib_sge sge; 1041 + struct page *p; 1042 + struct svc_rdma_op_ctxt *ctxt; 1043 + u32 *va; 1044 + int length; 1045 + int ret; 1046 + 1047 + p = svc_rdma_get_page(); 1048 + va = page_address(p); 1049 + 1050 + /* XDR encode error */ 1051 + length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); 1052 + 1053 + /* Prepare SGE for local address */ 1054 + sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, 1055 + p, 0, PAGE_SIZE, DMA_TO_DEVICE); 1056 + sge.lkey = xprt->sc_phys_mr->lkey; 1057 + sge.length = length; 1058 + 1059 + ctxt = svc_rdma_get_context(xprt); 1060 + ctxt->count = 1; 1061 + ctxt->pages[0] = p; 1062 + 1063 + /* Prepare SEND WR */ 1064 + memset(&err_wr, 0, sizeof err_wr); 1065 + ctxt->wr_op = IB_WR_SEND; 1066 + err_wr.wr_id = (unsigned long)ctxt; 1067 + err_wr.sg_list = &sge; 1068 + err_wr.num_sge = 1; 1069 + err_wr.opcode = IB_WR_SEND; 1070 + err_wr.send_flags = IB_SEND_SIGNALED; 1071 + 1072 + /* Post It */ 1073 + ret = svc_rdma_send(xprt, &err_wr); 1074 + if (ret) { 1075 + dprintk("svcrdma: Error posting send = %d\n", ret); 1076 + svc_rdma_put_context(ctxt, 1); 1077 + } 1078 + 1079 + return ret; 1080 + }
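(Editorial note, not part of the patch: the send-queue accounting in svc_rdma_send() and sq_cq_reap() above amounts to a counting-semaphore pattern — a sender waits while sc_sq_count has reached sc_sq_depth, and each completion retires a work request and wakes waiters. Below is a minimal userspace analogy using pthreads; all names are hypothetical and none of it is kernel code.)

/*
 * Userspace sketch of SQ flow control: senders block while the number
 * of in-flight work requests equals the SQ depth; each completion
 * decrements the count and wakes a waiter.  Analogy only.
 */
#include <pthread.h>
#include <stdio.h>

#define SQ_DEPTH 4

static pthread_mutex_t sq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  sq_wait = PTHREAD_COND_INITIALIZER;
static int sq_count;			/* WRs currently posted */

/* svc_rdma_send() analogue: wait for an SQ slot, then "post" */
static void sq_post(int wr_id)
{
	pthread_mutex_lock(&sq_lock);
	while (sq_count == SQ_DEPTH)	/* SQ full: wait for a completion */
		pthread_cond_wait(&sq_wait, &sq_lock);
	sq_count++;
	pthread_mutex_unlock(&sq_lock);
	printf("posted WR %d (at most %d in flight)\n", wr_id, SQ_DEPTH);
}

/* sq_cq_reap() analogue: a completion frees one SQ slot */
static void sq_complete(void)
{
	pthread_mutex_lock(&sq_lock);
	sq_count--;
	pthread_cond_signal(&sq_wait);
	pthread_mutex_unlock(&sq_lock);
}

int main(void)
{
	int i;

	for (i = 0; i < SQ_DEPTH; i++)
		sq_post(i);
	sq_complete();		/* makes room for one more */
	sq_post(SQ_DEPTH);
	return 0;
}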