Merge git://git.linux-nfs.org/pub/linux/nfs-2.6

* git://git.linux-nfs.org/pub/linux/nfs-2.6: (131 commits)
NFSv4: Fix a typo in nfs_inode_reclaim_delegation
NFS: Add a boot parameter to disable 64 bit inode numbers
NFS: nfs_refresh_inode should clear cache_validity flags on success
NFS: Fix a connectathon regression in NFSv3 and NFSv4
NFS: Use nfs_refresh_inode() in ops that aren't expected to change the inode
SUNRPC: Don't call xprt_release in call refresh
SUNRPC: Don't call xprt_release() if call_allocate fails
SUNRPC: Fix buggy UDP transmission
[23/37] Clean up duplicate includes in
[2.6 patch] net/sunrpc/rpcb_clnt.c: make struct rpcb_program static
SUNRPC: Use correct type in buffer length calculations
SUNRPC: Fix default hostname created in rpc_create()
nfs: add server port to rpc_pipe info file
NFS: Get rid of some obsolete macros
NFS: Simplify filehandle revalidation
NFS: Ensure that nfs_link() returns a hashed dentry
NFS: Be strict about dentry revalidation when doing exclusive create
NFS: Don't zap the readdir caches upon error
NFS: Remove the redundant nfs_reval_fsid()
NFSv3: Always use directory post-op attributes in nfs3_proc_lookup
...

Manually fix up a trivial conflict in net/sunrpc/xprtsock.c caused by the
sock_owned_by_user() cleanup.

+5488 -1144
+7
Documentation/kernel-parameters.txt
··· 1083 1083 [NFS] set the maximum lifetime for idmapper cache 1084 1084 entries. 1085 1085 1086 + nfs.enable_ino64= 1087 + [NFS] enable 64-bit inode numbers. 1088 + If zero, the NFS client will fake up a 32-bit inode 1089 + number for the readdir() and stat() syscalls instead 1090 + of returning the full 64-bit number. 1091 + The default is to return 64-bit inode numbers. 1092 + 1086 1093 nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels 1087 1094 1088 1095 no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
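The new nfs.enable_ino64= option is a plain boolean module parameter, so a minimal usage sketch looks like the following (bootloader syntax is distribution-specific, and the runtime sysfs path assumes the standard module-parameter layout implied by the 0644 permissions in the fs/nfs/inode.c hunk below):

    # Kernel command line: have the NFS client fake up 32-bit inode numbers
    nfs.enable_ino64=0

    # Runtime toggle, once the nfs module parameters are exposed in sysfs
    echo 0 > /sys/module/nfs/parameters/enable_ino64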
+8
fs/Kconfig
··· 1755 1755 config SUNRPC_GSS 1756 1756 tristate 1757 1757 1758 + config SUNRPC_XPRT_RDMA 1759 + tristate "RDMA transport for sunrpc (EXPERIMENTAL)" 1760 + depends on SUNRPC && INFINIBAND && EXPERIMENTAL 1761 + default m 1762 + help 1763 + Adds a client RPC transport for supporting kernel NFS over RDMA 1764 + mounts, including Infiniband and iWARP. Experimental. 1765 + 1758 1766 config SUNRPC_BIND34 1759 1767 bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" 1760 1768 depends on SUNRPC && EXPERIMENTAL
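A minimal .config sketch for building the new transport (option names are taken from the Kconfig hunk above; the listed dependencies must already be enabled or the option is not offered):

    CONFIG_EXPERIMENTAL=y
    CONFIG_INFINIBAND=m
    CONFIG_SUNRPC=m
    CONFIG_SUNRPC_XPRT_RDMA=m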
+2 -1
fs/lockd/mon.c
··· 10 10 #include <linux/utsname.h> 11 11 #include <linux/kernel.h> 12 12 #include <linux/sunrpc/clnt.h> 13 + #include <linux/sunrpc/xprtsock.h> 13 14 #include <linux/sunrpc/svc.h> 14 15 #include <linux/lockd/lockd.h> 15 16 #include <linux/lockd/sm_inter.h> ··· 133 132 .sin_port = 0, 134 133 }; 135 134 struct rpc_create_args args = { 136 - .protocol = IPPROTO_UDP, 135 + .protocol = XPRT_TRANSPORT_UDP, 137 136 .address = (struct sockaddr *)&sin, 138 137 .addrsize = sizeof(sin), 139 138 .servername = "localhost",
+4 -4
fs/lockd/xdr.c
··· 62 62 } 63 63 else 64 64 { 65 - printk(KERN_NOTICE 66 - "lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN); 65 + dprintk("lockd: bad cookie size %d (only cookies under " 66 + "%d bytes are supported.)\n", 67 + len, NLM_MAXCOOKIELEN); 67 68 return NULL; 68 69 } 69 70 return p; ··· 85 84 unsigned int len; 86 85 87 86 if ((len = ntohl(*p++)) != NFS2_FHSIZE) { 88 - printk(KERN_NOTICE 89 - "lockd: bad fhandle size %d (should be %d)\n", 87 + dprintk("lockd: bad fhandle size %d (should be %d)\n", 90 88 len, NFS2_FHSIZE); 91 89 return NULL; 92 90 }
+4 -4
fs/lockd/xdr4.c
··· 64 64 } 65 65 else 66 66 { 67 - printk(KERN_NOTICE 68 - "lockd: bad cookie size %d (only cookies under %d bytes are supported.)\n", len, NLM_MAXCOOKIELEN); 67 + dprintk("lockd: bad cookie size %d (only cookies under " 68 + "%d bytes are supported.)\n", 69 + len, NLM_MAXCOOKIELEN); 69 70 return NULL; 70 71 } 71 72 return p; ··· 87 86 memset(f->data, 0, sizeof(f->data)); 88 87 f->size = ntohl(*p++); 89 88 if (f->size > NFS_MAXFHSIZE) { 90 - printk(KERN_NOTICE 91 - "lockd: bad fhandle size %d (should be <=%d)\n", 89 + dprintk("lockd: bad fhandle size %d (should be <=%d)\n", 92 90 f->size, NFS_MAXFHSIZE); 93 91 return NULL; 94 92 }
-1
fs/nfs/Makefile
··· 16 16 nfs4namespace.o 17 17 nfs-$(CONFIG_NFS_DIRECTIO) += direct.o 18 18 nfs-$(CONFIG_SYSCTL) += sysctl.o 19 - nfs-objs := $(nfs-y)
+27 -22
fs/nfs/client.c
··· 23 23 #include <linux/sunrpc/clnt.h> 24 24 #include <linux/sunrpc/stats.h> 25 25 #include <linux/sunrpc/metrics.h> 26 + #include <linux/sunrpc/xprtsock.h> 27 + #include <linux/sunrpc/xprtrdma.h> 26 28 #include <linux/nfs_fs.h> 27 29 #include <linux/nfs_mount.h> 28 30 #include <linux/nfs4_mount.h> ··· 342 340 to->to_retries = 2; 343 341 344 342 switch (proto) { 345 - case IPPROTO_TCP: 343 + case XPRT_TRANSPORT_TCP: 344 + case XPRT_TRANSPORT_RDMA: 346 345 if (!to->to_initval) 347 346 to->to_initval = 60 * HZ; 348 347 if (to->to_initval > NFS_MAX_TCP_TIMEOUT) ··· 352 349 to->to_maxval = to->to_initval + (to->to_increment * to->to_retries); 353 350 to->to_exponential = 0; 354 351 break; 355 - case IPPROTO_UDP: 352 + case XPRT_TRANSPORT_UDP: 356 353 default: 357 354 if (!to->to_initval) 358 355 to->to_initval = 11 * HZ / 10; ··· 504 501 /* 505 502 * Initialise an NFS2 or NFS3 client 506 503 */ 507 - static int nfs_init_client(struct nfs_client *clp, const struct nfs_mount_data *data) 504 + static int nfs_init_client(struct nfs_client *clp, 505 + const struct nfs_parsed_mount_data *data) 508 506 { 509 - int proto = (data->flags & NFS_MOUNT_TCP) ? IPPROTO_TCP : IPPROTO_UDP; 510 507 int error; 511 508 512 509 if (clp->cl_cons_state == NFS_CS_READY) { ··· 525 522 * Create a client RPC handle for doing FSSTAT with UNIX auth only 526 523 * - RFC 2623, sec 2.3.2 527 524 */ 528 - error = nfs_create_rpc_client(clp, proto, data->timeo, data->retrans, 529 - RPC_AUTH_UNIX, 0); 525 + error = nfs_create_rpc_client(clp, data->nfs_server.protocol, 526 + data->timeo, data->retrans, RPC_AUTH_UNIX, 0); 530 527 if (error < 0) 531 528 goto error; 532 529 nfs_mark_client_ready(clp, NFS_CS_READY); ··· 541 538 /* 542 539 * Create a version 2 or 3 client 543 540 */ 544 - static int nfs_init_server(struct nfs_server *server, const struct nfs_mount_data *data) 541 + static int nfs_init_server(struct nfs_server *server, 542 + const struct nfs_parsed_mount_data *data) 545 543 { 546 544 struct nfs_client *clp; 547 545 int error, nfsvers = 2; ··· 555 551 #endif 556 552 557 553 /* Allocate or find a client reference we can use */ 558 - clp = nfs_get_client(data->hostname, &data->addr, nfsvers); 554 + clp = nfs_get_client(data->nfs_server.hostname, 555 + &data->nfs_server.address, nfsvers); 559 556 if (IS_ERR(clp)) { 560 557 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 561 558 return PTR_ERR(clp); ··· 586 581 if (error < 0) 587 582 goto error; 588 583 589 - error = nfs_init_server_rpcclient(server, data->pseudoflavor); 584 + error = nfs_init_server_rpcclient(server, data->auth_flavors[0]); 590 585 if (error < 0) 591 586 goto error; 592 587 ··· 765 760 * Create a version 2 or 3 volume record 766 761 * - keyed on server and FSID 767 762 */ 768 - struct nfs_server *nfs_create_server(const struct nfs_mount_data *data, 763 + struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, 769 764 struct nfs_fh *mntfh) 770 765 { 771 766 struct nfs_server *server; ··· 911 906 * Create a version 4 volume record 912 907 */ 913 908 static int nfs4_init_server(struct nfs_server *server, 914 - const struct nfs4_mount_data *data, rpc_authflavor_t authflavour) 909 + const struct nfs_parsed_mount_data *data) 915 910 { 916 911 int error; 917 912 ··· 931 926 server->acdirmin = data->acdirmin * HZ; 932 927 server->acdirmax = data->acdirmax * HZ; 933 928 934 - error = nfs_init_server_rpcclient(server, authflavour); 929 + error = nfs_init_server_rpcclient(server, data->auth_flavors[0]); 935 930 936 931 /* Done 
*/ 937 932 dprintk("<-- nfs4_init_server() = %d\n", error); ··· 942 937 * Create a version 4 volume record 943 938 * - keyed on server and FSID 944 939 */ 945 - struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *data, 946 - const char *hostname, 947 - const struct sockaddr_in *addr, 948 - const char *mntpath, 949 - const char *ip_addr, 950 - rpc_authflavor_t authflavour, 940 + struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, 951 941 struct nfs_fh *mntfh) 952 942 { 953 943 struct nfs_fattr fattr; ··· 956 956 return ERR_PTR(-ENOMEM); 957 957 958 958 /* Get a client record */ 959 - error = nfs4_set_client(server, hostname, addr, ip_addr, authflavour, 960 - data->proto, data->timeo, data->retrans); 959 + error = nfs4_set_client(server, 960 + data->nfs_server.hostname, 961 + &data->nfs_server.address, 962 + data->client_address, 963 + data->auth_flavors[0], 964 + data->nfs_server.protocol, 965 + data->timeo, data->retrans); 961 966 if (error < 0) 962 967 goto error; 963 968 964 969 /* set up the general RPC client */ 965 - error = nfs4_init_server(server, data, authflavour); 970 + error = nfs4_init_server(server, data); 966 971 if (error < 0) 967 972 goto error; 968 973 ··· 976 971 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 977 972 978 973 /* Probe the root fh to retrieve its FSID */ 979 - error = nfs4_path_walk(server, mntfh, mntpath); 974 + error = nfs4_path_walk(server, mntfh, data->nfs_server.export_path); 980 975 if (error < 0) 981 976 goto error; 982 977
+4 -2
fs/nfs/delegation.c
··· 52 52 for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) { 53 53 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 54 54 continue; 55 - if ((struct nfs_open_context *)fl->fl_file->private_data != ctx) 55 + if (nfs_file_open_context(fl->fl_file) != ctx) 56 56 continue; 57 57 status = nfs4_lock_delegation_recall(state, fl); 58 58 if (status >= 0) ··· 109 109 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 110 110 { 111 111 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 112 + struct rpc_cred *oldcred; 112 113 113 114 if (delegation == NULL) 114 115 return; ··· 117 116 sizeof(delegation->stateid.data)); 118 117 delegation->type = res->delegation_type; 119 118 delegation->maxsize = res->maxsize; 120 - put_rpccred(cred); 119 + oldcred = delegation->cred; 121 120 delegation->cred = get_rpccred(cred); 122 121 delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM; 123 122 NFS_I(inode)->delegation_state = delegation->type; 124 123 smp_wmb(); 124 + put_rpccred(oldcred); 125 125 } 126 126 127 127 /*
+95 -168
fs/nfs/dir.c
··· 200 200 desc->timestamp = timestamp; 201 201 desc->timestamp_valid = 1; 202 202 SetPageUptodate(page); 203 - spin_lock(&inode->i_lock); 204 - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; 205 - spin_unlock(&inode->i_lock); 206 203 /* Ensure consistent page alignment of the data. 207 204 * Note: assumes we have exclusive access to this mapping either 208 205 * through inode->i_mutex or some other mechanism. ··· 211 214 unlock_page(page); 212 215 return 0; 213 216 error: 214 - SetPageError(page); 215 217 unlock_page(page); 216 - nfs_zap_caches(inode); 217 218 desc->error = error; 218 219 return -EIO; 219 220 } ··· 402 407 struct file *file = desc->file; 403 408 struct nfs_entry *entry = desc->entry; 404 409 struct dentry *dentry = NULL; 405 - unsigned long fileid; 410 + u64 fileid; 406 411 int loop_count = 0, 407 412 res; 408 413 ··· 413 418 unsigned d_type = DT_UNKNOWN; 414 419 /* Note: entry->prev_cookie contains the cookie for 415 420 * retrieving the current dirent on the server */ 416 - fileid = nfs_fileid_to_ino_t(entry->ino); 421 + fileid = entry->ino; 417 422 418 423 /* Get a dentry if we have one */ 419 424 if (dentry != NULL) ··· 423 428 /* Use readdirplus info */ 424 429 if (dentry != NULL && dentry->d_inode != NULL) { 425 430 d_type = dt_type(dentry->d_inode); 426 - fileid = dentry->d_inode->i_ino; 431 + fileid = NFS_FILEID(dentry->d_inode); 427 432 } 428 433 429 434 res = filldir(dirent, entry->name, entry->len, 430 - file->f_pos, fileid, d_type); 435 + file->f_pos, nfs_compat_user_ino64(fileid), 436 + d_type); 431 437 if (res < 0) 432 438 break; 433 439 file->f_pos++; ··· 486 490 page, 487 491 NFS_SERVER(inode)->dtsize, 488 492 desc->plus); 489 - spin_lock(&inode->i_lock); 490 - NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; 491 - spin_unlock(&inode->i_lock); 492 493 desc->page = page; 493 494 desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ 494 495 if (desc->error >= 0) { ··· 551 558 memset(desc, 0, sizeof(*desc)); 552 559 553 560 desc->file = filp; 554 - desc->dir_cookie = &((struct nfs_open_context *)filp->private_data)->dir_cookie; 561 + desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; 555 562 desc->decode = NFS_PROTO(inode)->decode_dirent; 556 563 desc->plus = NFS_USE_READDIRPLUS(inode); 557 564 ··· 616 623 } 617 624 if (offset != filp->f_pos) { 618 625 filp->f_pos = offset; 619 - ((struct nfs_open_context *)filp->private_data)->dir_cookie = 0; 626 + nfs_file_open_context(filp)->dir_cookie = 0; 620 627 } 621 628 out: 622 629 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); ··· 643 650 */ 644 651 static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) 645 652 { 646 - unsigned long verf; 647 - 648 653 if (IS_ROOT(dentry)) 649 654 return 1; 650 - verf = dentry->d_time; 651 - if (nfs_caches_unstable(dir) 652 - || verf != NFS_I(dir)->cache_change_attribute) 655 + if (!nfs_verify_change_attribute(dir, dentry->d_time)) 656 + return 0; 657 + /* Revalidate nfsi->cache_change_attribute before we declare a match */ 658 + if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) 659 + return 0; 660 + if (!nfs_verify_change_attribute(dir, dentry->d_time)) 653 661 return 0; 654 662 return 1; 655 - } 656 - 657 - static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf) 658 - { 659 - dentry->d_time = verf; 660 - } 661 - 662 - static void nfs_refresh_verifier(struct dentry * dentry, unsigned long verf) 663 - { 664 - nfs_set_verifier(dentry, verf); 665 - } 666 - 667 - /* 668 - * Whenever an NFS operation 
succeeds, we know that the dentry 669 - * is valid, so we update the revalidation timestamp. 670 - */ 671 - static inline void nfs_renew_times(struct dentry * dentry) 672 - { 673 - dentry->d_time = jiffies; 674 663 } 675 664 676 665 /* ··· 667 692 if (nd->flags & (LOOKUP_CONTINUE|LOOKUP_PARENT)) 668 693 return 0; 669 694 return nd->flags & mask; 695 + } 696 + 697 + /* 698 + * Use intent information to check whether or not we're going to do 699 + * an O_EXCL create using this path component. 700 + */ 701 + static int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) 702 + { 703 + if (NFS_PROTO(dir)->version == 2) 704 + return 0; 705 + if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0) 706 + return 0; 707 + return (nd->intent.open.flags & O_EXCL) != 0; 670 708 } 671 709 672 710 /* ··· 705 717 (S_ISREG(inode->i_mode) || 706 718 S_ISDIR(inode->i_mode))) 707 719 goto out_force; 720 + return 0; 708 721 } 709 722 return nfs_revalidate_inode(server, inode); 710 723 out_force: ··· 748 759 int error; 749 760 struct nfs_fh fhandle; 750 761 struct nfs_fattr fattr; 751 - unsigned long verifier; 752 762 753 763 parent = dget_parent(dentry); 754 764 lock_kernel(); 755 765 dir = parent->d_inode; 756 766 nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); 757 767 inode = dentry->d_inode; 758 - 759 - /* Revalidate parent directory attribute cache */ 760 - if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) 761 - goto out_zap_parent; 762 768 763 769 if (!inode) { 764 770 if (nfs_neg_need_reval(dir, dentry, nd)) ··· 769 785 } 770 786 771 787 /* Force a full look up iff the parent directory has changed */ 772 - if (nfs_check_verifier(dir, dentry)) { 788 + if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) { 773 789 if (nfs_lookup_verify_inode(inode, nd)) 774 790 goto out_zap_parent; 775 791 goto out_valid; ··· 778 794 if (NFS_STALE(inode)) 779 795 goto out_bad; 780 796 781 - verifier = nfs_save_change_attribute(dir); 782 797 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); 783 798 if (error) 784 799 goto out_bad; ··· 786 803 if ((error = nfs_refresh_inode(inode, &fattr)) != 0) 787 804 goto out_bad; 788 805 789 - nfs_renew_times(dentry); 790 - nfs_refresh_verifier(dentry, verifier); 806 + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 791 807 out_valid: 792 808 unlock_kernel(); 793 809 dput(parent); ··· 797 815 out_zap_parent: 798 816 nfs_zap_caches(dir); 799 817 out_bad: 800 - NFS_CACHEINV(dir); 818 + nfs_mark_for_revalidate(dir); 801 819 if (inode && S_ISDIR(inode->i_mode)) { 802 820 /* Purge readdir caches. */ 803 821 nfs_zap_caches(inode); ··· 854 872 nfs_complete_unlink(dentry, inode); 855 873 unlock_kernel(); 856 874 } 857 - /* When creating a negative dentry, we want to renew d_time */ 858 - nfs_renew_times(dentry); 859 875 iput(inode); 860 876 } 861 877 ··· 862 882 .d_delete = nfs_dentry_delete, 863 883 .d_iput = nfs_dentry_iput, 864 884 }; 865 - 866 - /* 867 - * Use intent information to check whether or not we're going to do 868 - * an O_EXCL create using this path component. 
869 - */ 870 - static inline 871 - int nfs_is_exclusive_create(struct inode *dir, struct nameidata *nd) 872 - { 873 - if (NFS_PROTO(dir)->version == 2) 874 - return 0; 875 - if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_CREATE) == 0) 876 - return 0; 877 - return (nd->intent.open.flags & O_EXCL) != 0; 878 - } 879 - 880 - static inline int nfs_reval_fsid(struct inode *dir, const struct nfs_fattr *fattr) 881 - { 882 - struct nfs_server *server = NFS_SERVER(dir); 883 - 884 - if (!nfs_fsid_equal(&server->fsid, &fattr->fsid)) 885 - /* Revalidate fsid using the parent directory */ 886 - return __nfs_revalidate_inode(server, dir); 887 - return 0; 888 - } 889 885 890 886 static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 891 887 { ··· 901 945 res = ERR_PTR(error); 902 946 goto out_unlock; 903 947 } 904 - error = nfs_reval_fsid(dir, &fattr); 905 - if (error < 0) { 906 - res = ERR_PTR(error); 907 - goto out_unlock; 908 - } 909 948 inode = nfs_fhget(dentry->d_sb, &fhandle, &fattr); 910 949 res = (struct dentry *)inode; 911 950 if (IS_ERR(res)) ··· 909 958 no_entry: 910 959 res = d_materialise_unique(dentry, inode); 911 960 if (res != NULL) { 912 - struct dentry *parent; 913 961 if (IS_ERR(res)) 914 962 goto out_unlock; 915 - /* Was a directory renamed! */ 916 - parent = dget_parent(res); 917 - if (!IS_ROOT(parent)) 918 - nfs_mark_for_revalidate(parent->d_inode); 919 - dput(parent); 920 963 dentry = res; 921 964 } 922 - nfs_renew_times(dentry); 923 965 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 924 966 out_unlock: 925 967 unlock_kernel(); ··· 964 1020 } 965 1021 dentry->d_op = NFS_PROTO(dir)->dentry_ops; 966 1022 967 - /* Let vfs_create() deal with O_EXCL */ 1023 + /* Let vfs_create() deal with O_EXCL. Instantiate, but don't hash 1024 + * the dentry. */ 968 1025 if (nd->intent.open.flags & O_EXCL) { 969 - d_add(dentry, NULL); 1026 + d_instantiate(dentry, NULL); 970 1027 goto out; 971 1028 } 972 1029 973 1030 /* Open the file on the server */ 974 1031 lock_kernel(); 975 - /* Revalidate parent directory attribute cache */ 976 - error = nfs_revalidate_inode(NFS_SERVER(dir), dir); 977 - if (error < 0) { 978 - res = ERR_PTR(error); 979 - unlock_kernel(); 980 - goto out; 981 - } 982 - 983 - if (nd->intent.open.flags & O_CREAT) { 984 - nfs_begin_data_update(dir); 985 - res = nfs4_atomic_open(dir, dentry, nd); 986 - nfs_end_data_update(dir); 987 - } else 988 - res = nfs4_atomic_open(dir, dentry, nd); 1032 + res = nfs4_atomic_open(dir, dentry, nd); 989 1033 unlock_kernel(); 990 1034 if (IS_ERR(res)) { 991 1035 error = PTR_ERR(res); ··· 995 1063 } 996 1064 } else if (res != NULL) 997 1065 dentry = res; 998 - nfs_renew_times(dentry); 999 - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1000 1066 out: 1001 1067 return res; 1002 1068 no_open: ··· 1006 1076 struct dentry *parent = NULL; 1007 1077 struct inode *inode = dentry->d_inode; 1008 1078 struct inode *dir; 1009 - unsigned long verifier; 1010 1079 int openflags, ret = 0; 1011 1080 1012 1081 parent = dget_parent(dentry); ··· 1015 1086 /* We can't create new files in nfs_open_revalidate(), so we 1016 1087 * optimize away revalidation of negative dentries. 
1017 1088 */ 1018 - if (inode == NULL) 1089 + if (inode == NULL) { 1090 + if (!nfs_neg_need_reval(dir, dentry, nd)) 1091 + ret = 1; 1019 1092 goto out; 1093 + } 1094 + 1020 1095 /* NFS only supports OPEN on regular files */ 1021 1096 if (!S_ISREG(inode->i_mode)) 1022 1097 goto no_open; ··· 1037 1104 * change attribute *before* we do the RPC call. 1038 1105 */ 1039 1106 lock_kernel(); 1040 - verifier = nfs_save_change_attribute(dir); 1041 1107 ret = nfs4_open_revalidate(dir, dentry, openflags, nd); 1042 - if (!ret) 1043 - nfs_refresh_verifier(dentry, verifier); 1044 1108 unlock_kernel(); 1045 1109 out: 1046 1110 dput(parent); ··· 1063 1133 .len = entry->len, 1064 1134 }; 1065 1135 struct inode *inode; 1136 + unsigned long verf = nfs_save_change_attribute(dir); 1066 1137 1067 1138 switch (name.len) { 1068 1139 case 2: ··· 1074 1143 if (name.name[0] == '.') 1075 1144 return dget(parent); 1076 1145 } 1146 + 1147 + spin_lock(&dir->i_lock); 1148 + if (NFS_I(dir)->cache_validity & NFS_INO_INVALID_DATA) { 1149 + spin_unlock(&dir->i_lock); 1150 + return NULL; 1151 + } 1152 + spin_unlock(&dir->i_lock); 1153 + 1077 1154 name.hash = full_name_hash(name.name, name.len); 1078 1155 dentry = d_lookup(parent, &name); 1079 1156 if (dentry != NULL) { ··· 1122 1183 dentry = alias; 1123 1184 } 1124 1185 1125 - nfs_renew_times(dentry); 1126 - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1127 - return dentry; 1128 1186 out_renew: 1129 - nfs_renew_times(dentry); 1130 - nfs_refresh_verifier(dentry, nfs_save_change_attribute(dir)); 1187 + nfs_set_verifier(dentry, verf); 1131 1188 return dentry; 1132 1189 } 1133 1190 ··· 1133 1198 int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, 1134 1199 struct nfs_fattr *fattr) 1135 1200 { 1201 + struct dentry *parent = dget_parent(dentry); 1202 + struct inode *dir = parent->d_inode; 1136 1203 struct inode *inode; 1137 1204 int error = -EACCES; 1138 1205 1206 + d_drop(dentry); 1207 + 1139 1208 /* We may have been initialized further down */ 1140 1209 if (dentry->d_inode) 1141 - return 0; 1210 + goto out; 1142 1211 if (fhandle->size == 0) { 1143 - struct inode *dir = dentry->d_parent->d_inode; 1144 1212 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1145 1213 if (error) 1146 - return error; 1214 + goto out_error; 1147 1215 } 1216 + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1148 1217 if (!(fattr->valid & NFS_ATTR_FATTR)) { 1149 1218 struct nfs_server *server = NFS_SB(dentry->d_sb); 1150 1219 error = server->nfs_client->rpc_ops->getattr(server, fhandle, fattr); 1151 1220 if (error < 0) 1152 - return error; 1221 + goto out_error; 1153 1222 } 1154 1223 inode = nfs_fhget(dentry->d_sb, fhandle, fattr); 1155 1224 error = PTR_ERR(inode); 1156 1225 if (IS_ERR(inode)) 1157 - return error; 1158 - d_instantiate(dentry, inode); 1159 - if (d_unhashed(dentry)) 1160 - d_rehash(dentry); 1226 + goto out_error; 1227 + d_add(dentry, inode); 1228 + out: 1229 + dput(parent); 1161 1230 return 0; 1231 + out_error: 1232 + nfs_mark_for_revalidate(dir); 1233 + dput(parent); 1234 + return error; 1162 1235 } 1163 1236 1164 1237 /* ··· 1192 1249 open_flags = nd->intent.open.flags; 1193 1250 1194 1251 lock_kernel(); 1195 - nfs_begin_data_update(dir); 1196 1252 error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd); 1197 - nfs_end_data_update(dir); 1198 1253 if (error != 0) 1199 1254 goto out_err; 1200 - nfs_renew_times(dentry); 1201 - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1202 1255 unlock_kernel(); 1203 1256 
return 0; 1204 1257 out_err: ··· 1222 1283 attr.ia_valid = ATTR_MODE; 1223 1284 1224 1285 lock_kernel(); 1225 - nfs_begin_data_update(dir); 1226 1286 status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev); 1227 - nfs_end_data_update(dir); 1228 1287 if (status != 0) 1229 1288 goto out_err; 1230 - nfs_renew_times(dentry); 1231 - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1232 1289 unlock_kernel(); 1233 1290 return 0; 1234 1291 out_err: ··· 1248 1313 attr.ia_mode = mode | S_IFDIR; 1249 1314 1250 1315 lock_kernel(); 1251 - nfs_begin_data_update(dir); 1252 1316 error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr); 1253 - nfs_end_data_update(dir); 1254 1317 if (error != 0) 1255 1318 goto out_err; 1256 - nfs_renew_times(dentry); 1257 - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1258 1319 unlock_kernel(); 1259 1320 return 0; 1260 1321 out_err: ··· 1267 1336 dir->i_sb->s_id, dir->i_ino, dentry->d_name.name); 1268 1337 1269 1338 lock_kernel(); 1270 - nfs_begin_data_update(dir); 1271 1339 error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); 1272 1340 /* Ensure the VFS deletes this inode */ 1273 1341 if (error == 0 && dentry->d_inode != NULL) 1274 1342 clear_nlink(dentry->d_inode); 1275 - nfs_end_data_update(dir); 1276 1343 unlock_kernel(); 1277 1344 1278 1345 return error; ··· 1279 1350 static int nfs_sillyrename(struct inode *dir, struct dentry *dentry) 1280 1351 { 1281 1352 static unsigned int sillycounter; 1282 - const int i_inosize = sizeof(dir->i_ino)*2; 1353 + const int fileidsize = sizeof(NFS_FILEID(dentry->d_inode))*2; 1283 1354 const int countersize = sizeof(sillycounter)*2; 1284 - const int slen = sizeof(".nfs") + i_inosize + countersize - 1; 1355 + const int slen = sizeof(".nfs")+fileidsize+countersize-1; 1285 1356 char silly[slen+1]; 1286 1357 struct qstr qsilly; 1287 1358 struct dentry *sdentry; ··· 1299 1370 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 1300 1371 goto out; 1301 1372 1302 - sprintf(silly, ".nfs%*.*lx", 1303 - i_inosize, i_inosize, dentry->d_inode->i_ino); 1373 + sprintf(silly, ".nfs%*.*Lx", 1374 + fileidsize, fileidsize, 1375 + (unsigned long long)NFS_FILEID(dentry->d_inode)); 1304 1376 1305 1377 /* Return delegation in anticipation of the rename */ 1306 1378 nfs_inode_return_delegation(dentry->d_inode); ··· 1328 1398 1329 1399 qsilly.name = silly; 1330 1400 qsilly.len = strlen(silly); 1331 - nfs_begin_data_update(dir); 1332 1401 if (dentry->d_inode) { 1333 - nfs_begin_data_update(dentry->d_inode); 1334 1402 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, 1335 1403 dir, &qsilly); 1336 1404 nfs_mark_for_revalidate(dentry->d_inode); 1337 - nfs_end_data_update(dentry->d_inode); 1338 1405 } else 1339 1406 error = NFS_PROTO(dir)->rename(dir, &dentry->d_name, 1340 1407 dir, &qsilly); 1341 - nfs_end_data_update(dir); 1342 1408 if (!error) { 1343 - nfs_renew_times(dentry); 1344 1409 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1345 1410 d_move(dentry, sdentry); 1346 1411 error = nfs_async_unlink(dir, dentry); ··· 1368 1443 goto out; 1369 1444 } 1370 1445 1371 - nfs_begin_data_update(dir); 1372 1446 if (inode != NULL) { 1373 1447 nfs_inode_return_delegation(inode); 1374 - nfs_begin_data_update(inode); 1375 1448 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1376 1449 /* The VFS may want to delete this inode */ 1377 1450 if (error == 0) 1378 1451 drop_nlink(inode); 1379 1452 nfs_mark_for_revalidate(inode); 1380 - nfs_end_data_update(inode); 1381 1453 } else 1382 1454 error = NFS_PROTO(dir)->remove(dir, &dentry->d_name); 1383 - 
nfs_end_data_update(dir); 1384 1455 out: 1385 1456 return error; 1386 1457 } ··· 1414 1493 spin_unlock(&dcache_lock); 1415 1494 error = nfs_safe_remove(dentry); 1416 1495 if (!error) { 1417 - nfs_renew_times(dentry); 1418 1496 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1419 1497 } else if (need_rehash) 1420 1498 d_rehash(dentry); ··· 1468 1548 memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); 1469 1549 kunmap_atomic(kaddr, KM_USER0); 1470 1550 1471 - nfs_begin_data_update(dir); 1472 1551 error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); 1473 - nfs_end_data_update(dir); 1474 1552 if (error != 0) { 1475 1553 dfprintk(VFS, "NFS: symlink(%s/%ld, %s, %s) error %d\n", 1476 1554 dir->i_sb->s_id, dir->i_ino, ··· 1508 1590 dentry->d_parent->d_name.name, dentry->d_name.name); 1509 1591 1510 1592 lock_kernel(); 1511 - nfs_begin_data_update(dir); 1512 - nfs_begin_data_update(inode); 1593 + d_drop(dentry); 1513 1594 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1514 1595 if (error == 0) { 1515 1596 atomic_inc(&inode->i_count); 1516 - d_instantiate(dentry, inode); 1597 + d_add(dentry, inode); 1517 1598 } 1518 - nfs_end_data_update(inode); 1519 - nfs_end_data_update(dir); 1520 1599 unlock_kernel(); 1521 1600 return error; 1522 1601 } ··· 1616 1701 d_delete(new_dentry); 1617 1702 } 1618 1703 1619 - nfs_begin_data_update(old_dir); 1620 - nfs_begin_data_update(new_dir); 1621 - nfs_begin_data_update(old_inode); 1622 1704 error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name, 1623 1705 new_dir, &new_dentry->d_name); 1624 1706 nfs_mark_for_revalidate(old_inode); 1625 - nfs_end_data_update(old_inode); 1626 - nfs_end_data_update(new_dir); 1627 - nfs_end_data_update(old_dir); 1628 1707 out: 1629 1708 if (rehash) 1630 1709 d_rehash(rehash); 1631 1710 if (!error) { 1632 1711 d_move(old_dentry, new_dentry); 1633 - nfs_renew_times(new_dentry); 1634 - nfs_refresh_verifier(new_dentry, nfs_save_change_attribute(new_dir)); 1712 + nfs_set_verifier(new_dentry, 1713 + nfs_save_change_attribute(new_dir)); 1635 1714 } 1636 1715 1637 1716 /* new dentry created? 
*/ ··· 1751 1842 return NULL; 1752 1843 } 1753 1844 1754 - int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) 1845 + static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) 1755 1846 { 1756 1847 struct nfs_inode *nfsi = NFS_I(inode); 1757 1848 struct nfs_access_entry *cache; ··· 1763 1854 cache = nfs_access_search_rbtree(inode, cred); 1764 1855 if (cache == NULL) 1765 1856 goto out; 1766 - if (time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))) 1857 + if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1767 1858 goto out_stale; 1768 1859 res->jiffies = cache->jiffies; 1769 1860 res->cred = cache->cred; ··· 1818 1909 nfs_access_free_entry(entry); 1819 1910 } 1820 1911 1821 - void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) 1912 + static void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) 1822 1913 { 1823 1914 struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); 1824 1915 if (cache == NULL) ··· 1864 1955 if ((cache.mask & mask) == mask) 1865 1956 return 0; 1866 1957 return -EACCES; 1958 + } 1959 + 1960 + static int nfs_open_permission_mask(int openflags) 1961 + { 1962 + int mask = 0; 1963 + 1964 + if (openflags & FMODE_READ) 1965 + mask |= MAY_READ; 1966 + if (openflags & FMODE_WRITE) 1967 + mask |= MAY_WRITE; 1968 + if (openflags & FMODE_EXEC) 1969 + mask |= MAY_EXEC; 1970 + return mask; 1971 + } 1972 + 1973 + int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) 1974 + { 1975 + return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags)); 1867 1976 } 1868 1977 1869 1978 int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+2 -6
fs/nfs/direct.c
··· 368 368 return -ENOMEM; 369 369 370 370 dreq->inode = inode; 371 - dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); 371 + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 372 372 if (!is_sync_kiocb(iocb)) 373 373 dreq->iocb = iocb; 374 374 ··· 510 510 nfs_direct_write_reschedule(dreq); 511 511 break; 512 512 default: 513 - nfs_end_data_update(inode); 514 513 if (dreq->commit_data != NULL) 515 514 nfs_commit_free(dreq->commit_data); 516 515 nfs_direct_free_writedata(dreq); ··· 532 533 533 534 static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) 534 535 { 535 - nfs_end_data_update(inode); 536 536 nfs_direct_free_writedata(dreq); 537 537 nfs_zap_mapping(inode, inode->i_mapping); 538 538 nfs_direct_complete(dreq); ··· 716 718 sync = FLUSH_STABLE; 717 719 718 720 dreq->inode = inode; 719 - dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data); 721 + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 720 722 if (!is_sync_kiocb(iocb)) 721 723 dreq->iocb = iocb; 722 724 723 725 nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count); 724 - 725 - nfs_begin_data_update(inode); 726 726 727 727 rpc_clnt_sigmask(clnt, &oldset); 728 728 result = nfs_direct_write_schedule(dreq, user_addr, count, pos, sync);
+80 -25
fs/nfs/file.c
··· 33 33 #include <asm/system.h> 34 34 35 35 #include "delegation.h" 36 + #include "internal.h" 36 37 #include "iostat.h" 37 38 38 39 #define NFSDBG_FACILITY NFSDBG_FILE ··· 55 54 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 56 55 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 57 56 static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); 57 + 58 + static struct vm_operations_struct nfs_file_vm_ops; 58 59 59 60 const struct file_operations nfs_file_operations = { 60 61 .llseek = nfs_file_llseek, ··· 177 174 } 178 175 179 176 /* 177 + * Helper for nfs_file_flush() and nfs_fsync() 178 + * 179 + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to 180 + * disk, but it retrieves and clears ctx->error after synching, despite 181 + * the two being set at the same time in nfs_context_set_write_error(). 182 + * This is because the former is used to notify the _next_ call to 183 + * nfs_file_write() that a write error occured, and hence cause it to 184 + * fall back to doing a synchronous write. 185 + */ 186 + static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) 187 + { 188 + int have_error, status; 189 + int ret = 0; 190 + 191 + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 192 + status = nfs_wb_all(inode); 193 + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 194 + if (have_error) 195 + ret = xchg(&ctx->error, 0); 196 + if (!ret) 197 + ret = status; 198 + return ret; 199 + } 200 + 201 + /* 180 202 * Flush all dirty pages, and check for write errors. 181 203 * 182 204 */ 183 205 static int 184 206 nfs_file_flush(struct file *file, fl_owner_t id) 185 207 { 186 - struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; 208 + struct nfs_open_context *ctx = nfs_file_open_context(file); 187 209 struct inode *inode = file->f_path.dentry->d_inode; 188 210 int status; 189 211 ··· 217 189 if ((file->f_mode & FMODE_WRITE) == 0) 218 190 return 0; 219 191 nfs_inc_stats(inode, NFSIOS_VFSFLUSH); 220 - lock_kernel(); 192 + 221 193 /* Ensure that data+attribute caches are up to date after close() */ 222 - status = nfs_wb_all(inode); 223 - if (!status) { 224 - status = ctx->error; 225 - ctx->error = 0; 226 - if (!status) 227 - nfs_revalidate_inode(NFS_SERVER(inode), inode); 228 - } 229 - unlock_kernel(); 194 + status = nfs_do_fsync(ctx, inode); 195 + if (!status) 196 + nfs_revalidate_inode(NFS_SERVER(inode), inode); 230 197 return status; 231 198 } 232 199 ··· 280 257 dentry->d_parent->d_name.name, dentry->d_name.name); 281 258 282 259 status = nfs_revalidate_mapping(inode, file->f_mapping); 283 - if (!status) 284 - status = generic_file_mmap(file, vma); 260 + if (!status) { 261 + vma->vm_ops = &nfs_file_vm_ops; 262 + vma->vm_flags |= VM_CAN_NONLINEAR; 263 + file_accessed(file); 264 + } 285 265 return status; 286 266 } 287 267 ··· 296 270 static int 297 271 nfs_fsync(struct file *file, struct dentry *dentry, int datasync) 298 272 { 299 - struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; 273 + struct nfs_open_context *ctx = nfs_file_open_context(file); 300 274 struct inode *inode = dentry->d_inode; 301 - int status; 302 275 303 276 dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); 304 277 305 278 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 306 - lock_kernel(); 307 - status = nfs_wb_all(inode); 308 - if (!status) { 309 - status = ctx->error; 310 - ctx->error = 0; 311 - } 312 - unlock_kernel(); 313 - 
return status; 279 + return nfs_do_fsync(ctx, inode); 314 280 } 315 281 316 282 /* ··· 351 333 const struct address_space_operations nfs_file_aops = { 352 334 .readpage = nfs_readpage, 353 335 .readpages = nfs_readpages, 354 - .set_page_dirty = nfs_set_page_dirty, 336 + .set_page_dirty = __set_page_dirty_nobuffers, 355 337 .writepage = nfs_writepage, 356 338 .writepages = nfs_writepages, 357 339 .prepare_write = nfs_prepare_write, ··· 363 345 #endif 364 346 .launder_page = nfs_launder_page, 365 347 }; 348 + 349 + static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) 350 + { 351 + struct file *filp = vma->vm_file; 352 + unsigned pagelen; 353 + int ret = -EINVAL; 354 + 355 + lock_page(page); 356 + if (page->mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping) 357 + goto out_unlock; 358 + pagelen = nfs_page_length(page); 359 + if (pagelen == 0) 360 + goto out_unlock; 361 + ret = nfs_prepare_write(filp, page, 0, pagelen); 362 + if (!ret) 363 + ret = nfs_commit_write(filp, page, 0, pagelen); 364 + out_unlock: 365 + unlock_page(page); 366 + return ret; 367 + } 368 + 369 + static struct vm_operations_struct nfs_file_vm_ops = { 370 + .fault = filemap_fault, 371 + .page_mkwrite = nfs_vm_page_mkwrite, 372 + }; 373 + 374 + static int nfs_need_sync_write(struct file *filp, struct inode *inode) 375 + { 376 + struct nfs_open_context *ctx; 377 + 378 + if (IS_SYNC(inode) || (filp->f_flags & O_SYNC)) 379 + return 1; 380 + ctx = nfs_file_open_context(filp); 381 + if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags)) 382 + return 1; 383 + return 0; 384 + } 366 385 367 386 static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, 368 387 unsigned long nr_segs, loff_t pos) ··· 437 382 nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, count); 438 383 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 439 384 /* Return error values for O_SYNC and IS_SYNC() */ 440 - if (result >= 0 && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_SYNC))) { 441 - int err = nfs_fsync(iocb->ki_filp, dentry, 1); 385 + if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 386 + int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); 442 387 if (err < 0) 443 388 result = err; 444 389 }
+142 -131
fs/nfs/inode.c
··· 49 49 50 50 #define NFSDBG_FACILITY NFSDBG_VFS 51 51 52 + #define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 53 + 54 + /* Default is to see 64-bit inode numbers */ 55 + static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; 56 + 52 57 static void nfs_invalidate_inode(struct inode *); 53 58 static int nfs_update_inode(struct inode *, struct nfs_fattr *); 54 59 ··· 65 60 nfs_fattr_to_ino_t(struct nfs_fattr *fattr) 66 61 { 67 62 return nfs_fileid_to_ino_t(fattr->fileid); 63 + } 64 + 65 + /** 66 + * nfs_compat_user_ino64 - returns the user-visible inode number 67 + * @fileid: 64-bit fileid 68 + * 69 + * This function returns a 32-bit inode number if the boot parameter 70 + * nfs.enable_ino64 is zero. 71 + */ 72 + u64 nfs_compat_user_ino64(u64 fileid) 73 + { 74 + int ino; 75 + 76 + if (enable_ino64) 77 + return fileid; 78 + ino = fileid; 79 + if (sizeof(ino) < sizeof(fileid)) 80 + ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; 81 + return ino; 68 82 } 69 83 70 84 int nfs_write_inode(struct inode *inode, int sync) ··· 109 85 */ 110 86 BUG_ON(nfs_have_writebacks(inode)); 111 87 BUG_ON(!list_empty(&NFS_I(inode)->open_files)); 112 - BUG_ON(atomic_read(&NFS_I(inode)->data_updates) != 0); 113 88 nfs_zap_acl_cache(inode); 114 89 nfs_access_zap_cache(inode); 115 90 } ··· 141 118 142 119 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 143 120 144 - NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); 145 - NFS_ATTRTIMEO_UPDATE(inode) = jiffies; 121 + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 122 + nfsi->attrtimeo_timestamp = jiffies; 146 123 147 124 memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); 148 125 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) ··· 176 153 clear_acl_cache(inode); 177 154 spin_lock(&inode->i_lock); 178 155 NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; 156 + spin_unlock(&inode->i_lock); 157 + } 158 + 159 + void nfs_invalidate_atime(struct inode *inode) 160 + { 161 + spin_lock(&inode->i_lock); 162 + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; 179 163 spin_unlock(&inode->i_lock); 180 164 } 181 165 ··· 368 338 return 0; 369 339 370 340 lock_kernel(); 371 - nfs_begin_data_update(inode); 372 341 /* Write all dirty data */ 373 342 if (S_ISREG(inode->i_mode)) { 374 343 filemap_write_and_wait(inode->i_mapping); ··· 381 352 error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); 382 353 if (error == 0) 383 354 nfs_refresh_inode(inode, &fattr); 384 - nfs_end_data_update(inode); 385 355 unlock_kernel(); 386 356 return error; 387 357 } ··· 459 431 460 432 /* Flush out writes to the server in order to update c/mtime */ 461 433 if (S_ISREG(inode->i_mode)) 462 - nfs_sync_mapping_range(inode->i_mapping, 0, 0, FLUSH_NOCOMMIT); 434 + nfs_wb_nocommit(inode); 463 435 464 436 /* 465 437 * We may force a getattr if the user cares about atime. 
··· 478 450 err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 479 451 else 480 452 err = nfs_revalidate_inode(NFS_SERVER(inode), inode); 481 - if (!err) 453 + if (!err) { 482 454 generic_fillattr(inode, stat); 455 + stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); 456 + } 483 457 return err; 484 458 } 485 459 ··· 566 536 static void nfs_file_clear_open_context(struct file *filp) 567 537 { 568 538 struct inode *inode = filp->f_path.dentry->d_inode; 569 - struct nfs_open_context *ctx = (struct nfs_open_context *)filp->private_data; 539 + struct nfs_open_context *ctx = nfs_file_open_context(filp); 570 540 571 541 if (ctx) { 572 542 filp->private_data = NULL; ··· 628 598 status = nfs_wait_on_inode(inode); 629 599 if (status < 0) 630 600 goto out; 631 - if (NFS_STALE(inode)) { 632 - status = -ESTALE; 633 - /* Do we trust the cached ESTALE? */ 634 - if (NFS_ATTRTIMEO(inode) != 0) { 635 - if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) { 636 - /* no */ 637 - } else 638 - goto out; 639 - } 640 - } 601 + 602 + status = -ESTALE; 603 + if (NFS_STALE(inode)) 604 + goto out; 641 605 642 606 status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); 643 607 if (status != 0) { ··· 678 654 679 655 if (nfs_have_delegation(inode, FMODE_READ)) 680 656 return 0; 681 - return time_after(jiffies, nfsi->read_cache_jiffies+nfsi->attrtimeo); 657 + return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 682 658 } 683 659 684 660 /** ··· 707 683 } 708 684 spin_lock(&inode->i_lock); 709 685 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; 710 - if (S_ISDIR(inode->i_mode)) { 686 + if (S_ISDIR(inode->i_mode)) 711 687 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 712 - /* This ensures we revalidate child dentries */ 713 - nfsi->cache_change_attribute = jiffies; 714 - } 715 688 spin_unlock(&inode->i_lock); 716 689 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 717 690 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", ··· 777 756 return ret; 778 757 } 779 758 780 - /** 781 - * nfs_begin_data_update 782 - * @inode - pointer to inode 783 - * Declare that a set of operations will update file data on the server 784 - */ 785 - void nfs_begin_data_update(struct inode *inode) 786 - { 787 - atomic_inc(&NFS_I(inode)->data_updates); 788 - } 789 - 790 - /** 791 - * nfs_end_data_update 792 - * @inode - pointer to inode 793 - * Declare end of the operations that will update file data 794 - * This will mark the inode as immediately needing revalidation 795 - * of its attribute cache. 
796 - */ 797 - void nfs_end_data_update(struct inode *inode) 798 - { 799 - struct nfs_inode *nfsi = NFS_I(inode); 800 - 801 - /* Directories: invalidate page cache */ 802 - if (S_ISDIR(inode->i_mode)) { 803 - spin_lock(&inode->i_lock); 804 - nfsi->cache_validity |= NFS_INO_INVALID_DATA; 805 - spin_unlock(&inode->i_lock); 806 - } 807 - nfsi->cache_change_attribute = jiffies; 808 - atomic_dec(&nfsi->data_updates); 809 - } 810 - 811 759 static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) 812 760 { 813 761 struct nfs_inode *nfsi = NFS_I(inode); 814 - unsigned long now = jiffies; 815 762 763 + if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 && 764 + nfsi->change_attr == fattr->pre_change_attr) { 765 + nfsi->change_attr = fattr->change_attr; 766 + if (S_ISDIR(inode->i_mode)) 767 + nfsi->cache_validity |= NFS_INO_INVALID_DATA; 768 + } 816 769 /* If we have atomic WCC data, we may update some attributes */ 817 770 if ((fattr->valid & NFS_ATTR_WCC) != 0) { 818 - if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { 771 + if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) 819 772 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 820 - nfsi->cache_change_attribute = now; 821 - } 822 773 if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { 823 774 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 824 - nfsi->cache_change_attribute = now; 775 + if (S_ISDIR(inode->i_mode)) 776 + nfsi->cache_validity |= NFS_INO_INVALID_DATA; 825 777 } 826 - if (inode->i_size == fattr->pre_size && nfsi->npages == 0) { 778 + if (inode->i_size == fattr->pre_size && nfsi->npages == 0) 827 779 inode->i_size = fattr->size; 828 - nfsi->cache_change_attribute = now; 829 - } 830 780 } 831 781 } 832 782 ··· 814 822 { 815 823 struct nfs_inode *nfsi = NFS_I(inode); 816 824 loff_t cur_size, new_isize; 817 - int data_unstable; 825 + unsigned long invalid = 0; 818 826 819 827 820 828 /* Has the inode gone and changed behind our back? */ ··· 823 831 return -EIO; 824 832 } 825 833 826 - /* Are we in the process of updating data on the server? */ 827 - data_unstable = nfs_caches_unstable(inode); 828 - 829 834 /* Do atomic weak cache consistency updates */ 830 835 nfs_wcc_update_inode(inode, fattr); 831 836 832 837 if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 833 838 nfsi->change_attr != fattr->change_attr) 834 - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 839 + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 835 840 836 841 /* Verify a few of the more important attributes */ 837 842 if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) 838 - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 843 + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 839 844 840 845 cur_size = i_size_read(inode); 841 846 new_isize = nfs_size_to_loff_t(fattr->size); 842 847 if (cur_size != new_isize && nfsi->npages == 0) 843 - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 848 + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 844 849 845 850 /* Have any file permissions changed? */ 846 851 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) 847 852 || inode->i_uid != fattr->uid 848 853 || inode->i_gid != fattr->gid) 849 - nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; 854 + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; 850 855 851 856 /* Has the link count changed? 
*/ 852 857 if (inode->i_nlink != fattr->nlink) 853 - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; 858 + invalid |= NFS_INO_INVALID_ATTR; 854 859 855 860 if (!timespec_equal(&inode->i_atime, &fattr->atime)) 856 - nfsi->cache_validity |= NFS_INO_INVALID_ATIME; 861 + invalid |= NFS_INO_INVALID_ATIME; 862 + 863 + if (invalid != 0) 864 + nfsi->cache_validity |= invalid; 865 + else 866 + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR 867 + | NFS_INO_INVALID_ATIME 868 + | NFS_INO_REVAL_PAGECACHE); 857 869 858 870 nfsi->read_cache_jiffies = fattr->time_start; 859 871 return 0; ··· 907 911 int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) 908 912 { 909 913 struct nfs_inode *nfsi = NFS_I(inode); 910 - int status = 0; 911 914 912 915 spin_lock(&inode->i_lock); 913 - if (unlikely((fattr->valid & NFS_ATTR_FATTR) == 0)) { 914 - nfsi->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 915 - goto out; 916 - } 917 - status = nfs_update_inode(inode, fattr); 918 - out: 916 + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 917 + if (S_ISDIR(inode->i_mode)) 918 + nfsi->cache_validity |= NFS_INO_INVALID_DATA; 919 919 spin_unlock(&inode->i_lock); 920 - return status; 920 + return nfs_refresh_inode(inode, fattr); 921 + } 922 + 923 + /** 924 + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache 925 + * @inode - pointer to inode 926 + * @fattr - updated attributes 927 + * 928 + * After an operation that has changed the inode metadata, mark the 929 + * attribute cache as being invalid, then try to update it. Fake up 930 + * weak cache consistency data, if none exist. 931 + * 932 + * This function is mainly designed to be used by the ->write_done() functions. 933 + */ 934 + int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) 935 + { 936 + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 937 + (fattr->valid & NFS_ATTR_WCC_V4) == 0) { 938 + fattr->pre_change_attr = NFS_I(inode)->change_attr; 939 + fattr->valid |= NFS_ATTR_WCC_V4; 940 + } 941 + if ((fattr->valid & NFS_ATTR_FATTR) != 0 && 942 + (fattr->valid & NFS_ATTR_WCC) == 0) { 943 + memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime)); 944 + memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime)); 945 + fattr->pre_size = inode->i_size; 946 + fattr->valid |= NFS_ATTR_WCC; 947 + } 948 + return nfs_post_op_update_inode(inode, fattr); 921 949 } 922 950 923 951 /* ··· 961 941 struct nfs_server *server; 962 942 struct nfs_inode *nfsi = NFS_I(inode); 963 943 loff_t cur_isize, new_isize; 964 - unsigned int invalid = 0; 944 + unsigned long invalid = 0; 965 945 unsigned long now = jiffies; 966 - int data_stable; 967 946 968 947 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", 969 948 __FUNCTION__, inode->i_sb->s_id, inode->i_ino, ··· 987 968 * Update the read time so we don't revalidate too often. 988 969 */ 989 970 nfsi->read_cache_jiffies = fattr->time_start; 990 - nfsi->last_updated = now; 991 971 992 - /* Fix a wraparound issue with nfsi->cache_change_attribute */ 993 - if (time_before(now, nfsi->cache_change_attribute)) 994 - nfsi->cache_change_attribute = now - 600*HZ; 995 - 996 - /* Are we racing with known updates of the metadata on the server? 
*/ 997 - data_stable = nfs_verify_change_attribute(inode, fattr->time_start); 998 - if (data_stable) 999 - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_ATIME); 972 + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME 973 + | NFS_INO_REVAL_PAGECACHE); 1000 974 1001 975 /* Do atomic weak cache consistency updates */ 1002 976 nfs_wcc_update_inode(inode, fattr); 977 + 978 + /* More cache consistency checks */ 979 + if (!(fattr->valid & NFS_ATTR_FATTR_V4)) { 980 + /* NFSv2/v3: Check if the mtime agrees */ 981 + if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { 982 + dprintk("NFS: mtime change on server for file %s/%ld\n", 983 + inode->i_sb->s_id, inode->i_ino); 984 + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 985 + nfsi->cache_change_attribute = now; 986 + } 987 + /* If ctime has changed we should definitely clear access+acl caches */ 988 + if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) 989 + invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 990 + } else if (nfsi->change_attr != fattr->change_attr) { 991 + dprintk("NFS: change_attr change on server for file %s/%ld\n", 992 + inode->i_sb->s_id, inode->i_ino); 993 + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 994 + nfsi->cache_change_attribute = now; 995 + } 1003 996 1004 997 /* Check if our cached file size is stale */ 1005 998 new_isize = nfs_size_to_loff_t(fattr->size); 1006 999 cur_isize = i_size_read(inode); 1007 1000 if (new_isize != cur_isize) { 1008 - /* Do we perhaps have any outstanding writes? */ 1009 - if (nfsi->npages == 0) { 1010 - /* No, but did we race with nfs_end_data_update()? */ 1011 - if (data_stable) { 1012 - inode->i_size = new_isize; 1013 - invalid |= NFS_INO_INVALID_DATA; 1014 - } 1015 - invalid |= NFS_INO_INVALID_ATTR; 1016 - } else if (new_isize > cur_isize) { 1001 + /* Do we perhaps have any outstanding writes, or has 1002 + * the file grown beyond our last write? 
*/ 1003 + if (nfsi->npages == 0 || new_isize > cur_isize) { 1017 1004 inode->i_size = new_isize; 1018 1005 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1019 1006 } 1020 - nfsi->cache_change_attribute = now; 1021 1007 dprintk("NFS: isize change on server for file %s/%ld\n", 1022 1008 inode->i_sb->s_id, inode->i_ino); 1023 1009 } 1024 1010 1025 - /* Check if the mtime agrees */ 1026 - if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { 1027 - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1028 - dprintk("NFS: mtime change on server for file %s/%ld\n", 1029 - inode->i_sb->s_id, inode->i_ino); 1030 - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1031 - nfsi->cache_change_attribute = now; 1032 - } 1033 1011 1034 - /* If ctime has changed we should definitely clear access+acl caches */ 1035 - if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { 1036 - invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1037 - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 1038 - nfsi->cache_change_attribute = now; 1039 - } 1012 + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1013 + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 1040 1014 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); 1015 + nfsi->change_attr = fattr->change_attr; 1041 1016 1042 1017 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || 1043 1018 inode->i_uid != fattr->uid || ··· 1052 1039 inode->i_blocks = fattr->du.nfs2.blocks; 1053 1040 } 1054 1041 1055 - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && 1056 - nfsi->change_attr != fattr->change_attr) { 1057 - dprintk("NFS: change_attr change on server for file %s/%ld\n", 1058 - inode->i_sb->s_id, inode->i_ino); 1059 - nfsi->change_attr = fattr->change_attr; 1060 - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1061 - nfsi->cache_change_attribute = now; 1062 - } 1063 - 1064 1042 /* Update attrtimeo value if we're out of the unstable period */ 1065 1043 if (invalid & NFS_INO_INVALID_ATTR) { 1066 1044 nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); 1067 1045 nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); 1068 1046 nfsi->attrtimeo_timestamp = now; 1069 - } else if (time_after(now, nfsi->attrtimeo_timestamp+nfsi->attrtimeo)) { 1070 - if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) 1071 - nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); 1072 - nfsi->attrtimeo_timestamp = now; 1047 + nfsi->last_updated = now; 1048 + } else { 1049 + if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { 1050 + if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) 1051 + nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); 1052 + nfsi->attrtimeo_timestamp = now; 1053 + } 1054 + /* 1055 + * Avoid jiffy wraparound issues with nfsi->last_updated 1056 + */ 1057 + if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now)) 1058 + nfsi->last_updated = nfsi->read_cache_jiffies; 1073 1059 } 1060 + invalid &= ~NFS_INO_INVALID_ATTR; 1074 1061 /* Don't invalidate the data if we were to blame */ 1075 1062 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 1076 1063 || S_ISLNK(inode->i_mode))) 1077 1064 invalid &= ~NFS_INO_INVALID_DATA; 1078 - if (data_stable) 1079 - invalid &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME|NFS_INO_REVAL_PAGECACHE); 1080 1065 if (!nfs_have_delegation(inode, FMODE_READ) || 1081 1066 (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) 1082 1067 nfsi->cache_validity |= invalid; ··· 1163 1152 
INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); 1164 1153 INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); 1165 1154 INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC); 1166 - atomic_set(&nfsi->data_updates, 0); 1167 1155 nfsi->ncommit = 0; 1168 1156 nfsi->npages = 0; 1169 1157 nfs4_init_once(nfsi); ··· 1259 1249 /* Not quite true; I just maintain it */ 1260 1250 MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); 1261 1251 MODULE_LICENSE("GPL"); 1252 + module_param(enable_ino64, bool, 0644); 1262 1253 1263 1254 module_init(init_nfs_fs) 1264 1255 module_exit(exit_nfs_fs)
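As a worked example of the folding done by the new nfs_compat_user_ino64() helper above, here is a small userspace sketch (plain C, not kernel code) of the same arithmetic: when enable_ino64 is zero, the upper 32 bits of the 64-bit fileid are XORed into the lower 32 bits instead of simply being dropped by truncation.

    /* Userspace sketch of the nfs_compat_user_ino64() fold (enable_ino64 == 0). */
    #include <stdio.h>
    #include <stdint.h>

    static uint32_t fold_fileid(uint64_t fileid)
    {
            /* low 32 bits XOR high 32 bits, as in the kernel helper */
            return (uint32_t)fileid ^ (uint32_t)(fileid >> 32);
    }

    int main(void)
    {
            uint64_t fileid = 0x123456789abcdef0ULL;

            /* prints: fileid 0x123456789abcdef0 -> ino 0x88888888 */
            printf("fileid %#llx -> ino %#x\n",
                   (unsigned long long)fileid,
                   (unsigned int)fold_fileid(fileid));
            return 0;
    }

So for a fileid of 0x123456789abcdef0, stat() and readdir() would report 0x9abcdef0 ^ 0x12345678 = 0x88888888 as the inode number.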
+39 -11
fs/nfs/internal.h
··· 5 5 #include <linux/mount.h> 6 6 7 7 struct nfs_string; 8 - struct nfs_mount_data; 9 - struct nfs4_mount_data; 10 8 11 9 /* Maximum number of readahead requests 12 10 * FIXME: this should really be a sysctl so that users may tune it to suit ··· 25 27 rpc_authflavor_t authflavor; 26 28 }; 27 29 30 + /* 31 + * In-kernel mount arguments 32 + */ 33 + struct nfs_parsed_mount_data { 34 + int flags; 35 + int rsize, wsize; 36 + int timeo, retrans; 37 + int acregmin, acregmax, 38 + acdirmin, acdirmax; 39 + int namlen; 40 + unsigned int bsize; 41 + unsigned int auth_flavor_len; 42 + rpc_authflavor_t auth_flavors[1]; 43 + char *client_address; 44 + 45 + struct { 46 + struct sockaddr_in address; 47 + char *hostname; 48 + unsigned int program; 49 + unsigned int version; 50 + unsigned short port; 51 + int protocol; 52 + } mount_server; 53 + 54 + struct { 55 + struct sockaddr_in address; 56 + char *hostname; 57 + char *export_path; 58 + unsigned int program; 59 + int protocol; 60 + } nfs_server; 61 + }; 62 + 28 63 /* client.c */ 29 64 extern struct rpc_program nfs_program; 30 65 31 66 extern void nfs_put_client(struct nfs_client *); 32 67 extern struct nfs_client *nfs_find_client(const struct sockaddr_in *, int); 33 - extern struct nfs_server *nfs_create_server(const struct nfs_mount_data *, 34 - struct nfs_fh *); 35 - extern struct nfs_server *nfs4_create_server(const struct nfs4_mount_data *, 36 - const char *, 37 - const struct sockaddr_in *, 38 - const char *, 39 - const char *, 40 - rpc_authflavor_t, 41 - struct nfs_fh *); 68 + extern struct nfs_server *nfs_create_server( 69 + const struct nfs_parsed_mount_data *, 70 + struct nfs_fh *); 71 + extern struct nfs_server *nfs4_create_server( 72 + const struct nfs_parsed_mount_data *, 73 + struct nfs_fh *); 42 74 extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *, 43 75 struct nfs_fh *); 44 76 extern void nfs_free_server(struct nfs_server *server);
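
Note: the new struct nfs_parsed_mount_data above collapses the separate nfs_mount_data / nfs4_mount_data argument lists into one in-kernel structure that nfs_create_server() and nfs4_create_server() now take. Below is a reduced, userspace-only sketch of that pattern; the field subset, buffer sizes and the create_server() helper are illustrative stand-ins, not the kernel API.

/*
 * Reduced sketch of the consolidation made in internal.h: one parsed-options
 * structure carries both the mount-server and NFS-server halves instead of a
 * long positional argument list.
 */
#include <stdio.h>
#include <string.h>

struct parsed_mount_data {
	int timeo, retrans;
	struct { char hostname[64]; unsigned short port; } mount_server;
	struct { char hostname[64]; char export_path[128]; } nfs_server;
};

/* stand-in for the nfs_create_server(args, fh) style of call */
static int create_server(const struct parsed_mount_data *args)
{
	printf("mounting %s:%s (timeo=%d, retrans=%d)\n",
	       args->nfs_server.hostname, args->nfs_server.export_path,
	       args->timeo, args->retrans);
	return 0;
}

int main(void)
{
	struct parsed_mount_data args = { .timeo = 600, .retrans = 2 };

	strcpy(args.nfs_server.hostname, "server.example.com");
	strcpy(args.nfs_server.export_path, "/export/home");
	return create_server(&args);
}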
+11 -9
fs/nfs/nfs2xdr.c
··· 251 251 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS_readres_sz) << 2; 252 252 xdr_inline_pages(&req->rq_rcv_buf, replen, 253 253 args->pages, args->pgbase, count); 254 + req->rq_rcv_buf.flags |= XDRBUF_READ; 254 255 return 0; 255 256 } 256 257 ··· 272 271 res->eof = 0; 273 272 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 274 273 if (iov->iov_len < hdrlen) { 275 - printk(KERN_WARNING "NFS: READ reply header overflowed:" 274 + dprintk("NFS: READ reply header overflowed:" 276 275 "length %d > %Zu\n", hdrlen, iov->iov_len); 277 276 return -errno_NFSERR_IO; 278 277 } else if (iov->iov_len != hdrlen) { ··· 282 281 283 282 recvd = req->rq_rcv_buf.len - hdrlen; 284 283 if (count > recvd) { 285 - printk(KERN_WARNING "NFS: server cheating in read reply: " 284 + dprintk("NFS: server cheating in read reply: " 286 285 "count %d > recvd %d\n", count, recvd); 287 286 count = recvd; 288 287 } ··· 314 313 315 314 /* Copy the page array */ 316 315 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); 316 + sndbuf->flags |= XDRBUF_WRITE; 317 317 return 0; 318 318 } 319 319 ··· 433 431 434 432 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 435 433 if (iov->iov_len < hdrlen) { 436 - printk(KERN_WARNING "NFS: READDIR reply header overflowed:" 434 + dprintk("NFS: READDIR reply header overflowed:" 437 435 "length %d > %Zu\n", hdrlen, iov->iov_len); 438 436 return -errno_NFSERR_IO; 439 437 } else if (iov->iov_len != hdrlen) { ··· 456 454 len = ntohl(*p++); 457 455 p += XDR_QUADLEN(len) + 1; /* name plus cookie */ 458 456 if (len > NFS2_MAXNAMLEN) { 459 - printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)!\n", 457 + dprintk("NFS: giant filename in readdir (len 0x%x)!\n", 460 458 len); 461 459 goto err_unmap; 462 460 } ··· 473 471 entry[0] = entry[1] = 0; 474 472 /* truncate listing ? */ 475 473 if (!nr) { 476 - printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); 474 + dprintk("NFS: readdir reply truncated!\n"); 477 475 entry[1] = 1; 478 476 } 479 477 goto out; ··· 585 583 /* Convert length of symlink */ 586 584 len = ntohl(*p++); 587 585 if (len >= rcvbuf->page_len || len <= 0) { 588 - dprintk(KERN_WARNING "nfs: server returned giant symlink!\n"); 586 + dprintk("nfs: server returned giant symlink!\n"); 589 587 return -ENAMETOOLONG; 590 588 } 591 589 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 592 590 if (iov->iov_len < hdrlen) { 593 - printk(KERN_WARNING "NFS: READLINK reply header overflowed:" 591 + dprintk("NFS: READLINK reply header overflowed:" 594 592 "length %d > %Zu\n", hdrlen, iov->iov_len); 595 593 return -errno_NFSERR_IO; 596 594 } else if (iov->iov_len != hdrlen) { ··· 599 597 } 600 598 recvd = req->rq_rcv_buf.len - hdrlen; 601 599 if (recvd < len) { 602 - printk(KERN_WARNING "NFS: server cheating in readlink reply: " 600 + dprintk("NFS: server cheating in readlink reply: " 603 601 "count %u > recvd %u\n", len, recvd); 604 602 return -EIO; 605 603 } ··· 697 695 if (nfs_errtbl[i].stat == stat) 698 696 return nfs_errtbl[i].errno; 699 697 } 700 - printk(KERN_ERR "nfs_stat_to_errno: bad nfs status return value: %d\n", stat); 698 + dprintk("nfs_stat_to_errno: bad nfs status return value: %d\n", stat); 701 699 return nfs_errtbl[i].errno; 702 700 } 703 701
-2
fs/nfs/nfs3acl.c
··· 317 317 } 318 318 319 319 dprintk("NFS call setacl\n"); 320 - nfs_begin_data_update(inode); 321 320 msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; 322 321 status = rpc_call_sync(server->client_acl, &msg, 0); 323 322 spin_lock(&inode->i_lock); 324 323 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; 325 324 spin_unlock(&inode->i_lock); 326 - nfs_end_data_update(inode); 327 325 dprintk("NFS reply setacl: %d\n", status); 328 326 329 327 /* pages may have been allocated at the xdr layer. */
+9 -8
fs/nfs/nfs3proc.c
··· 166 166 nfs_fattr_init(&dir_attr); 167 167 nfs_fattr_init(fattr); 168 168 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 169 + nfs_refresh_inode(dir, &dir_attr); 169 170 if (status >= 0 && !(fattr->valid & NFS_ATTR_FATTR)) { 170 171 msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR]; 171 172 msg.rpc_argp = fhandle; ··· 174 173 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 175 174 } 176 175 dprintk("NFS reply lookup: %d\n", status); 177 - if (status >= 0) 178 - status = nfs_refresh_inode(dir, &dir_attr); 179 176 return status; 180 177 } 181 178 ··· 606 607 607 608 nfs_fattr_init(&dir_attr); 608 609 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 610 + 611 + nfs_invalidate_atime(dir); 612 + 609 613 nfs_refresh_inode(dir, &dir_attr); 610 614 dprintk("NFS reply readdir: %d\n", status); 611 615 return status; ··· 726 724 { 727 725 if (nfs3_async_handle_jukebox(task, data->inode)) 728 726 return -EAGAIN; 729 - /* Call back common NFS readpage processing */ 730 - if (task->tk_status >= 0) 731 - nfs_refresh_inode(data->inode, &data->fattr); 727 + 728 + nfs_invalidate_atime(data->inode); 729 + nfs_refresh_inode(data->inode, &data->fattr); 732 730 return 0; 733 731 } 734 732 ··· 749 747 if (nfs3_async_handle_jukebox(task, data->inode)) 750 748 return -EAGAIN; 751 749 if (task->tk_status >= 0) 752 - nfs_post_op_update_inode(data->inode, data->res.fattr); 750 + nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); 753 751 return 0; 754 752 } 755 753 ··· 777 775 { 778 776 if (nfs3_async_handle_jukebox(task, data->inode)) 779 777 return -EAGAIN; 780 - if (task->tk_status >= 0) 781 - nfs_post_op_update_inode(data->inode, data->res.fattr); 778 + nfs_refresh_inode(data->inode, data->res.fattr); 782 779 return 0; 783 780 } 784 781
+14 -11
fs/nfs/nfs3xdr.c
··· 346 346 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS3_readres_sz) << 2; 347 347 xdr_inline_pages(&req->rq_rcv_buf, replen, 348 348 args->pages, args->pgbase, count); 349 + req->rq_rcv_buf.flags |= XDRBUF_READ; 349 350 return 0; 350 351 } 351 352 ··· 368 367 369 368 /* Copy the page array */ 370 369 xdr_encode_pages(sndbuf, args->pages, args->pgbase, count); 370 + sndbuf->flags |= XDRBUF_WRITE; 371 371 return 0; 372 372 } 373 373 ··· 526 524 527 525 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 528 526 if (iov->iov_len < hdrlen) { 529 - printk(KERN_WARNING "NFS: READDIR reply header overflowed:" 527 + dprintk("NFS: READDIR reply header overflowed:" 530 528 "length %d > %Zu\n", hdrlen, iov->iov_len); 531 529 return -errno_NFSERR_IO; 532 530 } else if (iov->iov_len != hdrlen) { ··· 549 547 len = ntohl(*p++); /* string length */ 550 548 p += XDR_QUADLEN(len) + 2; /* name + cookie */ 551 549 if (len > NFS3_MAXNAMLEN) { 552 - printk(KERN_WARNING "NFS: giant filename in readdir (len %x)!\n", 550 + dprintk("NFS: giant filename in readdir (len %x)!\n", 553 551 len); 554 552 goto err_unmap; 555 553 } ··· 569 567 goto short_pkt; 570 568 len = ntohl(*p++); 571 569 if (len > NFS3_FHSIZE) { 572 - printk(KERN_WARNING "NFS: giant filehandle in " 570 + dprintk("NFS: giant filehandle in " 573 571 "readdir (len %x)!\n", len); 574 572 goto err_unmap; 575 573 } ··· 590 588 entry[0] = entry[1] = 0; 591 589 /* truncate listing ? */ 592 590 if (!nr) { 593 - printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); 591 + dprintk("NFS: readdir reply truncated!\n"); 594 592 entry[1] = 1; 595 593 } 596 594 goto out; ··· 828 826 /* Convert length of symlink */ 829 827 len = ntohl(*p++); 830 828 if (len >= rcvbuf->page_len || len <= 0) { 831 - dprintk(KERN_WARNING "nfs: server returned giant symlink!\n"); 829 + dprintk("nfs: server returned giant symlink!\n"); 832 830 return -ENAMETOOLONG; 833 831 } 834 832 835 833 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 836 834 if (iov->iov_len < hdrlen) { 837 - printk(KERN_WARNING "NFS: READLINK reply header overflowed:" 835 + dprintk("NFS: READLINK reply header overflowed:" 838 836 "length %d > %Zu\n", hdrlen, iov->iov_len); 839 837 return -errno_NFSERR_IO; 840 838 } else if (iov->iov_len != hdrlen) { 841 - dprintk("NFS: READLINK header is short. iovec will be shifted.\n"); 839 + dprintk("NFS: READLINK header is short. 
" 840 + "iovec will be shifted.\n"); 842 841 xdr_shift_buf(rcvbuf, iov->iov_len - hdrlen); 843 842 } 844 843 recvd = req->rq_rcv_buf.len - hdrlen; 845 844 if (recvd < len) { 846 - printk(KERN_WARNING "NFS: server cheating in readlink reply: " 845 + dprintk("NFS: server cheating in readlink reply: " 847 846 "count %u > recvd %u\n", len, recvd); 848 847 return -EIO; 849 848 } ··· 879 876 ocount = ntohl(*p++); 880 877 881 878 if (ocount != count) { 882 - printk(KERN_WARNING "NFS: READ count doesn't match RPC opaque count.\n"); 879 + dprintk("NFS: READ count doesn't match RPC opaque count.\n"); 883 880 return -errno_NFSERR_IO; 884 881 } 885 882 886 883 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 887 884 if (iov->iov_len < hdrlen) { 888 - printk(KERN_WARNING "NFS: READ reply header overflowed:" 885 + dprintk("NFS: READ reply header overflowed:" 889 886 "length %d > %Zu\n", hdrlen, iov->iov_len); 890 887 return -errno_NFSERR_IO; 891 888 } else if (iov->iov_len != hdrlen) { ··· 895 892 896 893 recvd = req->rq_rcv_buf.len - hdrlen; 897 894 if (count > recvd) { 898 - printk(KERN_WARNING "NFS: server cheating in read reply: " 895 + dprintk("NFS: server cheating in read reply: " 899 896 "count %d > recvd %d\n", count, recvd); 900 897 count = recvd; 901 898 res->eof = 0;
+36 -49
fs/nfs/nfs4proc.c
··· 62 62 static int _nfs4_proc_open(struct nfs4_opendata *data); 63 63 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 64 64 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); 65 - static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); 66 65 static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); 67 66 static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp); 68 - static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags); 69 67 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 70 68 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 71 69 ··· 175 177 *p++ = xdr_one; /* bitmap length */ 176 178 *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ 177 179 *p++ = htonl(8); /* attribute buffer length */ 178 - p = xdr_encode_hyper(p, dentry->d_inode->i_ino); 180 + p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode)); 179 181 } 180 182 181 183 *p++ = xdr_one; /* next */ ··· 187 189 *p++ = xdr_one; /* bitmap length */ 188 190 *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ 189 191 *p++ = htonl(8); /* attribute buffer length */ 190 - p = xdr_encode_hyper(p, dentry->d_parent->d_inode->i_ino); 192 + p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode)); 191 193 192 194 readdir->pgbase = (char *)p - (char *)start; 193 195 readdir->count -= readdir->pgbase; ··· 209 211 210 212 spin_lock(&dir->i_lock); 211 213 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE|NFS_INO_INVALID_DATA; 212 - if (cinfo->before == nfsi->change_attr && cinfo->atomic) 213 - nfsi->change_attr = cinfo->after; 214 + if (!cinfo->atomic || cinfo->before != nfsi->change_attr) 215 + nfsi->cache_change_attribute = jiffies; 216 + nfsi->change_attr = cinfo->after; 214 217 spin_unlock(&dir->i_lock); 215 218 } 216 219 ··· 453 454 memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data)); 454 455 rcu_read_unlock(); 455 456 lock_kernel(); 456 - ret = _nfs4_do_access(state->inode, state->owner->so_cred, open_mode); 457 + ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode); 457 458 unlock_kernel(); 458 459 if (ret != 0) 459 460 goto out; ··· 947 948 return 0; 948 949 } 949 950 950 - static int _nfs4_do_access(struct inode *inode, struct rpc_cred *cred, int openflags) 951 - { 952 - struct nfs_access_entry cache; 953 - int mask = 0; 954 - int status; 955 - 956 - if (openflags & FMODE_READ) 957 - mask |= MAY_READ; 958 - if (openflags & FMODE_WRITE) 959 - mask |= MAY_WRITE; 960 - if (openflags & FMODE_EXEC) 961 - mask |= MAY_EXEC; 962 - status = nfs_access_get_cached(inode, cred, &cache); 963 - if (status == 0) 964 - goto out; 965 - 966 - /* Be clever: ask server to check for all possible rights */ 967 - cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; 968 - cache.cred = cred; 969 - cache.jiffies = jiffies; 970 - status = _nfs4_proc_access(inode, &cache); 971 - if (status != 0) 972 - return status; 973 - nfs_access_add_cache(inode, &cache); 974 - out: 975 - if ((cache.mask & mask) == mask) 976 - return 0; 977 - return -EACCES; 978 - } 979 - 980 951 static int nfs4_recover_expired_lease(struct nfs_server *server) 981 952 { 982 953 struct nfs_client *clp = server->nfs_client; ··· 1350 1381 1351 1382 /* If the open_intent is for execute, we have an extra check to make */ 
1352 1383 if (nd->intent.open.flags & FMODE_EXEC) { 1353 - ret = _nfs4_do_access(state->inode, 1384 + ret = nfs_may_open(state->inode, 1354 1385 state->owner->so_cred, 1355 1386 nd->intent.open.flags); 1356 1387 if (ret < 0) ··· 1359 1390 filp = lookup_instantiate_filp(nd, path->dentry, NULL); 1360 1391 if (!IS_ERR(filp)) { 1361 1392 struct nfs_open_context *ctx; 1362 - ctx = (struct nfs_open_context *)filp->private_data; 1393 + ctx = nfs_file_open_context(filp); 1363 1394 ctx->state = state; 1364 1395 return 0; 1365 1396 } ··· 1397 1428 state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred); 1398 1429 put_rpccred(cred); 1399 1430 if (IS_ERR(state)) { 1400 - if (PTR_ERR(state) == -ENOENT) 1431 + if (PTR_ERR(state) == -ENOENT) { 1401 1432 d_add(dentry, NULL); 1433 + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1434 + } 1402 1435 return (struct dentry *)state; 1403 1436 } 1404 1437 res = d_add_unique(dentry, igrab(state->inode)); 1405 1438 if (res != NULL) 1406 1439 path.dentry = res; 1440 + nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir)); 1407 1441 nfs4_intent_set_file(nd, &path, state); 1408 1442 return res; 1409 1443 } ··· 1440 1468 } 1441 1469 } 1442 1470 if (state->inode == dentry->d_inode) { 1471 + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1443 1472 nfs4_intent_set_file(nd, &path, state); 1444 1473 return 1; 1445 1474 } ··· 1730 1757 1731 1758 static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) 1732 1759 { 1760 + struct nfs_server *server = NFS_SERVER(inode); 1761 + struct nfs_fattr fattr; 1733 1762 struct nfs4_accessargs args = { 1734 1763 .fh = NFS_FH(inode), 1764 + .bitmask = server->attr_bitmask, 1735 1765 }; 1736 - struct nfs4_accessres res = { 0 }; 1766 + struct nfs4_accessres res = { 1767 + .server = server, 1768 + .fattr = &fattr, 1769 + }; 1737 1770 struct rpc_message msg = { 1738 1771 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], 1739 1772 .rpc_argp = &args, ··· 1765 1786 if (mode & MAY_EXEC) 1766 1787 args.access |= NFS4_ACCESS_EXECUTE; 1767 1788 } 1789 + nfs_fattr_init(&fattr); 1768 1790 status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); 1769 1791 if (!status) { 1770 1792 entry->mask = 0; ··· 1775 1795 entry->mask |= MAY_WRITE; 1776 1796 if (res.access & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) 1777 1797 entry->mask |= MAY_EXEC; 1798 + nfs_refresh_inode(inode, &fattr); 1778 1799 } 1779 1800 return status; 1780 1801 } ··· 1881 1900 } 1882 1901 state = nfs4_do_open(dir, &path, flags, sattr, cred); 1883 1902 put_rpccred(cred); 1903 + d_drop(dentry); 1884 1904 if (IS_ERR(state)) { 1885 1905 status = PTR_ERR(state); 1886 1906 goto out; 1887 1907 } 1888 - d_instantiate(dentry, igrab(state->inode)); 1908 + d_add(dentry, igrab(state->inode)); 1909 + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 1889 1910 if (flags & O_EXCL) { 1890 1911 struct nfs_fattr fattr; 1891 1912 status = nfs4_do_setattr(state->inode, &fattr, sattr, state); ··· 2201 2218 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 2202 2219 if (status == 0) 2203 2220 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2221 + 2222 + nfs_invalidate_atime(dir); 2223 + 2204 2224 dprintk("%s: returns %d\n", __FUNCTION__, status); 2205 2225 return status; 2206 2226 } ··· 2400 2414 rpc_restart_call(task); 2401 2415 return -EAGAIN; 2402 2416 } 2417 + 2418 + nfs_invalidate_atime(data->inode); 2403 2419 if (task->tk_status > 0) 2404 2420 renew_lease(server, data->timestamp); 2405 2421 return 0; ··· 2431 2443 
} 2432 2444 if (task->tk_status >= 0) { 2433 2445 renew_lease(NFS_SERVER(inode), data->timestamp); 2434 - nfs_post_op_update_inode(inode, data->res.fattr); 2446 + nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); 2435 2447 } 2436 2448 return 0; 2437 2449 } ··· 2473 2485 rpc_restart_call(task); 2474 2486 return -EAGAIN; 2475 2487 } 2476 - if (task->tk_status >= 0) 2477 - nfs_post_op_update_inode(inode, data->res.fattr); 2488 + nfs_refresh_inode(inode, data->res.fattr); 2478 2489 return 0; 2479 2490 } 2480 2491 ··· 3043 3056 if (status == 0) { 3044 3057 status = data->rpc_status; 3045 3058 if (status == 0) 3046 - nfs_post_op_update_inode(inode, &data->fattr); 3059 + nfs_refresh_inode(inode, &data->fattr); 3047 3060 } 3048 3061 rpc_put_task(task); 3049 3062 return status; ··· 3290 3303 status = -ENOMEM; 3291 3304 if (seqid == NULL) 3292 3305 goto out; 3293 - task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid); 3306 + task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); 3294 3307 status = PTR_ERR(task); 3295 3308 if (IS_ERR(task)) 3296 3309 goto out; ··· 3434 3447 int ret; 3435 3448 3436 3449 dprintk("%s: begin!\n", __FUNCTION__); 3437 - data = nfs4_alloc_lockdata(fl, fl->fl_file->private_data, 3450 + data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), 3438 3451 fl->fl_u.nfs4_fl.owner); 3439 3452 if (data == NULL) 3440 3453 return -ENOMEM; ··· 3560 3573 int status; 3561 3574 3562 3575 /* verify open state */ 3563 - ctx = (struct nfs_open_context *)filp->private_data; 3576 + ctx = nfs_file_open_context(filp); 3564 3577 state = ctx->state; 3565 3578 3566 3579 if (request->fl_start < 0 || request->fl_end < 0)
+1 -1
fs/nfs/nfs4state.c
··· 774 774 for (fl = inode->i_flock; fl != 0; fl = fl->fl_next) { 775 775 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 776 776 continue; 777 - if (((struct nfs_open_context *)fl->fl_file->private_data)->state != state) 777 + if (nfs_file_open_context(fl->fl_file)->state != state) 778 778 continue; 779 779 status = ops->recover_lock(state, fl); 780 780 if (status >= 0)
+43 -29
fs/nfs/nfs4xdr.c
··· 376 376 decode_locku_maxsz) 377 377 #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ 378 378 encode_putfh_maxsz + \ 379 - encode_access_maxsz) 379 + encode_access_maxsz + \ 380 + encode_getattr_maxsz) 380 381 #define NFS4_dec_access_sz (compound_decode_hdr_maxsz + \ 381 382 decode_putfh_maxsz + \ 382 - decode_access_maxsz) 383 + decode_access_maxsz + \ 384 + decode_getattr_maxsz) 383 385 #define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \ 384 386 encode_putfh_maxsz + \ 385 387 encode_getattr_maxsz) ··· 564 562 565 563 #define RESERVE_SPACE(nbytes) do { \ 566 564 p = xdr_reserve_space(xdr, nbytes); \ 567 - if (!p) printk("RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \ 568 565 BUG_ON(!p); \ 569 566 } while (0) 570 567 ··· 629 628 if (iap->ia_valid & ATTR_UID) { 630 629 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name); 631 630 if (owner_namelen < 0) { 632 - printk(KERN_WARNING "nfs: couldn't resolve uid %d to string\n", 633 - iap->ia_uid); 631 + dprintk("nfs: couldn't resolve uid %d to string\n", 632 + iap->ia_uid); 634 633 /* XXX */ 635 634 strcpy(owner_name, "nobody"); 636 635 owner_namelen = sizeof("nobody") - 1; ··· 641 640 if (iap->ia_valid & ATTR_GID) { 642 641 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group); 643 642 if (owner_grouplen < 0) { 644 - printk(KERN_WARNING "nfs4: couldn't resolve gid %d to string\n", 645 - iap->ia_gid); 643 + dprintk("nfs: couldn't resolve gid %d to string\n", 644 + iap->ia_gid); 646 645 strcpy(owner_group, "nobody"); 647 646 owner_grouplen = sizeof("nobody") - 1; 648 647 /* goto out; */ ··· 712 711 * Now we backfill the bitmap and the attribute buffer length. 713 712 */ 714 713 if (len != ((char *)p - (char *)q) + 4) { 715 - printk ("encode_attr: Attr length calculation error! 
%u != %Zu\n", 714 + printk(KERN_ERR "nfs: Attr length error, %u != %Zu\n", 716 715 len, ((char *)p - (char *)q) + 4); 717 716 BUG(); 718 717 } ··· 1377 1376 { 1378 1377 struct xdr_stream xdr; 1379 1378 struct compound_hdr hdr = { 1380 - .nops = 2, 1379 + .nops = 3, 1381 1380 }; 1382 1381 int status; 1383 1382 1384 1383 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 1385 1384 encode_compound_hdr(&xdr, &hdr); 1386 - if ((status = encode_putfh(&xdr, args->fh)) == 0) 1387 - status = encode_access(&xdr, args->access); 1385 + status = encode_putfh(&xdr, args->fh); 1386 + if (status != 0) 1387 + goto out; 1388 + status = encode_access(&xdr, args->access); 1389 + if (status != 0) 1390 + goto out; 1391 + status = encode_getfattr(&xdr, args->bitmask); 1392 + out: 1388 1393 return status; 1389 1394 } 1390 1395 ··· 1864 1857 replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_read_sz) << 2; 1865 1858 xdr_inline_pages(&req->rq_rcv_buf, replen, 1866 1859 args->pages, args->pgbase, args->count); 1860 + req->rq_rcv_buf.flags |= XDRBUF_READ; 1867 1861 out: 1868 1862 return status; 1869 1863 } ··· 1941 1933 status = encode_write(&xdr, args); 1942 1934 if (status) 1943 1935 goto out; 1936 + req->rq_snd_buf.flags |= XDRBUF_WRITE; 1944 1937 status = encode_getfattr(&xdr, args->bitmask); 1945 1938 out: 1946 1939 return status; ··· 2189 2180 #define READ_BUF(nbytes) do { \ 2190 2181 p = xdr_inline_decode(xdr, nbytes); \ 2191 2182 if (unlikely(!p)) { \ 2192 - printk(KERN_INFO "%s: prematurely hit end of receive" \ 2183 + dprintk("nfs: %s: prematurely hit end of receive" \ 2193 2184 " buffer\n", __FUNCTION__); \ 2194 - printk(KERN_INFO "%s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ 2185 + dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ 2195 2186 __FUNCTION__, xdr->p, nbytes, xdr->end); \ 2196 2187 return -EIO; \ 2197 2188 } \ ··· 2232 2223 READ_BUF(8); 2233 2224 READ32(opnum); 2234 2225 if (opnum != expected) { 2235 - printk(KERN_NOTICE 2236 - "nfs4_decode_op_hdr: Server returned operation" 2237 - " %d but we issued a request for %d\n", 2226 + dprintk("nfs: Server returned operation" 2227 + " %d but we issued a request for %d\n", 2238 2228 opnum, expected); 2239 2229 return -EIO; 2240 2230 } ··· 2766 2758 dprintk("%s: nfs_map_name_to_uid failed!\n", 2767 2759 __FUNCTION__); 2768 2760 } else 2769 - printk(KERN_WARNING "%s: name too long (%u)!\n", 2761 + dprintk("%s: name too long (%u)!\n", 2770 2762 __FUNCTION__, len); 2771 2763 bitmap[1] &= ~FATTR4_WORD1_OWNER; 2772 2764 } ··· 2791 2783 dprintk("%s: nfs_map_group_to_gid failed!\n", 2792 2784 __FUNCTION__); 2793 2785 } else 2794 - printk(KERN_WARNING "%s: name too long (%u)!\n", 2786 + dprintk("%s: name too long (%u)!\n", 2795 2787 __FUNCTION__, len); 2796 2788 bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP; 2797 2789 } ··· 2958 2950 unsigned int nwords = xdr->p - savep; 2959 2951 2960 2952 if (unlikely(attrwords != nwords)) { 2961 - printk(KERN_WARNING "%s: server returned incorrect attribute length: %u %c %u\n", 2953 + dprintk("%s: server returned incorrect attribute length: " 2954 + "%u %c %u\n", 2962 2955 __FUNCTION__, 2963 2956 attrwords << 2, 2964 2957 (attrwords < nwords) ? 
'<' : '>', ··· 3460 3451 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 3461 3452 recvd = req->rq_rcv_buf.len - hdrlen; 3462 3453 if (count > recvd) { 3463 - printk(KERN_WARNING "NFS: server cheating in read reply: " 3454 + dprintk("NFS: server cheating in read reply: " 3464 3455 "count %u > recvd %u\n", count, recvd); 3465 3456 count = recvd; 3466 3457 eof = 0; ··· 3509 3500 p += 2; /* cookie */ 3510 3501 len = ntohl(*p++); /* filename length */ 3511 3502 if (len > NFS4_MAXNAMLEN) { 3512 - printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); 3503 + dprintk("NFS: giant filename in readdir (len 0x%x)\n", 3504 + len); 3513 3505 goto err_unmap; 3514 3506 } 3515 3507 xlen = XDR_QUADLEN(len); ··· 3538 3528 entry[0] = entry[1] = 0; 3539 3529 /* truncate listing ? */ 3540 3530 if (!nr) { 3541 - printk(KERN_NOTICE "NFS: readdir reply truncated!\n"); 3531 + dprintk("NFS: readdir reply truncated!\n"); 3542 3532 entry[1] = 1; 3543 3533 } 3544 3534 goto out; ··· 3564 3554 READ_BUF(4); 3565 3555 READ32(len); 3566 3556 if (len >= rcvbuf->page_len || len <= 0) { 3567 - dprintk(KERN_WARNING "nfs: server returned giant symlink!\n"); 3557 + dprintk("nfs: server returned giant symlink!\n"); 3568 3558 return -ENAMETOOLONG; 3569 3559 } 3570 3560 hdrlen = (char *) xdr->p - (char *) iov->iov_base; 3571 3561 recvd = req->rq_rcv_buf.len - hdrlen; 3572 3562 if (recvd < len) { 3573 - printk(KERN_WARNING "NFS: server cheating in readlink reply: " 3563 + dprintk("NFS: server cheating in readlink reply: " 3574 3564 "count %u > recvd %u\n", len, recvd); 3575 3565 return -EIO; 3576 3566 } ··· 3653 3643 hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; 3654 3644 recvd = req->rq_rcv_buf.len - hdrlen; 3655 3645 if (attrlen > recvd) { 3656 - printk(KERN_WARNING "NFS: server cheating in getattr" 3646 + dprintk("NFS: server cheating in getattr" 3657 3647 " acl reply: attrlen %u > recvd %u\n", 3658 3648 attrlen, recvd); 3659 3649 return -EINVAL; ··· 3698 3688 READ_BUF(8); 3699 3689 READ32(opnum); 3700 3690 if (opnum != OP_SETCLIENTID) { 3701 - printk(KERN_NOTICE 3702 - "nfs4_decode_setclientid: Server returned operation" 3691 + dprintk("nfs: decode_setclientid: Server returned operation" 3703 3692 " %d\n", opnum); 3704 3693 return -EIO; 3705 3694 } ··· 3792 3783 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); 3793 3784 if ((status = decode_compound_hdr(&xdr, &hdr)) != 0) 3794 3785 goto out; 3795 - if ((status = decode_putfh(&xdr)) == 0) 3796 - status = decode_access(&xdr, res); 3786 + status = decode_putfh(&xdr); 3787 + if (status != 0) 3788 + goto out; 3789 + status = decode_access(&xdr, res); 3790 + if (status != 0) 3791 + goto out; 3792 + decode_getfattr(&xdr, res->fattr, res->server); 3797 3793 out: 3798 3794 return status; 3799 3795 }
+2 -1
fs/nfs/nfsroot.c
··· 76 76 #include <linux/fs.h> 77 77 #include <linux/init.h> 78 78 #include <linux/sunrpc/clnt.h> 79 + #include <linux/sunrpc/xprtsock.h> 79 80 #include <linux/nfs.h> 80 81 #include <linux/nfs_fs.h> 81 82 #include <linux/nfs_mount.h> ··· 492 491 struct sockaddr_in sin; 493 492 int status; 494 493 int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ? 495 - IPPROTO_TCP : IPPROTO_UDP; 494 + XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP; 496 495 int version = (nfs_data.flags & NFS_MOUNT_VER3) ? 497 496 NFS_MNT3_VERSION : NFS_MNT_VERSION; 498 497
+4 -1
fs/nfs/proc.c
··· 476 476 dprintk("NFS call readdir %d\n", (unsigned int)cookie); 477 477 status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); 478 478 479 + nfs_invalidate_atime(dir); 480 + 479 481 dprintk("NFS reply readdir: %d\n", status); 480 482 return status; 481 483 } ··· 552 550 553 551 static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data) 554 552 { 553 + nfs_invalidate_atime(data->inode); 555 554 if (task->tk_status >= 0) { 556 555 nfs_refresh_inode(data->inode, data->res.fattr); 557 556 /* Emulate the eof flag, which isn't normally needed in NFSv2 ··· 579 576 static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data) 580 577 { 581 578 if (task->tk_status >= 0) 582 - nfs_post_op_update_inode(data->inode, data->res.fattr); 579 + nfs_post_op_update_inode_force_wcc(data->inode, data->res.fattr); 583 580 return 0; 584 581 } 585 582
+2 -7
fs/nfs/read.c
··· 341 341 set_bit(NFS_INO_STALE, &NFS_FLAGS(data->inode)); 342 342 nfs_mark_for_revalidate(data->inode); 343 343 } 344 - spin_lock(&data->inode->i_lock); 345 - NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME; 346 - spin_unlock(&data->inode->i_lock); 347 344 return 0; 348 345 } 349 346 ··· 494 497 if (ctx == NULL) 495 498 goto out_unlock; 496 499 } else 497 - ctx = get_nfs_open_context((struct nfs_open_context *) 498 - file->private_data); 500 + ctx = get_nfs_open_context(nfs_file_open_context(file)); 499 501 500 502 error = nfs_readpage_async(ctx, inode, page); 501 503 ··· 572 576 if (desc.ctx == NULL) 573 577 return -EBADF; 574 578 } else 575 - desc.ctx = get_nfs_open_context((struct nfs_open_context *) 576 - filp->private_data); 579 + desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); 577 580 if (rsize < PAGE_CACHE_SIZE) 578 581 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 579 582 else
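
Note: several hunks in this series (fs/nfs/read.c above, and nfs4proc.c, nfs4state.c and write.c elsewhere) replace open-coded casts of file->private_data with the nfs_file_open_context() helper. The sketch below shows the accessor pattern in isolation; the struct definitions are simplified stand-ins, not the kernel's.

/*
 * Userspace sketch of the accessor pattern behind nfs_file_open_context().
 * The hunks replace casts of file->private_data with the helper, so the
 * helper is assumed to return the same pointer, just with the type written
 * in one place instead of at every call site.
 */
#include <stdio.h>

struct nfs_open_context { int error; };

struct file { void *private_data; };

static inline struct nfs_open_context *nfs_file_open_context(struct file *filp)
{
	/* same value as the old open-coded casts */
	return filp->private_data;
}

int main(void)
{
	struct nfs_open_context ctx = { .error = 0 };
	struct file f = { .private_data = &ctx };

	printf("error = %d\n", nfs_file_open_context(&f)->error);
	return 0;
}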
+170 -219
fs/nfs/super.c
··· 33 33 #include <linux/sunrpc/clnt.h> 34 34 #include <linux/sunrpc/stats.h> 35 35 #include <linux/sunrpc/metrics.h> 36 + #include <linux/sunrpc/xprtsock.h> 37 + #include <linux/sunrpc/xprtrdma.h> 36 38 #include <linux/nfs_fs.h> 37 39 #include <linux/nfs_mount.h> 38 40 #include <linux/nfs4_mount.h> ··· 60 58 61 59 #define NFSDBG_FACILITY NFSDBG_VFS 62 60 63 - 64 - struct nfs_parsed_mount_data { 65 - int flags; 66 - int rsize, wsize; 67 - int timeo, retrans; 68 - int acregmin, acregmax, 69 - acdirmin, acdirmax; 70 - int namlen; 71 - unsigned int bsize; 72 - unsigned int auth_flavor_len; 73 - rpc_authflavor_t auth_flavors[1]; 74 - char *client_address; 75 - 76 - struct { 77 - struct sockaddr_in address; 78 - unsigned int program; 79 - unsigned int version; 80 - unsigned short port; 81 - int protocol; 82 - } mount_server; 83 - 84 - struct { 85 - struct sockaddr_in address; 86 - char *hostname; 87 - char *export_path; 88 - unsigned int program; 89 - int protocol; 90 - } nfs_server; 91 - }; 92 - 93 61 enum { 94 62 /* Mount options that take no arguments */ 95 63 Opt_soft, Opt_hard, ··· 69 97 Opt_ac, Opt_noac, 70 98 Opt_lock, Opt_nolock, 71 99 Opt_v2, Opt_v3, 72 - Opt_udp, Opt_tcp, 100 + Opt_udp, Opt_tcp, Opt_rdma, 73 101 Opt_acl, Opt_noacl, 74 102 Opt_rdirplus, Opt_nordirplus, 75 103 Opt_sharecache, Opt_nosharecache, ··· 88 116 89 117 /* Mount options that take string arguments */ 90 118 Opt_sec, Opt_proto, Opt_mountproto, 91 - Opt_addr, Opt_mounthost, Opt_clientaddr, 119 + Opt_addr, Opt_mountaddr, Opt_clientaddr, 92 120 93 121 /* Mount options that are ignored */ 94 122 Opt_userspace, Opt_deprecated, ··· 115 143 { Opt_v3, "v3" }, 116 144 { Opt_udp, "udp" }, 117 145 { Opt_tcp, "tcp" }, 146 + { Opt_rdma, "rdma" }, 118 147 { Opt_acl, "acl" }, 119 148 { Opt_noacl, "noacl" }, 120 149 { Opt_rdirplus, "rdirplus" }, ··· 148 175 { Opt_mountproto, "mountproto=%s" }, 149 176 { Opt_addr, "addr=%s" }, 150 177 { Opt_clientaddr, "clientaddr=%s" }, 151 - { Opt_mounthost, "mounthost=%s" }, 178 + { Opt_userspace, "mounthost=%s" }, 179 + { Opt_mountaddr, "mountaddr=%s" }, 152 180 153 181 { Opt_err, NULL } 154 182 }; 155 183 156 184 enum { 157 - Opt_xprt_udp, Opt_xprt_tcp, 185 + Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma, 158 186 159 187 Opt_xprt_err 160 188 }; ··· 163 189 static match_table_t nfs_xprt_protocol_tokens = { 164 190 { Opt_xprt_udp, "udp" }, 165 191 { Opt_xprt_tcp, "tcp" }, 192 + { Opt_xprt_rdma, "rdma" }, 166 193 167 194 { Opt_xprt_err, NULL } 168 195 }; ··· 424 449 const char *nostr; 425 450 } nfs_info[] = { 426 451 { NFS_MOUNT_SOFT, ",soft", ",hard" }, 427 - { NFS_MOUNT_INTR, ",intr", "" }, 452 + { NFS_MOUNT_INTR, ",intr", ",nointr" }, 428 453 { NFS_MOUNT_NOCTO, ",nocto", "" }, 429 454 { NFS_MOUNT_NOAC, ",noac", "" }, 430 455 { NFS_MOUNT_NONLM, ",nolock", "" }, ··· 435 460 }; 436 461 const struct proc_nfs_info *nfs_infop; 437 462 struct nfs_client *clp = nfss->nfs_client; 438 - char buf[12]; 439 - const char *proto; 440 463 441 464 seq_printf(m, ",vers=%d", clp->rpc_ops->version); 442 465 seq_printf(m, ",rsize=%d", nfss->rsize); ··· 453 480 else 454 481 seq_puts(m, nfs_infop->nostr); 455 482 } 456 - switch (nfss->client->cl_xprt->prot) { 457 - case IPPROTO_TCP: 458 - proto = "tcp"; 459 - break; 460 - case IPPROTO_UDP: 461 - proto = "udp"; 462 - break; 463 - default: 464 - snprintf(buf, sizeof(buf), "%u", nfss->client->cl_xprt->prot); 465 - proto = buf; 466 - } 467 - seq_printf(m, ",proto=%s", proto); 483 + seq_printf(m, ",proto=%s", 484 + rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); 468 
485 seq_printf(m, ",timeo=%lu", 10U * clp->retrans_timeo / HZ); 469 486 seq_printf(m, ",retrans=%u", clp->retrans_count); 470 487 seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); ··· 469 506 470 507 nfs_show_mount_options(m, nfss, 0); 471 508 472 - seq_puts(m, ",addr="); 473 - seq_escape(m, nfss->nfs_client->cl_hostname, " \t\n\\"); 509 + seq_printf(m, ",addr="NIPQUAD_FMT, 510 + NIPQUAD(nfss->nfs_client->cl_addr.sin_addr)); 474 511 475 512 return 0; 476 513 } ··· 661 698 break; 662 699 case Opt_udp: 663 700 mnt->flags &= ~NFS_MOUNT_TCP; 664 - mnt->nfs_server.protocol = IPPROTO_UDP; 701 + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 665 702 mnt->timeo = 7; 666 703 mnt->retrans = 5; 667 704 break; 668 705 case Opt_tcp: 669 706 mnt->flags |= NFS_MOUNT_TCP; 670 - mnt->nfs_server.protocol = IPPROTO_TCP; 707 + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 708 + mnt->timeo = 600; 709 + mnt->retrans = 2; 710 + break; 711 + case Opt_rdma: 712 + mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */ 713 + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 671 714 mnt->timeo = 600; 672 715 mnt->retrans = 2; 673 716 break; ··· 882 913 switch (token) { 883 914 case Opt_xprt_udp: 884 915 mnt->flags &= ~NFS_MOUNT_TCP; 885 - mnt->nfs_server.protocol = IPPROTO_UDP; 916 + mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 886 917 mnt->timeo = 7; 887 918 mnt->retrans = 5; 888 919 break; 889 920 case Opt_xprt_tcp: 890 921 mnt->flags |= NFS_MOUNT_TCP; 891 - mnt->nfs_server.protocol = IPPROTO_TCP; 922 + mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 923 + mnt->timeo = 600; 924 + mnt->retrans = 2; 925 + break; 926 + case Opt_xprt_rdma: 927 + /* vector side protocols to TCP */ 928 + mnt->flags |= NFS_MOUNT_TCP; 929 + mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 892 930 mnt->timeo = 600; 893 931 mnt->retrans = 2; 894 932 break; ··· 913 937 914 938 switch (token) { 915 939 case Opt_xprt_udp: 916 - mnt->mount_server.protocol = IPPROTO_UDP; 940 + mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; 917 941 break; 918 942 case Opt_xprt_tcp: 919 - mnt->mount_server.protocol = IPPROTO_TCP; 943 + mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; 920 944 break; 945 + case Opt_xprt_rdma: /* not used for side protocols */ 921 946 default: 922 947 goto out_unrec_xprt; 923 948 } ··· 938 961 goto out_nomem; 939 962 mnt->client_address = string; 940 963 break; 941 - case Opt_mounthost: 964 + case Opt_mountaddr: 942 965 string = match_strdup(args); 943 966 if (string == NULL) 944 967 goto out_nomem; ··· 1004 1027 sin = args->mount_server.address; 1005 1028 else 1006 1029 sin = args->nfs_server.address; 1007 - if (args->mount_server.port == 0) { 1008 - status = rpcb_getport_sync(&sin, 1009 - args->mount_server.program, 1010 - args->mount_server.version, 1011 - args->mount_server.protocol); 1012 - if (status < 0) 1013 - goto out_err; 1014 - sin.sin_port = htons(status); 1015 - } else 1016 - sin.sin_port = htons(args->mount_server.port); 1030 + /* 1031 + * autobind will be used if mount_server.port == 0 1032 + */ 1033 + sin.sin_port = htons(args->mount_server.port); 1017 1034 1018 1035 /* 1019 1036 * Now ask the mount server to map our export path ··· 1020 1049 args->mount_server.version, 1021 1050 args->mount_server.protocol, 1022 1051 root_fh); 1023 - if (status < 0) 1024 - goto out_err; 1052 + if (status == 0) 1053 + return 0; 1025 1054 1026 - return status; 1027 - 1028 - out_err: 1029 - dfprintk(MOUNT, "NFS: unable to contact server on host " 1030 - NIPQUAD_FMT "\n", NIPQUAD(sin.sin_addr.s_addr)); 
1055 + dfprintk(MOUNT, "NFS: unable to mount server " NIPQUAD_FMT 1056 + ", error %d\n", NIPQUAD(sin.sin_addr.s_addr), status); 1031 1057 return status; 1032 1058 } 1033 1059 ··· 1047 1079 * XXX: as far as I can tell, changing the NFS program number is not 1048 1080 * supported in the NFS client. 1049 1081 */ 1050 - static int nfs_validate_mount_data(struct nfs_mount_data **options, 1082 + static int nfs_validate_mount_data(void *options, 1083 + struct nfs_parsed_mount_data *args, 1051 1084 struct nfs_fh *mntfh, 1052 1085 const char *dev_name) 1053 1086 { 1054 - struct nfs_mount_data *data = *options; 1087 + struct nfs_mount_data *data = (struct nfs_mount_data *)options; 1055 1088 1056 1089 if (data == NULL) 1057 1090 goto out_no_data; 1091 + 1092 + memset(args, 0, sizeof(*args)); 1093 + args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP); 1094 + args->rsize = NFS_MAX_FILE_IO_SIZE; 1095 + args->wsize = NFS_MAX_FILE_IO_SIZE; 1096 + args->timeo = 600; 1097 + args->retrans = 2; 1098 + args->acregmin = 3; 1099 + args->acregmax = 60; 1100 + args->acdirmin = 30; 1101 + args->acdirmax = 60; 1102 + args->mount_server.protocol = XPRT_TRANSPORT_UDP; 1103 + args->mount_server.program = NFS_MNT_PROGRAM; 1104 + args->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1105 + args->nfs_server.program = NFS_PROGRAM; 1058 1106 1059 1107 switch (data->version) { 1060 1108 case 1: ··· 1100 1116 if (mntfh->size < sizeof(mntfh->data)) 1101 1117 memset(mntfh->data + mntfh->size, 0, 1102 1118 sizeof(mntfh->data) - mntfh->size); 1119 + 1120 + if (!nfs_verify_server_address((struct sockaddr *) &data->addr)) 1121 + goto out_no_address; 1122 + 1123 + /* 1124 + * Translate to nfs_parsed_mount_data, which nfs_fill_super 1125 + * can deal with. 1126 + */ 1127 + args->flags = data->flags; 1128 + args->rsize = data->rsize; 1129 + args->wsize = data->wsize; 1130 + args->flags = data->flags; 1131 + args->timeo = data->timeo; 1132 + args->retrans = data->retrans; 1133 + args->acregmin = data->acregmin; 1134 + args->acregmax = data->acregmax; 1135 + args->acdirmin = data->acdirmin; 1136 + args->acdirmax = data->acdirmax; 1137 + args->nfs_server.address = data->addr; 1138 + if (!(data->flags & NFS_MOUNT_TCP)) 1139 + args->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1140 + /* N.B. 
caller will free nfs_server.hostname in all cases */ 1141 + args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); 1142 + args->namlen = data->namlen; 1143 + args->bsize = data->bsize; 1144 + args->auth_flavors[0] = data->pseudoflavor; 1103 1145 break; 1104 1146 default: { 1105 1147 unsigned int len; 1106 1148 char *c; 1107 1149 int status; 1108 - struct nfs_parsed_mount_data args = { 1109 - .flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP), 1110 - .rsize = NFS_MAX_FILE_IO_SIZE, 1111 - .wsize = NFS_MAX_FILE_IO_SIZE, 1112 - .timeo = 600, 1113 - .retrans = 2, 1114 - .acregmin = 3, 1115 - .acregmax = 60, 1116 - .acdirmin = 30, 1117 - .acdirmax = 60, 1118 - .mount_server.protocol = IPPROTO_UDP, 1119 - .mount_server.program = NFS_MNT_PROGRAM, 1120 - .nfs_server.protocol = IPPROTO_TCP, 1121 - .nfs_server.program = NFS_PROGRAM, 1122 - }; 1123 1150 1124 - if (nfs_parse_mount_options((char *) *options, &args) == 0) 1151 + if (nfs_parse_mount_options((char *)options, args) == 0) 1125 1152 return -EINVAL; 1126 1153 1127 - data = kzalloc(sizeof(*data), GFP_KERNEL); 1128 - if (data == NULL) 1129 - return -ENOMEM; 1130 - 1131 - /* 1132 - * NB: after this point, caller will free "data" 1133 - * if we return an error 1134 - */ 1135 - *options = data; 1154 + if (!nfs_verify_server_address((struct sockaddr *) 1155 + &args->nfs_server.address)) 1156 + goto out_no_address; 1136 1157 1137 1158 c = strchr(dev_name, ':'); 1138 1159 if (c == NULL) 1139 1160 return -EINVAL; 1140 1161 len = c - dev_name; 1141 - if (len > sizeof(data->hostname)) 1142 - return -ENAMETOOLONG; 1143 - strncpy(data->hostname, dev_name, len); 1144 - args.nfs_server.hostname = data->hostname; 1162 + /* N.B. caller will free nfs_server.hostname in all cases */ 1163 + args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); 1145 1164 1146 1165 c++; 1147 1166 if (strlen(c) > NFS_MAXPATHLEN) 1148 1167 return -ENAMETOOLONG; 1149 - args.nfs_server.export_path = c; 1168 + args->nfs_server.export_path = c; 1150 1169 1151 - status = nfs_try_mount(&args, mntfh); 1170 + status = nfs_try_mount(args, mntfh); 1152 1171 if (status) 1153 1172 return status; 1154 - 1155 - /* 1156 - * Translate to nfs_mount_data, which nfs_fill_super 1157 - * can deal with. 
1158 - */ 1159 - data->version = 6; 1160 - data->flags = args.flags; 1161 - data->rsize = args.rsize; 1162 - data->wsize = args.wsize; 1163 - data->timeo = args.timeo; 1164 - data->retrans = args.retrans; 1165 - data->acregmin = args.acregmin; 1166 - data->acregmax = args.acregmax; 1167 - data->acdirmin = args.acdirmin; 1168 - data->acdirmax = args.acdirmax; 1169 - data->addr = args.nfs_server.address; 1170 - data->namlen = args.namlen; 1171 - data->bsize = args.bsize; 1172 - data->pseudoflavor = args.auth_flavors[0]; 1173 1173 1174 1174 break; 1175 1175 } 1176 1176 } 1177 1177 1178 - if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) 1179 - data->pseudoflavor = RPC_AUTH_UNIX; 1178 + if (!(args->flags & NFS_MOUNT_SECFLAVOUR)) 1179 + args->auth_flavors[0] = RPC_AUTH_UNIX; 1180 1180 1181 1181 #ifndef CONFIG_NFS_V3 1182 - if (data->flags & NFS_MOUNT_VER3) 1182 + if (args->flags & NFS_MOUNT_VER3) 1183 1183 goto out_v3_not_compiled; 1184 1184 #endif /* !CONFIG_NFS_V3 */ 1185 - 1186 - if (!nfs_verify_server_address((struct sockaddr *) &data->addr)) 1187 - goto out_no_address; 1188 1185 1189 1186 return 0; 1190 1187 ··· 1223 1258 /* 1224 1259 * Finish setting up an NFS2/3 superblock 1225 1260 */ 1226 - static void nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data) 1261 + static void nfs_fill_super(struct super_block *sb, 1262 + struct nfs_parsed_mount_data *data) 1227 1263 { 1228 1264 struct nfs_server *server = NFS_SB(sb); 1229 1265 ··· 1345 1379 struct nfs_server *server = NULL; 1346 1380 struct super_block *s; 1347 1381 struct nfs_fh mntfh; 1348 - struct nfs_mount_data *data = raw_data; 1382 + struct nfs_parsed_mount_data data; 1349 1383 struct dentry *mntroot; 1350 1384 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 1351 1385 struct nfs_sb_mountdata sb_mntdata = { ··· 1354 1388 int error; 1355 1389 1356 1390 /* Validate the mount data */ 1357 - error = nfs_validate_mount_data(&data, &mntfh, dev_name); 1391 + error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name); 1358 1392 if (error < 0) 1359 1393 goto out; 1360 1394 1361 1395 /* Get a volume representation */ 1362 - server = nfs_create_server(data, &mntfh); 1396 + server = nfs_create_server(&data, &mntfh); 1363 1397 if (IS_ERR(server)) { 1364 1398 error = PTR_ERR(server); 1365 1399 goto out; ··· 1383 1417 1384 1418 if (!s->s_root) { 1385 1419 /* initial superblock/root creation */ 1386 - nfs_fill_super(s, data); 1420 + nfs_fill_super(s, &data); 1387 1421 } 1388 1422 1389 1423 mntroot = nfs_get_root(s, &mntfh); ··· 1398 1432 error = 0; 1399 1433 1400 1434 out: 1401 - if (data != raw_data) 1402 - kfree(data); 1435 + kfree(data.nfs_server.hostname); 1403 1436 return error; 1404 1437 1405 1438 out_err_nosb: ··· 1524 1559 /* 1525 1560 * Validate NFSv4 mount options 1526 1561 */ 1527 - static int nfs4_validate_mount_data(struct nfs4_mount_data **options, 1528 - const char *dev_name, 1529 - struct sockaddr_in *addr, 1530 - rpc_authflavor_t *authflavour, 1531 - char **hostname, 1532 - char **mntpath, 1533 - char **ip_addr) 1562 + static int nfs4_validate_mount_data(void *options, 1563 + struct nfs_parsed_mount_data *args, 1564 + const char *dev_name) 1534 1565 { 1535 - struct nfs4_mount_data *data = *options; 1566 + struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; 1536 1567 char *c; 1537 1568 1538 1569 if (data == NULL) 1539 1570 goto out_no_data; 1540 1571 1572 + memset(args, 0, sizeof(*args)); 1573 + args->rsize = NFS_MAX_FILE_IO_SIZE; 1574 + args->wsize = NFS_MAX_FILE_IO_SIZE; 1575 + 
args->timeo = 600; 1576 + args->retrans = 2; 1577 + args->acregmin = 3; 1578 + args->acregmax = 60; 1579 + args->acdirmin = 30; 1580 + args->acdirmax = 60; 1581 + args->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1582 + 1541 1583 switch (data->version) { 1542 1584 case 1: 1543 - if (data->host_addrlen != sizeof(*addr)) 1585 + if (data->host_addrlen != sizeof(args->nfs_server.address)) 1544 1586 goto out_no_address; 1545 - if (copy_from_user(addr, data->host_addr, sizeof(*addr))) 1587 + if (copy_from_user(&args->nfs_server.address, 1588 + data->host_addr, 1589 + sizeof(args->nfs_server.address))) 1546 1590 return -EFAULT; 1547 - if (addr->sin_port == 0) 1548 - addr->sin_port = htons(NFS_PORT); 1549 - if (!nfs_verify_server_address((struct sockaddr *) addr)) 1591 + if (args->nfs_server.address.sin_port == 0) 1592 + args->nfs_server.address.sin_port = htons(NFS_PORT); 1593 + if (!nfs_verify_server_address((struct sockaddr *) 1594 + &args->nfs_server.address)) 1550 1595 goto out_no_address; 1551 1596 1552 1597 switch (data->auth_flavourlen) { 1553 1598 case 0: 1554 - *authflavour = RPC_AUTH_UNIX; 1599 + args->auth_flavors[0] = RPC_AUTH_UNIX; 1555 1600 break; 1556 1601 case 1: 1557 - if (copy_from_user(authflavour, data->auth_flavours, 1558 - sizeof(*authflavour))) 1602 + if (copy_from_user(&args->auth_flavors[0], 1603 + data->auth_flavours, 1604 + sizeof(args->auth_flavors[0]))) 1559 1605 return -EFAULT; 1560 1606 break; 1561 1607 default: ··· 1576 1600 c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); 1577 1601 if (IS_ERR(c)) 1578 1602 return PTR_ERR(c); 1579 - *hostname = c; 1603 + args->nfs_server.hostname = c; 1580 1604 1581 1605 c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); 1582 1606 if (IS_ERR(c)) 1583 1607 return PTR_ERR(c); 1584 - *mntpath = c; 1585 - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *mntpath); 1608 + args->nfs_server.export_path = c; 1609 + dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); 1586 1610 1587 1611 c = strndup_user(data->client_addr.data, 16); 1588 1612 if (IS_ERR(c)) 1589 1613 return PTR_ERR(c); 1590 - *ip_addr = c; 1614 + args->client_address = c; 1615 + 1616 + /* 1617 + * Translate to nfs_parsed_mount_data, which nfs4_fill_super 1618 + * can deal with. 
1619 + */ 1620 + 1621 + args->flags = data->flags & NFS4_MOUNT_FLAGMASK; 1622 + args->rsize = data->rsize; 1623 + args->wsize = data->wsize; 1624 + args->timeo = data->timeo; 1625 + args->retrans = data->retrans; 1626 + args->acregmin = data->acregmin; 1627 + args->acregmax = data->acregmax; 1628 + args->acdirmin = data->acdirmin; 1629 + args->acdirmax = data->acdirmax; 1630 + args->nfs_server.protocol = data->proto; 1591 1631 1592 1632 break; 1593 1633 default: { 1594 1634 unsigned int len; 1595 - struct nfs_parsed_mount_data args = { 1596 - .rsize = NFS_MAX_FILE_IO_SIZE, 1597 - .wsize = NFS_MAX_FILE_IO_SIZE, 1598 - .timeo = 600, 1599 - .retrans = 2, 1600 - .acregmin = 3, 1601 - .acregmax = 60, 1602 - .acdirmin = 30, 1603 - .acdirmax = 60, 1604 - .nfs_server.protocol = IPPROTO_TCP, 1605 - }; 1606 1635 1607 - if (nfs_parse_mount_options((char *) *options, &args) == 0) 1636 + if (nfs_parse_mount_options((char *)options, args) == 0) 1608 1637 return -EINVAL; 1609 1638 1610 1639 if (!nfs_verify_server_address((struct sockaddr *) 1611 - &args.nfs_server.address)) 1640 + &args->nfs_server.address)) 1612 1641 return -EINVAL; 1613 - *addr = args.nfs_server.address; 1614 1642 1615 - switch (args.auth_flavor_len) { 1643 + switch (args->auth_flavor_len) { 1616 1644 case 0: 1617 - *authflavour = RPC_AUTH_UNIX; 1645 + args->auth_flavors[0] = RPC_AUTH_UNIX; 1618 1646 break; 1619 1647 case 1: 1620 - *authflavour = (rpc_authflavor_t) args.auth_flavors[0]; 1621 1648 break; 1622 1649 default: 1623 1650 goto out_inval_auth; 1624 1651 } 1625 - 1626 - /* 1627 - * Translate to nfs4_mount_data, which nfs4_fill_super 1628 - * can deal with. 1629 - */ 1630 - data = kzalloc(sizeof(*data), GFP_KERNEL); 1631 - if (data == NULL) 1632 - return -ENOMEM; 1633 - *options = data; 1634 - 1635 - data->version = 1; 1636 - data->flags = args.flags & NFS4_MOUNT_FLAGMASK; 1637 - data->rsize = args.rsize; 1638 - data->wsize = args.wsize; 1639 - data->timeo = args.timeo; 1640 - data->retrans = args.retrans; 1641 - data->acregmin = args.acregmin; 1642 - data->acregmax = args.acregmax; 1643 - data->acdirmin = args.acdirmin; 1644 - data->acdirmax = args.acdirmax; 1645 - data->proto = args.nfs_server.protocol; 1646 1652 1647 1653 /* 1648 1654 * Split "dev_name" into "hostname:mntpath". 
··· 1636 1678 len = c - dev_name; 1637 1679 if (len > NFS4_MAXNAMLEN) 1638 1680 return -ENAMETOOLONG; 1639 - *hostname = kzalloc(len, GFP_KERNEL); 1640 - if (*hostname == NULL) 1681 + args->nfs_server.hostname = kzalloc(len, GFP_KERNEL); 1682 + if (args->nfs_server.hostname == NULL) 1641 1683 return -ENOMEM; 1642 - strncpy(*hostname, dev_name, len - 1); 1684 + strncpy(args->nfs_server.hostname, dev_name, len - 1); 1643 1685 1644 1686 c++; /* step over the ':' */ 1645 1687 len = strlen(c); 1646 1688 if (len > NFS4_MAXPATHLEN) 1647 1689 return -ENAMETOOLONG; 1648 - *mntpath = kzalloc(len + 1, GFP_KERNEL); 1649 - if (*mntpath == NULL) 1690 + args->nfs_server.export_path = kzalloc(len + 1, GFP_KERNEL); 1691 + if (args->nfs_server.export_path == NULL) 1650 1692 return -ENOMEM; 1651 - strncpy(*mntpath, c, len); 1693 + strncpy(args->nfs_server.export_path, c, len); 1652 1694 1653 - dprintk("MNTPATH: %s\n", *mntpath); 1695 + dprintk("MNTPATH: %s\n", args->nfs_server.export_path); 1654 1696 1655 - if (args.client_address == NULL) 1697 + if (args->client_address == NULL) 1656 1698 goto out_no_client_address; 1657 - 1658 - *ip_addr = args.client_address; 1659 1699 1660 1700 break; 1661 1701 } ··· 1685 1729 static int nfs4_get_sb(struct file_system_type *fs_type, 1686 1730 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 1687 1731 { 1688 - struct nfs4_mount_data *data = raw_data; 1732 + struct nfs_parsed_mount_data data; 1689 1733 struct super_block *s; 1690 1734 struct nfs_server *server; 1691 - struct sockaddr_in addr; 1692 - rpc_authflavor_t authflavour; 1693 1735 struct nfs_fh mntfh; 1694 1736 struct dentry *mntroot; 1695 - char *mntpath = NULL, *hostname = NULL, *ip_addr = NULL; 1696 1737 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 1697 1738 struct nfs_sb_mountdata sb_mntdata = { 1698 1739 .mntflags = flags, ··· 1697 1744 int error; 1698 1745 1699 1746 /* Validate the mount data */ 1700 - error = nfs4_validate_mount_data(&data, dev_name, &addr, &authflavour, 1701 - &hostname, &mntpath, &ip_addr); 1747 + error = nfs4_validate_mount_data(raw_data, &data, dev_name); 1702 1748 if (error < 0) 1703 1749 goto out; 1704 1750 1705 1751 /* Get a volume representation */ 1706 - server = nfs4_create_server(data, hostname, &addr, mntpath, ip_addr, 1707 - authflavour, &mntfh); 1752 + server = nfs4_create_server(&data, &mntfh); 1708 1753 if (IS_ERR(server)) { 1709 1754 error = PTR_ERR(server); 1710 1755 goto out; ··· 1741 1790 error = 0; 1742 1791 1743 1792 out: 1744 - kfree(ip_addr); 1745 - kfree(mntpath); 1746 - kfree(hostname); 1793 + kfree(data.client_address); 1794 + kfree(data.nfs_server.export_path); 1795 + kfree(data.nfs_server.hostname); 1747 1796 return error; 1748 1797 1749 1798 out_free:
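
Note: the fs/nfs/super.c changes above add an "rdma" keyword alongside "udp" and "tcp" and map each keyword to an XPRT_TRANSPORT_* constant. Here is a self-contained sketch of that keyword-to-transport lookup; the enum values are placeholders rather than the kernel's xprtsock.h / xprtrdma.h definitions.

/*
 * Minimal userspace sketch of the "proto=" keyword matching that the
 * super.c hunk extends with an "rdma" arm.  Constants are placeholders.
 */
#include <stdio.h>
#include <string.h>

enum xprt_transport { TRANSPORT_UDP, TRANSPORT_TCP, TRANSPORT_RDMA, TRANSPORT_ERR };

static const struct { const char *name; enum xprt_transport id; } xprt_tokens[] = {
	{ "udp",  TRANSPORT_UDP  },
	{ "tcp",  TRANSPORT_TCP  },
	{ "rdma", TRANSPORT_RDMA },
};

static enum xprt_transport match_proto(const char *arg)
{
	size_t i;

	for (i = 0; i < sizeof(xprt_tokens) / sizeof(xprt_tokens[0]); i++)
		if (strcmp(arg, xprt_tokens[i].name) == 0)
			return xprt_tokens[i].id;
	return TRANSPORT_ERR;	/* unrecognised transport keyword */
}

int main(void)
{
	printf("proto=rdma -> %d\n", match_proto("rdma"));
	printf("proto=foo  -> %d\n", match_proto("foo"));
	return 0;
}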
-3
fs/nfs/unlink.c
··· 66 66 .rpc_cred = data->cred, 67 67 }; 68 68 69 - nfs_begin_data_update(dir); 70 69 NFS_PROTO(dir)->unlink_setup(&msg, dir); 71 70 rpc_call_setup(task, &msg, 0); 72 71 } ··· 83 84 84 85 if (!NFS_PROTO(dir)->unlink_done(task, dir)) 85 86 rpc_restart_call(task); 86 - else 87 - nfs_end_data_update(dir); 88 87 } 89 88 90 89 /**
+83 -130
fs/nfs/write.c
··· 110 110 nfs_writedata_free(wdata); 111 111 } 112 112 113 + static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) 114 + { 115 + ctx->error = error; 116 + smp_wmb(); 117 + set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 118 + } 119 + 113 120 static struct nfs_page *nfs_page_find_request_locked(struct page *page) 114 121 { 115 122 struct nfs_page *req = NULL; ··· 250 243 251 244 /* 252 245 * Find an associated nfs write request, and prepare to flush it out 253 - * Returns 1 if there was no write request, or if the request was 254 - * already tagged by nfs_set_page_dirty.Returns 0 if the request 255 - * was not tagged. 256 - * May also return an error if the user signalled nfs_wait_on_request(). 246 + * May return an error if the user signalled nfs_wait_on_request(). 257 247 */ 258 248 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, 259 249 struct page *page) ··· 265 261 req = nfs_page_find_request_locked(page); 266 262 if (req == NULL) { 267 263 spin_unlock(&inode->i_lock); 268 - return 1; 264 + return 0; 269 265 } 270 266 if (nfs_lock_request_dontget(req)) 271 267 break; ··· 286 282 spin_unlock(&inode->i_lock); 287 283 nfs_unlock_request(req); 288 284 nfs_pageio_complete(pgio); 289 - return 1; 285 + return 0; 290 286 } 291 287 if (nfs_set_page_writeback(page) != 0) { 292 288 spin_unlock(&inode->i_lock); ··· 294 290 } 295 291 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, 296 292 NFS_PAGE_TAG_LOCKED); 297 - ret = test_bit(PG_NEED_FLUSH, &req->wb_flags); 298 293 spin_unlock(&inode->i_lock); 299 294 nfs_pageio_add_request(pgio, req); 300 - return ret; 295 + return 0; 296 + } 297 + 298 + static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) 299 + { 300 + struct inode *inode = page->mapping->host; 301 + 302 + nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 303 + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 304 + 305 + nfs_pageio_cond_complete(pgio, page->index); 306 + return nfs_page_async_flush(pgio, page); 301 307 } 302 308 303 309 /* ··· 315 301 */ 316 302 static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) 317 303 { 318 - struct nfs_pageio_descriptor mypgio, *pgio; 319 - struct nfs_open_context *ctx; 320 - struct inode *inode = page->mapping->host; 321 - unsigned offset; 304 + struct nfs_pageio_descriptor pgio; 322 305 int err; 323 306 324 - nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 325 - nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); 326 - 327 - if (wbc->for_writepages) 328 - pgio = wbc->fs_private; 329 - else { 330 - nfs_pageio_init_write(&mypgio, inode, wb_priority(wbc)); 331 - pgio = &mypgio; 332 - } 333 - 334 - nfs_pageio_cond_complete(pgio, page->index); 335 - 336 - err = nfs_page_async_flush(pgio, page); 337 - if (err <= 0) 338 - goto out; 339 - err = 0; 340 - offset = nfs_page_length(page); 341 - if (!offset) 342 - goto out; 343 - 344 - nfs_pageio_cond_complete(pgio, page->index); 345 - 346 - ctx = nfs_find_open_context(inode, NULL, FMODE_WRITE); 347 - if (ctx == NULL) { 348 - err = -EBADF; 349 - goto out; 350 - } 351 - err = nfs_writepage_setup(ctx, page, 0, offset); 352 - put_nfs_open_context(ctx); 353 - if (err != 0) 354 - goto out; 355 - err = nfs_page_async_flush(pgio, page); 356 - if (err > 0) 357 - err = 0; 358 - out: 359 - if (!wbc->for_writepages) 360 - nfs_pageio_complete(pgio); 361 - return err; 307 + nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc)); 308 + err = nfs_do_writepage(page, wbc, &pgio); 309 + 
nfs_pageio_complete(&pgio); 310 + if (err < 0) 311 + return err; 312 + if (pgio.pg_error < 0) 313 + return pgio.pg_error; 314 + return 0; 362 315 } 363 316 364 317 int nfs_writepage(struct page *page, struct writeback_control *wbc) 365 318 { 366 - int err; 319 + int ret; 367 320 368 - err = nfs_writepage_locked(page, wbc); 321 + ret = nfs_writepage_locked(page, wbc); 369 322 unlock_page(page); 370 - return err; 323 + return ret; 324 + } 325 + 326 + static int nfs_writepages_callback(struct page *page, struct writeback_control *wbc, void *data) 327 + { 328 + int ret; 329 + 330 + ret = nfs_do_writepage(page, wbc, data); 331 + unlock_page(page); 332 + return ret; 371 333 } 372 334 373 335 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) ··· 355 365 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 356 366 357 367 nfs_pageio_init_write(&pgio, inode, wb_priority(wbc)); 358 - wbc->fs_private = &pgio; 359 - err = generic_writepages(mapping, wbc); 368 + err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 360 369 nfs_pageio_complete(&pgio); 361 - if (err) 370 + if (err < 0) 362 371 return err; 363 - if (pgio.pg_error) 372 + if (pgio.pg_error < 0) 364 373 return pgio.pg_error; 365 374 return 0; 366 375 } ··· 378 389 return error; 379 390 if (!nfsi->npages) { 380 391 igrab(inode); 381 - nfs_begin_data_update(inode); 382 392 if (nfs_have_delegation(inode, FMODE_WRITE)) 383 393 nfsi->change_attr++; 384 394 } 385 395 SetPagePrivate(req->wb_page); 386 396 set_page_private(req->wb_page, (unsigned long)req); 387 - if (PageDirty(req->wb_page)) 388 - set_bit(PG_NEED_FLUSH, &req->wb_flags); 389 397 nfsi->npages++; 390 398 kref_get(&req->wb_kref); 391 399 return 0; ··· 402 416 set_page_private(req->wb_page, 0); 403 417 ClearPagePrivate(req->wb_page); 404 418 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 405 - if (test_and_clear_bit(PG_NEED_FLUSH, &req->wb_flags)) 406 - __set_page_dirty_nobuffers(req->wb_page); 407 419 nfsi->npages--; 408 420 if (!nfsi->npages) { 409 421 spin_unlock(&inode->i_lock); 410 - nfs_end_data_update(inode); 411 422 iput(inode); 412 423 } else 413 424 spin_unlock(&inode->i_lock); ··· 665 682 666 683 int nfs_flush_incompatible(struct file *file, struct page *page) 667 684 { 668 - struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; 685 + struct nfs_open_context *ctx = nfs_file_open_context(file); 669 686 struct nfs_page *req; 670 687 int do_flush, status; 671 688 /* ··· 699 716 int nfs_updatepage(struct file *file, struct page *page, 700 717 unsigned int offset, unsigned int count) 701 718 { 702 - struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; 719 + struct nfs_open_context *ctx = nfs_file_open_context(file); 703 720 struct inode *inode = page->mapping->host; 704 721 int status = 0; 705 722 ··· 950 967 951 968 if (task->tk_status < 0) { 952 969 nfs_set_pageerror(page); 953 - req->wb_context->error = task->tk_status; 970 + nfs_context_set_write_error(req->wb_context, task->tk_status); 954 971 dprintk(", error = %d\n", task->tk_status); 955 972 goto out; 956 973 } ··· 1013 1030 1014 1031 if (task->tk_status < 0) { 1015 1032 nfs_set_pageerror(page); 1016 - req->wb_context->error = task->tk_status; 1033 + nfs_context_set_write_error(req->wb_context, task->tk_status); 1017 1034 dprintk(", error = %d\n", task->tk_status); 1018 1035 goto remove_request; 1019 1036 } ··· 1227 1244 req->wb_bytes, 1228 1245 (long long)req_offset(req)); 1229 1246 if (task->tk_status < 0) { 1230 - 
req->wb_context->error = task->tk_status; 1247 + nfs_context_set_write_error(req->wb_context, task->tk_status); 1231 1248 nfs_inode_remove_request(req); 1232 1249 dprintk(", error = %d\n", task->tk_status); 1233 1250 goto next; ··· 1330 1347 return ret; 1331 1348 } 1332 1349 1333 - /* 1334 - * flush the inode to disk. 1335 - */ 1336 - int nfs_wb_all(struct inode *inode) 1350 + static int __nfs_write_mapping(struct address_space *mapping, struct writeback_control *wbc, int how) 1337 1351 { 1338 - struct address_space *mapping = inode->i_mapping; 1352 + int ret; 1353 + 1354 + ret = nfs_writepages(mapping, wbc); 1355 + if (ret < 0) 1356 + goto out; 1357 + ret = nfs_sync_mapping_wait(mapping, wbc, how); 1358 + if (ret < 0) 1359 + goto out; 1360 + return 0; 1361 + out: 1362 + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1363 + return ret; 1364 + } 1365 + 1366 + /* Two pass sync: first using WB_SYNC_NONE, then WB_SYNC_ALL */ 1367 + static int nfs_write_mapping(struct address_space *mapping, int how) 1368 + { 1339 1369 struct writeback_control wbc = { 1340 1370 .bdi = mapping->backing_dev_info, 1341 - .sync_mode = WB_SYNC_ALL, 1371 + .sync_mode = WB_SYNC_NONE, 1342 1372 .nr_to_write = LONG_MAX, 1343 1373 .for_writepages = 1, 1344 1374 .range_cyclic = 1, 1345 1375 }; 1346 1376 int ret; 1347 1377 1348 - ret = nfs_writepages(mapping, &wbc); 1378 + ret = __nfs_write_mapping(mapping, &wbc, how); 1349 1379 if (ret < 0) 1350 - goto out; 1351 - ret = nfs_sync_mapping_wait(mapping, &wbc, 0); 1352 - if (ret >= 0) 1353 - return 0; 1354 - out: 1355 - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1356 - return ret; 1380 + return ret; 1381 + wbc.sync_mode = WB_SYNC_ALL; 1382 + return __nfs_write_mapping(mapping, &wbc, how); 1357 1383 } 1358 1384 1359 - int nfs_sync_mapping_range(struct address_space *mapping, loff_t range_start, loff_t range_end, int how) 1385 + /* 1386 + * flush the inode to disk. 
1387 + */ 1388 + int nfs_wb_all(struct inode *inode) 1360 1389 { 1361 - struct writeback_control wbc = { 1362 - .bdi = mapping->backing_dev_info, 1363 - .sync_mode = WB_SYNC_ALL, 1364 - .nr_to_write = LONG_MAX, 1365 - .range_start = range_start, 1366 - .range_end = range_end, 1367 - .for_writepages = 1, 1368 - }; 1369 - int ret; 1390 + return nfs_write_mapping(inode->i_mapping, 0); 1391 + } 1370 1392 1371 - ret = nfs_writepages(mapping, &wbc); 1372 - if (ret < 0) 1373 - goto out; 1374 - ret = nfs_sync_mapping_wait(mapping, &wbc, how); 1375 - if (ret >= 0) 1376 - return 0; 1377 - out: 1378 - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1379 - return ret; 1393 + int nfs_wb_nocommit(struct inode *inode) 1394 + { 1395 + return nfs_write_mapping(inode->i_mapping, FLUSH_NOCOMMIT); 1380 1396 } 1381 1397 1382 1398 int nfs_wb_page_cancel(struct inode *inode, struct page *page) ··· 1458 1476 { 1459 1477 return nfs_wb_page_priority(inode, page, FLUSH_STABLE); 1460 1478 } 1461 - 1462 - int nfs_set_page_dirty(struct page *page) 1463 - { 1464 - struct address_space *mapping = page->mapping; 1465 - struct inode *inode; 1466 - struct nfs_page *req; 1467 - int ret; 1468 - 1469 - if (!mapping) 1470 - goto out_raced; 1471 - inode = mapping->host; 1472 - if (!inode) 1473 - goto out_raced; 1474 - spin_lock(&inode->i_lock); 1475 - req = nfs_page_find_request_locked(page); 1476 - if (req != NULL) { 1477 - /* Mark any existing write requests for flushing */ 1478 - ret = !test_and_set_bit(PG_NEED_FLUSH, &req->wb_flags); 1479 - spin_unlock(&inode->i_lock); 1480 - nfs_release_request(req); 1481 - return ret; 1482 - } 1483 - ret = __set_page_dirty_nobuffers(page); 1484 - spin_unlock(&inode->i_lock); 1485 - return ret; 1486 - out_raced: 1487 - return !TestSetPageDirty(page); 1488 - } 1489 - 1490 1479 1491 1480 int __init nfs_init_writepagecache(void) 1492 1481 {
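For illustration: nfs_context_set_write_error() above stores the error value before setting NFS_CONTEXT_ERROR_WRITE, with an smp_wmb() in between, so that anyone who observes the flag also observes the error. A minimal sketch of a matching consumer, assuming the reader uses an atomic test-and-clear (which implies a full barrier); the helper name is hypothetical and not part of this diff:

#include <linux/nfs_fs.h>

/* Hypothetical consumer pairing with nfs_context_set_write_error().
 * test_and_clear_bit() is a value-returning atomic and therefore acts
 * as a full barrier, so the later read of ctx->error cannot be
 * reordered before the flag test. */
static int example_check_write_error(struct nfs_open_context *ctx)
{
        if (test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags))
                return xchg(&ctx->error, 0);
        return 0;
}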
+10 -6
fs/nfsd/nfs4xdr.c
··· 102 102 out: \ 103 103 return status; \ 104 104 xdr_error: \ 105 - printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \ 105 + dprintk("NFSD: xdr error (%s:%d)\n", \ 106 + __FILE__, __LINE__); \ 106 107 status = nfserr_bad_xdr; \ 107 108 goto out 108 109 ··· 125 124 if (!(x = (p==argp->tmp || p == argp->tmpp) ? \ 126 125 savemem(argp, p, nbytes) : \ 127 126 (char *)p)) { \ 128 - printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \ 127 + dprintk("NFSD: xdr error (%s:%d)\n", \ 128 + __FILE__, __LINE__); \ 129 129 goto xdr_error; \ 130 130 } \ 131 131 p += XDR_QUADLEN(nbytes); \ ··· 142 140 p = argp->p; \ 143 141 argp->p += XDR_QUADLEN(nbytes); \ 144 142 } else if (!(p = read_buf(argp, nbytes))) { \ 145 - printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); \ 143 + dprintk("NFSD: xdr error (%s:%d)\n", \ 144 + __FILE__, __LINE__); \ 146 145 goto xdr_error; \ 147 146 } \ 148 147 } while (0) ··· 951 948 */ 952 949 avail = (char*)argp->end - (char*)argp->p; 953 950 if (avail + argp->pagelen < write->wr_buflen) { 954 - printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); 951 + dprintk("NFSD: xdr error (%s:%d)\n", 952 + __FILE__, __LINE__); 955 953 goto xdr_error; 956 954 } 957 955 argp->rqstp->rq_vec[0].iov_base = p; ··· 1023 1019 argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); 1024 1020 if (!argp->ops) { 1025 1021 argp->ops = argp->iops; 1026 - printk(KERN_INFO "nfsd: couldn't allocate room for COMPOUND\n"); 1022 + dprintk("nfsd: couldn't allocate room for COMPOUND\n"); 1027 1023 goto xdr_error; 1028 1024 } 1029 1025 } ··· 1330 1326 path = exp->ex_path; 1331 1327 1332 1328 if (strncmp(path, rootpath, strlen(rootpath))) { 1333 - printk("nfsd: fs_locations failed;" 1329 + dprintk("nfsd: fs_locations failed;" 1334 1330 "%s is not contained in %s\n", path, rootpath); 1335 1331 *stat = nfserr_notsupp; 1336 1332 return NULL;
+4
include/linux/jiffies.h
··· 109 109 ((long)(a) - (long)(b) >= 0)) 110 110 #define time_before_eq(a,b) time_after_eq(b,a) 111 111 112 + #define time_in_range(a,b,c) \ 113 + (time_after_eq(a,b) && \ 114 + time_before_eq(a,c)) 115 + 112 116 /* Same as above, but does so with platform independent 64bit types. 113 117 * These must be used when utilizing jiffies_64 (i.e. return value of 114 118 * get_jiffies_64() */
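For illustration, time_in_range() composes the existing wrap-safe time_after_eq()/time_before_eq() comparisons into an inclusive interval test. A minimal sketch of a caller (the helper below is hypothetical):

#include <linux/jiffies.h>
#include <linux/types.h>

/* True while the current jiffies value lies inside the inclusive
 * window [start, start + len], safe across jiffies wraparound. */
static inline bool example_window_open(unsigned long start, unsigned long len)
{
        return time_in_range(jiffies, start, start + len);
}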
+37 -53
include/linux/nfs_fs.h
··· 47 47 #include <linux/nfs3.h> 48 48 #include <linux/nfs4.h> 49 49 #include <linux/nfs_xdr.h> 50 - 51 50 #include <linux/nfs_fs_sb.h> 52 51 53 - #include <linux/rwsem.h> 54 52 #include <linux/mempool.h> 55 53 56 54 /* ··· 75 77 struct nfs4_state *state; 76 78 fl_owner_t lockowner; 77 79 int mode; 80 + 81 + unsigned long flags; 82 + #define NFS_CONTEXT_ERROR_WRITE (0) 78 83 int error; 79 84 80 85 struct list_head list; ··· 134 133 * server. 135 134 */ 136 135 unsigned long cache_change_attribute; 137 - /* 138 - * Counter indicating the number of outstanding requests that 139 - * will cause a file data update. 140 - */ 141 - atomic_t data_updates; 142 136 143 137 struct rb_root access_cache; 144 138 struct list_head access_cache_entry_lru; ··· 201 205 #define NFS_CLIENT(inode) (NFS_SERVER(inode)->client) 202 206 #define NFS_PROTO(inode) (NFS_SERVER(inode)->nfs_client->rpc_ops) 203 207 #define NFS_COOKIEVERF(inode) (NFS_I(inode)->cookieverf) 204 - #define NFS_READTIME(inode) (NFS_I(inode)->read_cache_jiffies) 205 - #define NFS_CHANGE_ATTR(inode) (NFS_I(inode)->change_attr) 206 - #define NFS_ATTRTIMEO(inode) (NFS_I(inode)->attrtimeo) 207 208 #define NFS_MINATTRTIMEO(inode) \ 208 209 (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \ 209 210 : NFS_SERVER(inode)->acregmin) 210 211 #define NFS_MAXATTRTIMEO(inode) \ 211 212 (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmax \ 212 213 : NFS_SERVER(inode)->acregmax) 213 - #define NFS_ATTRTIMEO_UPDATE(inode) (NFS_I(inode)->attrtimeo_timestamp) 214 214 215 215 #define NFS_FLAGS(inode) (NFS_I(inode)->flags) 216 216 #define NFS_STALE(inode) (test_bit(NFS_INO_STALE, &NFS_FLAGS(inode))) 217 217 218 218 #define NFS_FILEID(inode) (NFS_I(inode)->fileid) 219 - 220 - static inline int nfs_caches_unstable(struct inode *inode) 221 - { 222 - return atomic_read(&NFS_I(inode)->data_updates) != 0; 223 - } 224 219 225 220 static inline void nfs_mark_for_revalidate(struct inode *inode) 226 221 { ··· 224 237 spin_unlock(&inode->i_lock); 225 238 } 226 239 227 - static inline void NFS_CACHEINV(struct inode *inode) 228 - { 229 - if (!nfs_caches_unstable(inode)) 230 - nfs_mark_for_revalidate(inode); 231 - } 232 - 233 240 static inline int nfs_server_capable(struct inode *inode, int cap) 234 241 { 235 242 return NFS_SERVER(inode)->caps & cap; ··· 234 253 return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); 235 254 } 236 255 237 - /** 238 - * nfs_save_change_attribute - Returns the inode attribute change cookie 239 - * @inode - pointer to inode 240 - * The "change attribute" is updated every time we finish an operation 241 - * that will result in a metadata change on the server. 242 - */ 243 - static inline long nfs_save_change_attribute(struct inode *inode) 256 + static inline void nfs_set_verifier(struct dentry * dentry, unsigned long verf) 244 257 { 245 - return NFS_I(inode)->cache_change_attribute; 258 + dentry->d_time = verf; 246 259 } 247 260 248 261 /** 249 - * nfs_verify_change_attribute - Detects NFS inode cache updates 250 - * @inode - pointer to inode 251 - * @chattr - previously saved change attribute 252 - * Return "false" if metadata has been updated (or is in the process of 253 - * being updated) since the change attribute was saved. 262 + * nfs_save_change_attribute - Returns the inode attribute change cookie 263 + * @dir - pointer to parent directory inode 264 + * The "change attribute" is updated every time we finish an operation 265 + * that will result in a metadata change on the server. 
254 266 */ 255 - static inline int nfs_verify_change_attribute(struct inode *inode, unsigned long chattr) 267 + static inline unsigned long nfs_save_change_attribute(struct inode *dir) 256 268 { 257 - return !nfs_caches_unstable(inode) 258 - && time_after_eq(chattr, NFS_I(inode)->cache_change_attribute); 269 + return NFS_I(dir)->cache_change_attribute; 270 + } 271 + 272 + /** 273 + * nfs_verify_change_attribute - Detects NFS remote directory changes 274 + * @dir - pointer to parent directory inode 275 + * @chattr - previously saved change attribute 276 + * Return "false" if the verifiers doesn't match the change attribute. 277 + * This would usually indicate that the directory contents have changed on 278 + * the server, and that any dentries need revalidating. 279 + */ 280 + static inline int nfs_verify_change_attribute(struct inode *dir, unsigned long chattr) 281 + { 282 + return chattr == NFS_I(dir)->cache_change_attribute; 259 283 } 260 284 261 285 /* ··· 269 283 extern int nfs_sync_mapping(struct address_space *mapping); 270 284 extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping); 271 285 extern void nfs_zap_caches(struct inode *); 286 + extern void nfs_invalidate_atime(struct inode *); 272 287 extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, 273 288 struct nfs_fattr *); 274 289 extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); 275 290 extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); 291 + extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); 276 292 extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 277 293 extern int nfs_permission(struct inode *, int, struct nameidata *); 278 - extern int nfs_access_get_cached(struct inode *, struct rpc_cred *, struct nfs_access_entry *); 279 - extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); 280 - extern void nfs_access_zap_cache(struct inode *inode); 281 294 extern int nfs_open(struct inode *, struct file *); 282 295 extern int nfs_release(struct inode *, struct file *); 283 296 extern int nfs_attribute_timeout(struct inode *inode); ··· 286 301 extern int nfs_revalidate_mapping_nolock(struct inode *inode, struct address_space *mapping); 287 302 extern int nfs_setattr(struct dentry *, struct iattr *); 288 303 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); 289 - extern void nfs_begin_attr_update(struct inode *); 290 - extern void nfs_end_attr_update(struct inode *); 291 - extern void nfs_begin_data_update(struct inode *); 292 - extern void nfs_end_data_update(struct inode *); 293 304 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); 294 305 extern void put_nfs_open_context(struct nfs_open_context *ctx); 295 306 extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode); 307 + extern u64 nfs_compat_user_ino64(u64 fileid); 296 308 297 309 /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. 
*/ 298 310 extern __be32 root_nfs_parse_addr(char *name); /*__init*/ ··· 310 328 extern const struct file_operations nfs_file_operations; 311 329 extern const struct address_space_operations nfs_file_aops; 312 330 331 + static inline struct nfs_open_context *nfs_file_open_context(struct file *filp) 332 + { 333 + return filp->private_data; 334 + } 335 + 313 336 static inline struct rpc_cred *nfs_file_cred(struct file *file) 314 337 { 315 - if (file != NULL) { 316 - struct nfs_open_context *ctx; 317 - 318 - ctx = (struct nfs_open_context*)file->private_data; 319 - return ctx->cred; 320 - } 338 + if (file != NULL) 339 + return nfs_file_open_context(file)->cred; 321 340 return NULL; 322 341 } 323 342 ··· 361 378 extern struct dentry_operations nfs_dentry_operations; 362 379 363 380 extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); 381 + extern int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags); 382 + extern void nfs_access_zap_cache(struct inode *inode); 364 383 365 384 /* 366 385 * linux/fs/nfs/symlink.c ··· 405 420 extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); 406 421 extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); 407 422 extern void nfs_writedata_release(void *); 408 - extern int nfs_set_page_dirty(struct page *); 409 423 410 424 /* 411 425 * Try to write back everything synchronously (but check the 412 426 * return value!) 413 427 */ 414 428 extern long nfs_sync_mapping_wait(struct address_space *, struct writeback_control *, int); 415 - extern int nfs_sync_mapping_range(struct address_space *, loff_t, loff_t, int); 416 429 extern int nfs_wb_all(struct inode *inode); 430 + extern int nfs_wb_nocommit(struct inode *inode); 417 431 extern int nfs_wb_page(struct inode *inode, struct page* page); 418 432 extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how); 419 433 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
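For illustration, the reworked verifier helpers form a save/verify pair keyed on the parent directory's change attribute: nfs_set_verifier() stamps a dentry with the cookie returned by nfs_save_change_attribute(), and nfs_verify_change_attribute() later checks whether that cookie is still current. A rough sketch of the calling pattern, assuming the usual dentry->d_time storage; the function names are placeholders, not copies of fs/nfs/dir.c:

#include <linux/nfs_fs.h>

/* After a successful directory operation, remember the directory's
 * change cookie on the dentry that was affected. */
static void example_record_verifier(struct inode *dir, struct dentry *dentry)
{
        nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
}

/* On revalidation, trust the dentry only while that cookie still
 * matches the directory's current change attribute. */
static int example_dentry_is_fresh(struct inode *dir, struct dentry *dentry)
{
        return nfs_verify_change_attribute(dir, dentry->d_time);
}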
-1
include/linux/nfs_page.h
··· 30 30 #define PG_BUSY 0 31 31 #define PG_NEED_COMMIT 1 32 32 #define PG_NEED_RESCHED 2 33 - #define PG_NEED_FLUSH 3 34 33 35 34 struct nfs_inode; 36 35 struct nfs_page {
+5 -1
include/linux/nfs_xdr.h
··· 62 62 #define NFS_ATTR_FATTR 0x0002 /* post-op attributes */ 63 63 #define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ 64 64 #define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */ 65 - #define NFS_ATTR_FATTR_V4_REFERRAL 0x0010 /* NFSv4 referral */ 65 + #define NFS_ATTR_WCC_V4 0x0010 /* pre-op change attribute */ 66 + #define NFS_ATTR_FATTR_V4_REFERRAL 0x0020 /* NFSv4 referral */ 66 67 67 68 /* 68 69 * Info on the file system ··· 539 538 540 539 struct nfs4_accessargs { 541 540 const struct nfs_fh * fh; 541 + const u32 * bitmask; 542 542 u32 access; 543 543 }; 544 544 545 545 struct nfs4_accessres { 546 + const struct nfs_server * server; 547 + struct nfs_fattr * fattr; 546 548 u32 supported; 547 549 u32 access; 548 550 };
+1 -1
include/linux/sunrpc/clnt.h
··· 117 117 118 118 struct rpc_clnt *rpc_create(struct rpc_create_args *args); 119 119 struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, 120 - struct rpc_program *, int); 120 + struct rpc_program *, u32); 121 121 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); 122 122 void rpc_shutdown_client(struct rpc_clnt *); 123 123 void rpc_release_client(struct rpc_clnt *);
+5
include/linux/sunrpc/debug.h
··· 88 88 CTL_SLOTTABLE_TCP, 89 89 CTL_MIN_RESVPORT, 90 90 CTL_MAX_RESVPORT, 91 + CTL_SLOTTABLE_RDMA, 92 + CTL_RDMA_MAXINLINEREAD, 93 + CTL_RDMA_MAXINLINEWRITE, 94 + CTL_RDMA_WRITEPADDING, 95 + CTL_RDMA_MEMREG, 91 96 }; 92 97 93 98 #endif /* _LINUX_SUNRPC_DEBUG_H_ */
+13
include/linux/sunrpc/msg_prot.h
··· 138 138 #define RPC_MAX_HEADER_WITH_AUTH \ 139 139 (RPC_CALLHDRSIZE + 2*(2+RPC_MAX_AUTH_SIZE/4)) 140 140 141 + /* 142 + * RFC1833/RFC3530 rpcbind (v3+) well-known netid's. 143 + */ 144 + #define RPCBIND_NETID_UDP "udp" 145 + #define RPCBIND_NETID_TCP "tcp" 146 + #define RPCBIND_NETID_UDP6 "udp6" 147 + #define RPCBIND_NETID_TCP6 "tcp6" 148 + 149 + /* 150 + * Note that RFC 1833 does not put any size restrictions on the 151 + * netid string, but all currently defined netid's fit in 4 bytes. 152 + */ 153 + #define RPCBIND_MAXNETIDLEN (4u) 141 154 142 155 #endif /* __KERNEL__ */ 143 156 #endif /* _LINUX_SUNRPC_MSGPROT_H_ */
+116
include/linux/sunrpc/rpc_rdma.h
··· 1 + /* 2 + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + */ 39 + 40 + #ifndef _LINUX_SUNRPC_RPC_RDMA_H 41 + #define _LINUX_SUNRPC_RPC_RDMA_H 42 + 43 + struct rpcrdma_segment { 44 + uint32_t rs_handle; /* Registered memory handle */ 45 + uint32_t rs_length; /* Length of the chunk in bytes */ 46 + uint64_t rs_offset; /* Chunk virtual address or offset */ 47 + }; 48 + 49 + /* 50 + * read chunk(s), encoded as a linked list. 51 + */ 52 + struct rpcrdma_read_chunk { 53 + uint32_t rc_discrim; /* 1 indicates presence */ 54 + uint32_t rc_position; /* Position in XDR stream */ 55 + struct rpcrdma_segment rc_target; 56 + }; 57 + 58 + /* 59 + * write chunk, and reply chunk. 60 + */ 61 + struct rpcrdma_write_chunk { 62 + struct rpcrdma_segment wc_target; 63 + }; 64 + 65 + /* 66 + * write chunk(s), encoded as a counted array. 
67 + */ 68 + struct rpcrdma_write_array { 69 + uint32_t wc_discrim; /* 1 indicates presence */ 70 + uint32_t wc_nchunks; /* Array count */ 71 + struct rpcrdma_write_chunk wc_array[0]; 72 + }; 73 + 74 + struct rpcrdma_msg { 75 + uint32_t rm_xid; /* Mirrors the RPC header xid */ 76 + uint32_t rm_vers; /* Version of this protocol */ 77 + uint32_t rm_credit; /* Buffers requested/granted */ 78 + uint32_t rm_type; /* Type of message (enum rpcrdma_proc) */ 79 + union { 80 + 81 + struct { /* no chunks */ 82 + uint32_t rm_empty[3]; /* 3 empty chunk lists */ 83 + } rm_nochunks; 84 + 85 + struct { /* no chunks and padded */ 86 + uint32_t rm_align; /* Padding alignment */ 87 + uint32_t rm_thresh; /* Padding threshold */ 88 + uint32_t rm_pempty[3]; /* 3 empty chunk lists */ 89 + } rm_padded; 90 + 91 + uint32_t rm_chunks[0]; /* read, write and reply chunks */ 92 + 93 + } rm_body; 94 + }; 95 + 96 + #define RPCRDMA_HDRLEN_MIN 28 97 + 98 + enum rpcrdma_errcode { 99 + ERR_VERS = 1, 100 + ERR_CHUNK = 2 101 + }; 102 + 103 + struct rpcrdma_err_vers { 104 + uint32_t rdma_vers_low; /* Version range supported by peer */ 105 + uint32_t rdma_vers_high; 106 + }; 107 + 108 + enum rpcrdma_proc { 109 + RDMA_MSG = 0, /* An RPC call or reply msg */ 110 + RDMA_NOMSG = 1, /* An RPC call or reply msg - separate body */ 111 + RDMA_MSGP = 2, /* An RPC call or reply msg with padding */ 112 + RDMA_DONE = 3, /* Client signals reply completion */ 113 + RDMA_ERROR = 4 /* An RPC RDMA encoding error */ 114 + }; 115 + 116 + #endif /* _LINUX_SUNRPC_RPC_RDMA_H */
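A note on RPCRDMA_HDRLEN_MIN: the value 28 appears to follow from struct rpcrdma_msg itself, i.e. four fixed 32-bit header words (rm_xid, rm_vers, rm_credit, rm_type) plus the three empty 32-bit chunk-list discriminators of the no-chunks body: 4*4 + 3*4 = 28 bytes. That derivation is an inference from the layout, not stated in the header; a hypothetical compile-time check would be:

#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/sunrpc/rpc_rdma.h>

/* Hypothetical sanity check, not part of this patch. */
static inline void example_rpcrdma_hdrlen_check(void)
{
        BUILD_BUG_ON(RPCRDMA_HDRLEN_MIN !=
                     offsetof(struct rpcrdma_msg, rm_body) +
                     sizeof(((struct rpcrdma_msg *)NULL)->rm_body.rm_nochunks));
}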
+4 -1
include/linux/sunrpc/xdr.h
··· 70 70 71 71 struct page ** pages; /* Array of contiguous pages */ 72 72 unsigned int page_base, /* Start of page data */ 73 - page_len; /* Length of page data */ 73 + page_len, /* Length of page data */ 74 + flags; /* Flags for data disposition */ 75 + #define XDRBUF_READ 0x01 /* target of file read */ 76 + #define XDRBUF_WRITE 0x02 /* source of file write */ 74 77 75 78 unsigned int buflen, /* Total length of storage buffer */ 76 79 len; /* Length of XDR encoded message */
+17 -25
include/linux/sunrpc/xprt.h
··· 19 19 20 20 #ifdef __KERNEL__ 21 21 22 - extern unsigned int xprt_udp_slot_table_entries; 23 - extern unsigned int xprt_tcp_slot_table_entries; 24 - 25 22 #define RPC_MIN_SLOT_TABLE (2U) 26 23 #define RPC_DEF_SLOT_TABLE (16U) 27 24 #define RPC_MAX_SLOT_TABLE (128U) 28 - 29 - /* 30 - * Parameters for choosing a free port 31 - */ 32 - extern unsigned int xprt_min_resvport; 33 - extern unsigned int xprt_max_resvport; 34 - 35 - #define RPC_MIN_RESVPORT (1U) 36 - #define RPC_MAX_RESVPORT (65535U) 37 - #define RPC_DEF_MIN_RESVPORT (665U) 38 - #define RPC_DEF_MAX_RESVPORT (1023U) 39 25 40 26 /* 41 27 * This describes a timeout strategy ··· 39 53 RPC_DISPLAY_PORT, 40 54 RPC_DISPLAY_PROTO, 41 55 RPC_DISPLAY_ALL, 56 + RPC_DISPLAY_HEX_ADDR, 57 + RPC_DISPLAY_HEX_PORT, 58 + RPC_DISPLAY_UNIVERSAL_ADDR, 59 + RPC_DISPLAY_NETID, 42 60 RPC_DISPLAY_MAX, 43 61 }; 44 62 ··· 186 196 char * address_strings[RPC_DISPLAY_MAX]; 187 197 }; 188 198 189 - struct rpc_xprtsock_create { 190 - int proto; /* IPPROTO_UDP or IPPROTO_TCP */ 199 + struct xprt_create { 200 + int ident; /* XPRT_TRANSPORT identifier */ 191 201 struct sockaddr * srcaddr; /* optional local address */ 192 202 struct sockaddr * dstaddr; /* remote peer address */ 193 203 size_t addrlen; 194 204 struct rpc_timeout * timeout; /* optional timeout parameters */ 205 + }; 206 + 207 + struct xprt_class { 208 + struct list_head list; 209 + int ident; /* XPRT_TRANSPORT identifier */ 210 + struct rpc_xprt * (*setup)(struct xprt_create *); 211 + struct module *owner; 212 + char name[32]; 195 213 }; 196 214 197 215 /* ··· 210 212 /* 211 213 * Generic internal transport functions 212 214 */ 213 - struct rpc_xprt * xprt_create_transport(struct rpc_xprtsock_create *args); 215 + struct rpc_xprt *xprt_create_transport(struct xprt_create *args); 214 216 void xprt_connect(struct rpc_task *task); 215 217 void xprt_reserve(struct rpc_task *task); 216 218 int xprt_reserve_xprt(struct rpc_task *task); ··· 233 235 /* 234 236 * Transport switch helper functions 235 237 */ 238 + int xprt_register_transport(struct xprt_class *type); 239 + int xprt_unregister_transport(struct xprt_class *type); 236 240 void xprt_set_retrans_timeout_def(struct rpc_task *task); 237 241 void xprt_set_retrans_timeout_rtt(struct rpc_task *task); 238 242 void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status); ··· 246 246 void xprt_complete_rqst(struct rpc_task *task, int copied); 247 247 void xprt_release_rqst_cong(struct rpc_task *task); 248 248 void xprt_disconnect(struct rpc_xprt *xprt); 249 - 250 - /* 251 - * Socket transport setup operations 252 - */ 253 - struct rpc_xprt * xs_setup_udp(struct rpc_xprtsock_create *args); 254 - struct rpc_xprt * xs_setup_tcp(struct rpc_xprtsock_create *args); 255 - int init_socket_xprt(void); 256 - void cleanup_socket_xprt(void); 257 249 258 250 /* 259 251 * Reserved bit positions in xprt->state
+85
include/linux/sunrpc/xprtrdma.h
··· 1 + /* 2 + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + */ 39 + 40 + #ifndef _LINUX_SUNRPC_XPRTRDMA_H 41 + #define _LINUX_SUNRPC_XPRTRDMA_H 42 + 43 + /* 44 + * RPC transport identifier for RDMA 45 + */ 46 + #define XPRT_TRANSPORT_RDMA 256 47 + 48 + /* 49 + * rpcbind (v3+) RDMA netid. 50 + */ 51 + #define RPCBIND_NETID_RDMA "rdma" 52 + 53 + /* 54 + * Constants. Max RPC/NFS header is big enough to account for 55 + * additional marshaling buffers passed down by Linux client. 56 + * 57 + * RDMA header is currently fixed max size, and is big enough for a 58 + * fully-chunked NFS message (read chunks are the largest). Note only 59 + * a single chunk type per message is supported currently. 60 + */ 61 + #define RPCRDMA_MIN_SLOT_TABLE (2U) 62 + #define RPCRDMA_DEF_SLOT_TABLE (32U) 63 + #define RPCRDMA_MAX_SLOT_TABLE (256U) 64 + 65 + #define RPCRDMA_DEF_INLINE (1024) /* default inline max */ 66 + 67 + #define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */ 68 + 69 + #define RDMA_RESOLVE_TIMEOUT (5*HZ) /* TBD 5 seconds */ 70 + #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ 71 + 72 + /* memory registration strategies */ 73 + #define RPCRDMA_PERSISTENT_REGISTRATION (1) 74 + 75 + enum rpcrdma_memreg { 76 + RPCRDMA_BOUNCEBUFFERS = 0, 77 + RPCRDMA_REGISTER, 78 + RPCRDMA_MEMWINDOWS, 79 + RPCRDMA_MEMWINDOWS_ASYNC, 80 + RPCRDMA_MTHCAFMR, 81 + RPCRDMA_ALLPHYSICAL, 82 + RPCRDMA_LAST 83 + }; 84 + 85 + #endif /* _LINUX_SUNRPC_XPRTRDMA_H */
+51
include/linux/sunrpc/xprtsock.h
··· 1 + /* 2 + * linux/include/linux/sunrpc/xprtsock.h 3 + * 4 + * Declarations for the RPC transport socket provider. 5 + */ 6 + 7 + #ifndef _LINUX_SUNRPC_XPRTSOCK_H 8 + #define _LINUX_SUNRPC_XPRTSOCK_H 9 + 10 + #ifdef __KERNEL__ 11 + 12 + /* 13 + * Socket transport setup operations 14 + */ 15 + struct rpc_xprt *xs_setup_udp(struct xprt_create *args); 16 + struct rpc_xprt *xs_setup_tcp(struct xprt_create *args); 17 + 18 + int init_socket_xprt(void); 19 + void cleanup_socket_xprt(void); 20 + 21 + /* 22 + * RPC transport identifiers for UDP, TCP 23 + * 24 + * To preserve compatibility with the historical use of raw IP protocol 25 + * id's for transport selection, these are specified with the previous 26 + * values. No such restriction exists for new transports, except that 27 + * they may not collide with these values (17 and 6, respectively). 28 + */ 29 + #define XPRT_TRANSPORT_UDP IPPROTO_UDP 30 + #define XPRT_TRANSPORT_TCP IPPROTO_TCP 31 + 32 + /* 33 + * RPC slot table sizes for UDP, TCP transports 34 + */ 35 + extern unsigned int xprt_udp_slot_table_entries; 36 + extern unsigned int xprt_tcp_slot_table_entries; 37 + 38 + /* 39 + * Parameters for choosing a free port 40 + */ 41 + extern unsigned int xprt_min_resvport; 42 + extern unsigned int xprt_max_resvport; 43 + 44 + #define RPC_MIN_RESVPORT (1U) 45 + #define RPC_MAX_RESVPORT (65535U) 46 + #define RPC_DEF_MIN_RESVPORT (665U) 47 + #define RPC_DEF_MAX_RESVPORT (1023U) 48 + 49 + #endif /* __KERNEL__ */ 50 + 51 + #endif /* _LINUX_SUNRPC_XPRTSOCK_H */
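For illustration, because XPRT_TRANSPORT_UDP and XPRT_TRANSPORT_TCP keep the historical IPPROTO values, existing callers of rpc_create() simply switch to the new symbolic names. A minimal sketch of such a caller, with placeholder server details (the program, version and address below are illustrative, not from this diff):

#include <linux/in.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprtsock.h>

/* Create a TCP RPC client for a caller-supplied program/version. */
static struct rpc_clnt *example_create_client(struct sockaddr_in *sin,
                                              struct rpc_program *prog, u32 vers)
{
        struct rpc_create_args args = {
                .protocol       = XPRT_TRANSPORT_TCP,
                .address        = (struct sockaddr *)sin,
                .addrsize       = sizeof(*sin),
                .servername     = "example-server",
                .program        = prog,
                .version        = vers,
                .authflavor     = RPC_AUTH_UNIX,
        };

        return rpc_create(&args);
}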
-2
include/linux/writeback.h
··· 62 62 unsigned for_reclaim:1; /* Invoked from the page allocator */ 63 63 unsigned for_writepages:1; /* This is a writepages() call */ 64 64 unsigned range_cyclic:1; /* range_start is cyclic */ 65 - 66 - void *fs_private; /* For use by ->writepages() */ 67 65 }; 68 66 69 67 /*
+1
kernel/auditsc.c
··· 1525 1525 context->names[idx].ino = (unsigned long)-1; 1526 1526 } 1527 1527 } 1528 + EXPORT_SYMBOL_GPL(__audit_inode_child); 1528 1529 1529 1530 /** 1530 1531 * auditsc_get_stamp - get local copies of audit_context values
+1
net/sunrpc/Makefile
··· 5 5 6 6 obj-$(CONFIG_SUNRPC) += sunrpc.o 7 7 obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ 8 + obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ 8 9 9 10 sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ 10 11 auth.o auth_null.o auth_unix.o \
+3 -3
net/sunrpc/auth_gss/gss_krb5_wrap.c
··· 42 42 { 43 43 u8 *ptr; 44 44 u8 pad; 45 - int len = buf->len; 45 + size_t len = buf->len; 46 46 47 47 if (len <= buf->head[0].iov_len) { 48 48 pad = *(u8 *)(buf->head[0].iov_base + len - 1); ··· 53 53 } else 54 54 len -= buf->head[0].iov_len; 55 55 if (len <= buf->page_len) { 56 - int last = (buf->page_base + len - 1) 56 + unsigned int last = (buf->page_base + len - 1) 57 57 >>PAGE_CACHE_SHIFT; 58 - int offset = (buf->page_base + len - 1) 58 + unsigned int offset = (buf->page_base + len - 1) 59 59 & (PAGE_CACHE_SIZE - 1); 60 60 ptr = kmap_atomic(buf->pages[last], KM_USER0); 61 61 pad = *(ptr + offset);
+36 -16
net/sunrpc/clnt.c
··· 127 127 struct rpc_clnt *clnt = NULL; 128 128 struct rpc_auth *auth; 129 129 int err; 130 - int len; 130 + size_t len; 131 + 132 + /* sanity check the name before trying to print it */ 133 + err = -EINVAL; 134 + len = strlen(servname); 135 + if (len > RPC_MAXNETNAMELEN) 136 + goto out_no_rpciod; 137 + len++; 131 138 132 139 dprintk("RPC: creating %s client for %s (xprt %p)\n", 133 140 program->name, servname, xprt); ··· 155 148 clnt->cl_parent = clnt; 156 149 157 150 clnt->cl_server = clnt->cl_inline_name; 158 - len = strlen(servname) + 1; 159 151 if (len > sizeof(clnt->cl_inline_name)) { 160 152 char *buf = kmalloc(len, GFP_KERNEL); 161 153 if (buf != 0) ··· 240 234 { 241 235 struct rpc_xprt *xprt; 242 236 struct rpc_clnt *clnt; 243 - struct rpc_xprtsock_create xprtargs = { 244 - .proto = args->protocol, 237 + struct xprt_create xprtargs = { 238 + .ident = args->protocol, 245 239 .srcaddr = args->saddress, 246 240 .dstaddr = args->address, 247 241 .addrlen = args->addrsize, ··· 259 253 */ 260 254 if (args->servername == NULL) { 261 255 struct sockaddr_in *addr = 262 - (struct sockaddr_in *) &args->address; 256 + (struct sockaddr_in *) args->address; 263 257 snprintf(servername, sizeof(servername), NIPQUAD_FMT, 264 258 NIPQUAD(addr->sin_addr.s_addr)); 265 259 args->servername = servername; ··· 274 268 xprt->resvport = 1; 275 269 if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) 276 270 xprt->resvport = 0; 277 - 278 - dprintk("RPC: creating %s client for %s (xprt %p)\n", 279 - args->program->name, args->servername, xprt); 280 271 281 272 clnt = rpc_new_client(xprt, args->servername, args->program, 282 273 args->version, args->authflavor); ··· 442 439 */ 443 440 struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, 444 441 struct rpc_program *program, 445 - int vers) 442 + u32 vers) 446 443 { 447 444 struct rpc_clnt *clnt; 448 445 struct rpc_version *version; ··· 846 843 dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid); 847 844 848 845 if (RPC_IS_ASYNC(task) || !signalled()) { 849 - xprt_release(task); 850 - task->tk_action = call_reserve; 846 + task->tk_action = call_allocate; 851 847 rpc_delay(task, HZ>>4); 852 848 return; 853 849 } ··· 873 871 buf->head[0].iov_len = len; 874 872 buf->tail[0].iov_len = 0; 875 873 buf->page_len = 0; 874 + buf->flags = 0; 876 875 buf->len = 0; 877 876 buf->buflen = len; 878 877 } ··· 940 937 static void 941 938 call_bind_status(struct rpc_task *task) 942 939 { 943 - int status = -EACCES; 940 + int status = -EIO; 944 941 945 942 if (task->tk_status >= 0) { 946 943 dprint_status(task); ··· 950 947 } 951 948 952 949 switch (task->tk_status) { 950 + case -EAGAIN: 951 + dprintk("RPC: %5u rpcbind waiting for another request " 952 + "to finish\n", task->tk_pid); 953 + /* avoid busy-waiting here -- could be a network outage. 
*/ 954 + rpc_delay(task, 5*HZ); 955 + goto retry_timeout; 953 956 case -EACCES: 954 957 dprintk("RPC: %5u remote rpcbind: RPC program/version " 955 958 "unavailable\n", task->tk_pid); 959 + /* fail immediately if this is an RPC ping */ 960 + if (task->tk_msg.rpc_proc->p_proc == 0) { 961 + status = -EOPNOTSUPP; 962 + break; 963 + } 956 964 rpc_delay(task, 3*HZ); 957 965 goto retry_timeout; 958 966 case -ETIMEDOUT: ··· 971 957 task->tk_pid); 972 958 goto retry_timeout; 973 959 case -EPFNOSUPPORT: 960 + /* server doesn't support any rpcbind version we know of */ 974 961 dprintk("RPC: %5u remote rpcbind service unavailable\n", 975 962 task->tk_pid); 976 963 break; ··· 984 969 default: 985 970 dprintk("RPC: %5u unrecognized rpcbind error (%d)\n", 986 971 task->tk_pid, -task->tk_status); 987 - status = -EIO; 988 972 } 989 973 990 974 rpc_exit(task, status); ··· 1271 1257 { 1272 1258 dprint_status(task); 1273 1259 1274 - xprt_release(task); /* Must do to obtain new XID */ 1275 1260 task->tk_action = call_refreshresult; 1276 1261 task->tk_status = 0; 1277 1262 task->tk_client->cl_stats->rpcauthrefresh++; ··· 1388 1375 dprintk("RPC: %5u %s: retry stale creds\n", 1389 1376 task->tk_pid, __FUNCTION__); 1390 1377 rpcauth_invalcred(task); 1378 + /* Ensure we obtain a new XID! */ 1379 + xprt_release(task); 1391 1380 task->tk_action = call_refresh; 1392 1381 goto out_retry; 1393 1382 case RPC_AUTH_BADCRED: ··· 1538 1523 spin_lock(&clnt->cl_lock); 1539 1524 list_for_each_entry(t, &clnt->cl_tasks, tk_task) { 1540 1525 const char *rpc_waitq = "none"; 1526 + int proc; 1527 + 1528 + if (t->tk_msg.rpc_proc) 1529 + proc = t->tk_msg.rpc_proc->p_proc; 1530 + else 1531 + proc = -1; 1541 1532 1542 1533 if (RPC_IS_QUEUED(t)) 1543 1534 rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); 1544 1535 1545 1536 printk("%5u %04d %04x %6d %8p %6d %8p %8ld %8s %8p %8p\n", 1546 - t->tk_pid, 1547 - (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1), 1537 + t->tk_pid, proc, 1548 1538 t->tk_flags, t->tk_status, 1549 1539 t->tk_client, 1550 1540 (t->tk_client ? t->tk_client->cl_prog : 0),
+5 -3
net/sunrpc/rpc_pipe.c
··· 14 14 #include <linux/pagemap.h> 15 15 #include <linux/mount.h> 16 16 #include <linux/namei.h> 17 - #include <linux/dnotify.h> 17 + #include <linux/fsnotify.h> 18 18 #include <linux/kernel.h> 19 19 20 20 #include <asm/ioctls.h> ··· 329 329 clnt->cl_prog, clnt->cl_vers); 330 330 seq_printf(m, "address: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); 331 331 seq_printf(m, "protocol: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PROTO)); 332 + seq_printf(m, "port: %s\n", rpc_peeraddr2str(clnt, RPC_DISPLAY_PORT)); 332 333 return 0; 333 334 } 334 335 ··· 586 585 if (S_ISDIR(mode)) 587 586 inc_nlink(dir); 588 587 d_add(dentry, inode); 588 + fsnotify_create(dir, dentry); 589 589 } 590 590 mutex_unlock(&dir->i_mutex); 591 591 return 0; ··· 608 606 inode->i_ino = iunique(dir->i_sb, 100); 609 607 d_instantiate(dentry, inode); 610 608 inc_nlink(dir); 611 - inode_dir_notify(dir, DN_CREATE); 609 + fsnotify_mkdir(dir, dentry); 612 610 return 0; 613 611 out_err: 614 612 printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", ··· 750 748 rpci->flags = flags; 751 749 rpci->ops = ops; 752 750 rpci->nkern_readwriters = 1; 753 - inode_dir_notify(dir, DN_CREATE); 751 + fsnotify_create(dir, dentry); 754 752 dget(dentry); 755 753 out: 756 754 mutex_unlock(&dir->i_mutex);
+96 -51
net/sunrpc/rpcb_clnt.c
··· 16 16 17 17 #include <linux/types.h> 18 18 #include <linux/socket.h> 19 + #include <linux/in.h> 20 + #include <linux/in6.h> 19 21 #include <linux/kernel.h> 20 22 #include <linux/errno.h> 21 23 22 24 #include <linux/sunrpc/clnt.h> 23 25 #include <linux/sunrpc/sched.h> 26 + #include <linux/sunrpc/xprtsock.h> 24 27 25 28 #ifdef RPC_DEBUG 26 29 # define RPCDBG_FACILITY RPCDBG_BIND ··· 94 91 #define RPCB_MAXADDRLEN (128u) 95 92 96 93 /* 97 - * r_netid 98 - * 99 - * Quoting RFC 3530, section 2.2: 100 - * 101 - * For TCP over IPv4 the value of r_netid is the string "tcp". For UDP 102 - * over IPv4 the value of r_netid is the string "udp". 103 - * 104 - * ... 105 - * 106 - * For TCP over IPv6 the value of r_netid is the string "tcp6". For UDP 107 - * over IPv6 the value of r_netid is the string "udp6". 108 - */ 109 - #define RPCB_NETID_UDP "\165\144\160" /* "udp" */ 110 - #define RPCB_NETID_TCP "\164\143\160" /* "tcp" */ 111 - #define RPCB_NETID_UDP6 "\165\144\160\066" /* "udp6" */ 112 - #define RPCB_NETID_TCP6 "\164\143\160\066" /* "tcp6" */ 113 - 114 - #define RPCB_MAXNETIDLEN (4u) 115 - 116 - /* 117 94 * r_owner 118 95 * 119 96 * The "owner" is allowed to unset a service in the rpcbind database. ··· 103 120 #define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) 104 121 105 122 static void rpcb_getport_done(struct rpc_task *, void *); 106 - extern struct rpc_program rpcb_program; 123 + static struct rpc_program rpcb_program; 107 124 108 125 struct rpcbind_args { 109 126 struct rpc_xprt * r_xprt; ··· 120 137 static struct rpc_procinfo rpcb_procedures2[]; 121 138 static struct rpc_procinfo rpcb_procedures3[]; 122 139 123 - static struct rpcb_info { 140 + struct rpcb_info { 124 141 int rpc_vers; 125 142 struct rpc_procinfo * rpc_proc; 126 - } rpcb_next_version[]; 143 + }; 144 + 145 + static struct rpcb_info rpcb_next_version[]; 146 + static struct rpcb_info rpcb_next_version6[]; 127 147 128 148 static void rpcb_getport_prepare(struct rpc_task *task, void *calldata) 129 149 { ··· 176 190 RPC_CLNT_CREATE_INTR), 177 191 }; 178 192 179 - ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT); 193 + switch (srvaddr->sa_family) { 194 + case AF_INET: 195 + ((struct sockaddr_in *)srvaddr)->sin_port = htons(RPCBIND_PORT); 196 + break; 197 + case AF_INET6: 198 + ((struct sockaddr_in6 *)srvaddr)->sin6_port = htons(RPCBIND_PORT); 199 + break; 200 + default: 201 + return NULL; 202 + } 203 + 180 204 if (!privileged) 181 205 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; 182 206 return rpc_create(&args); ··· 230 234 prog, vers, prot, port); 231 235 232 236 rpcb_clnt = rpcb_create("localhost", (struct sockaddr *) &sin, 233 - IPPROTO_UDP, 2, 1); 237 + XPRT_TRANSPORT_UDP, 2, 1); 234 238 if (IS_ERR(rpcb_clnt)) 235 239 return PTR_ERR(rpcb_clnt); 236 240 ··· 312 316 struct rpc_task *child; 313 317 struct sockaddr addr; 314 318 int status; 319 + struct rpcb_info *info; 315 320 316 321 dprintk("RPC: %5u %s(%s, %u, %u, %d)\n", 317 322 task->tk_pid, __FUNCTION__, ··· 322 325 BUG_ON(clnt->cl_parent != clnt); 323 326 324 327 if (xprt_test_and_set_binding(xprt)) { 325 - status = -EACCES; /* tell caller to check again */ 328 + status = -EAGAIN; /* tell caller to check again */ 326 329 dprintk("RPC: %5u %s: waiting for another binder\n", 327 330 task->tk_pid, __FUNCTION__); 328 331 goto bailout_nowake; ··· 340 343 goto bailout_nofree; 341 344 } 342 345 343 - if (rpcb_next_version[xprt->bind_index].rpc_proc == NULL) { 346 + rpc_peeraddr(clnt, (void *)&addr, sizeof(addr)); 347 + 348 + /* Don't ever use rpcbind v2 for 
AF_INET6 requests */ 349 + switch (addr.sa_family) { 350 + case AF_INET: 351 + info = rpcb_next_version; 352 + break; 353 + case AF_INET6: 354 + info = rpcb_next_version6; 355 + break; 356 + default: 357 + status = -EAFNOSUPPORT; 358 + dprintk("RPC: %5u %s: bad address family\n", 359 + task->tk_pid, __FUNCTION__); 360 + goto bailout_nofree; 361 + } 362 + if (info[xprt->bind_index].rpc_proc == NULL) { 344 363 xprt->bind_index = 0; 345 - status = -EACCES; /* tell caller to try again later */ 364 + status = -EPFNOSUPPORT; 346 365 dprintk("RPC: %5u %s: no more getport versions available\n", 347 366 task->tk_pid, __FUNCTION__); 348 367 goto bailout_nofree; 349 368 } 350 - bind_version = rpcb_next_version[xprt->bind_index].rpc_vers; 369 + bind_version = info[xprt->bind_index].rpc_vers; 351 370 352 371 dprintk("RPC: %5u %s: trying rpcbind version %u\n", 353 372 task->tk_pid, __FUNCTION__, bind_version); 373 + 374 + rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, 375 + bind_version, 0); 376 + if (IS_ERR(rpcb_clnt)) { 377 + status = PTR_ERR(rpcb_clnt); 378 + dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n", 379 + task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt)); 380 + goto bailout_nofree; 381 + } 354 382 355 383 map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC); 356 384 if (!map) { ··· 389 367 map->r_prot = xprt->prot; 390 368 map->r_port = 0; 391 369 map->r_xprt = xprt_get(xprt); 392 - map->r_netid = (xprt->prot == IPPROTO_TCP) ? RPCB_NETID_TCP : 393 - RPCB_NETID_UDP; 394 - memcpy(&map->r_addr, rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR), 395 - sizeof(map->r_addr)); 370 + map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID); 371 + memcpy(map->r_addr, 372 + rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR), 373 + sizeof(map->r_addr)); 396 374 map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */ 397 - 398 - rpc_peeraddr(clnt, (void *)&addr, sizeof(addr)); 399 - rpcb_clnt = rpcb_create(clnt->cl_server, &addr, xprt->prot, bind_version, 0); 400 - if (IS_ERR(rpcb_clnt)) { 401 - status = PTR_ERR(rpcb_clnt); 402 - dprintk("RPC: %5u %s: rpcb_create failed, error %ld\n", 403 - task->tk_pid, __FUNCTION__, PTR_ERR(rpcb_clnt)); 404 - goto bailout; 405 - } 406 375 407 376 child = rpc_run_task(rpcb_clnt, RPC_TASK_ASYNC, &rpcb_getport_ops, map); 408 377 rpc_release_client(rpcb_clnt); ··· 401 388 status = -EIO; 402 389 dprintk("RPC: %5u %s: rpc_run_task failed\n", 403 390 task->tk_pid, __FUNCTION__); 404 - goto bailout_nofree; 391 + goto bailout; 405 392 } 406 393 rpc_put_task(child); 407 394 ··· 416 403 bailout_nowake: 417 404 task->tk_status = status; 418 405 } 406 + EXPORT_SYMBOL_GPL(rpcb_getport_async); 419 407 420 408 /* 421 409 * Rpcbind child task calls this callback via tk_exit. 
··· 426 412 struct rpcbind_args *map = data; 427 413 struct rpc_xprt *xprt = map->r_xprt; 428 414 int status = child->tk_status; 415 + 416 + /* Garbage reply: retry with a lesser rpcbind version */ 417 + if (status == -EIO) 418 + status = -EPROTONOSUPPORT; 429 419 430 420 /* rpcbind server doesn't support this rpcbind protocol version */ 431 421 if (status == -EPROTONOSUPPORT) ··· 508 490 unsigned short *portp) 509 491 { 510 492 char *addr; 511 - int addr_len, c, i, f, first, val; 493 + u32 addr_len; 494 + int c, i, f, first, val; 512 495 513 496 *portp = 0; 514 - addr_len = (unsigned int) ntohl(*p++); 515 - if (addr_len > RPCB_MAXADDRLEN) /* sanity */ 516 - return -EINVAL; 497 + addr_len = ntohl(*p++); 517 498 518 - dprintk("RPC: rpcb_decode_getaddr returned string: '%s'\n", 519 - (char *) p); 499 + /* 500 + * Simple sanity check. The smallest possible universal 501 + * address is an IPv4 address string containing 11 bytes. 502 + */ 503 + if (addr_len < 11 || addr_len > RPCB_MAXADDRLEN) 504 + goto out_err; 520 505 506 + /* 507 + * Start at the end and walk backwards until the first dot 508 + * is encountered. When the second dot is found, we have 509 + * both parts of the port number. 510 + */ 521 511 addr = (char *)p; 522 512 val = 0; 523 513 first = 1; ··· 547 521 } 548 522 } 549 523 524 + /* 525 + * Simple sanity check. If we never saw a dot in the reply, 526 + * then this was probably just garbage. 527 + */ 528 + if (first) 529 + goto out_err; 530 + 550 531 dprintk("RPC: rpcb_decode_getaddr port=%u\n", *portp); 551 532 return 0; 533 + 534 + out_err: 535 + dprintk("RPC: rpcbind server returned malformed reply\n"); 536 + return -EIO; 552 537 } 553 538 554 539 #define RPCB_program_sz (1u) ··· 568 531 #define RPCB_port_sz (1u) 569 532 #define RPCB_boolean_sz (1u) 570 533 571 - #define RPCB_netid_sz (1+XDR_QUADLEN(RPCB_MAXNETIDLEN)) 534 + #define RPCB_netid_sz (1+XDR_QUADLEN(RPCBIND_MAXNETIDLEN)) 572 535 #define RPCB_addr_sz (1+XDR_QUADLEN(RPCB_MAXADDRLEN)) 573 536 #define RPCB_ownerstring_sz (1+XDR_QUADLEN(RPCB_MAXOWNERLEN)) 574 537 ··· 630 593 { 0, NULL }, 631 594 }; 632 595 596 + static struct rpcb_info rpcb_next_version6[] = { 597 + #ifdef CONFIG_SUNRPC_BIND34 598 + { 4, &rpcb_procedures4[RPCBPROC_GETVERSADDR] }, 599 + { 3, &rpcb_procedures3[RPCBPROC_GETADDR] }, 600 + #endif 601 + { 0, NULL }, 602 + }; 603 + 633 604 static struct rpc_version rpcb_version2 = { 634 605 .number = 2, 635 606 .nrprocs = RPCB_HIGHPROC_2, ··· 666 621 667 622 static struct rpc_stat rpcb_stats; 668 623 669 - struct rpc_program rpcb_program = { 624 + static struct rpc_program rpcb_program = { 670 625 .name = "rpcbind", 671 626 .number = RPCBIND_PROGRAM, 672 627 .nrvers = ARRAY_SIZE(rpcb_version),
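For illustration, the universal address parsed by rpcb_decode_getaddr() encodes the port as two extra dot-separated octets appended to the address (for IPv4: "h1.h2.h3.h4.p1.p2"), so the port is p1 * 256 + p2; for example "192.0.2.1.8.1" names port 2049. A userspace-style sketch of the same arithmetic, assuming a well-formed IPv4 universal address (illustrative only, not the kernel parser):

#include <stdio.h>

/* Recover the port from an IPv4 universal address such as
 * "192.0.2.1.8.1" (8 * 256 + 1 = 2049). Returns -1 on parse failure. */
static int example_uaddr_to_port(const char *uaddr)
{
        unsigned int h1, h2, h3, h4, p_hi, p_lo;

        if (sscanf(uaddr, "%u.%u.%u.%u.%u.%u",
                   &h1, &h2, &h3, &h4, &p_hi, &p_lo) != 6)
                return -1;
        return p_hi * 256 + p_lo;
}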
+2
net/sunrpc/sched.c
··· 777 777 task->tk_pid, size, buf); 778 778 return &buf->data; 779 779 } 780 + EXPORT_SYMBOL_GPL(rpc_malloc); 780 781 781 782 /** 782 783 * rpc_free - free buffer allocated via rpc_malloc ··· 803 802 else 804 803 kfree(buf); 805 804 } 805 + EXPORT_SYMBOL_GPL(rpc_free); 806 806 807 807 /* 808 808 * Creation and deletion of RPC task structures
+3
net/sunrpc/socklib.c
··· 34 34 desc->offset += len; 35 35 return len; 36 36 } 37 + EXPORT_SYMBOL_GPL(xdr_skb_read_bits); 37 38 38 39 /** 39 40 * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer ··· 138 137 out: 139 138 return copied; 140 139 } 140 + EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb); 141 141 142 142 /** 143 143 * csum_partial_copy_to_xdr - checksum and copy data ··· 181 179 return -1; 182 180 return 0; 183 181 } 182 + EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
+1 -1
net/sunrpc/sunrpc_syms.c
··· 20 20 #include <linux/sunrpc/auth.h> 21 21 #include <linux/workqueue.h> 22 22 #include <linux/sunrpc/rpc_pipe_fs.h> 23 - 23 + #include <linux/sunrpc/xprtsock.h> 24 24 25 25 /* RPC scheduler */ 26 26 EXPORT_SYMBOL(rpc_execute);
+4
net/sunrpc/timer.c
··· 17 17 18 18 #include <linux/types.h> 19 19 #include <linux/unistd.h> 20 + #include <linux/module.h> 20 21 21 22 #include <linux/sunrpc/clnt.h> 22 23 ··· 41 40 rt->ntimeouts[i] = 0; 42 41 } 43 42 } 43 + EXPORT_SYMBOL_GPL(rpc_init_rtt); 44 44 45 45 /* 46 46 * NB: When computing the smoothed RTT and standard deviation, ··· 77 75 if (*sdrtt < RPC_RTO_MIN) 78 76 *sdrtt = RPC_RTO_MIN; 79 77 } 78 + EXPORT_SYMBOL_GPL(rpc_update_rtt); 80 79 81 80 /* 82 81 * Estimate rto for an nfs rpc sent via. an unreliable datagram. ··· 106 103 107 104 return res; 108 105 } 106 + EXPORT_SYMBOL_GPL(rpc_calc_rto);
+104 -12
net/sunrpc/xprt.c
··· 62 62 static void xprt_connect_status(struct rpc_task *task); 63 63 static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); 64 64 65 + static spinlock_t xprt_list_lock = SPIN_LOCK_UNLOCKED; 66 + static LIST_HEAD(xprt_list); 67 + 65 68 /* 66 69 * The transport code maintains an estimate on the maximum number of out- 67 70 * standing RPC requests, using a smoothed version of the congestion ··· 82 79 #define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) 83 80 84 81 #define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) 82 + 83 + /** 84 + * xprt_register_transport - register a transport implementation 85 + * @transport: transport to register 86 + * 87 + * If a transport implementation is loaded as a kernel module, it can 88 + * call this interface to make itself known to the RPC client. 89 + * 90 + * Returns: 91 + * 0: transport successfully registered 92 + * -EEXIST: transport already registered 93 + * -EINVAL: transport module being unloaded 94 + */ 95 + int xprt_register_transport(struct xprt_class *transport) 96 + { 97 + struct xprt_class *t; 98 + int result; 99 + 100 + result = -EEXIST; 101 + spin_lock(&xprt_list_lock); 102 + list_for_each_entry(t, &xprt_list, list) { 103 + /* don't register the same transport class twice */ 104 + if (t->ident == transport->ident) 105 + goto out; 106 + } 107 + 108 + result = -EINVAL; 109 + if (try_module_get(THIS_MODULE)) { 110 + list_add_tail(&transport->list, &xprt_list); 111 + printk(KERN_INFO "RPC: Registered %s transport module.\n", 112 + transport->name); 113 + result = 0; 114 + } 115 + 116 + out: 117 + spin_unlock(&xprt_list_lock); 118 + return result; 119 + } 120 + EXPORT_SYMBOL_GPL(xprt_register_transport); 121 + 122 + /** 123 + * xprt_unregister_transport - unregister a transport implementation 124 + * transport: transport to unregister 125 + * 126 + * Returns: 127 + * 0: transport successfully unregistered 128 + * -ENOENT: transport never registered 129 + */ 130 + int xprt_unregister_transport(struct xprt_class *transport) 131 + { 132 + struct xprt_class *t; 133 + int result; 134 + 135 + result = 0; 136 + spin_lock(&xprt_list_lock); 137 + list_for_each_entry(t, &xprt_list, list) { 138 + if (t == transport) { 139 + printk(KERN_INFO 140 + "RPC: Unregistered %s transport module.\n", 141 + transport->name); 142 + list_del_init(&transport->list); 143 + module_put(THIS_MODULE); 144 + goto out; 145 + } 146 + } 147 + result = -ENOENT; 148 + 149 + out: 150 + spin_unlock(&xprt_list_lock); 151 + return result; 152 + } 153 + EXPORT_SYMBOL_GPL(xprt_unregister_transport); 85 154 86 155 /** 87 156 * xprt_reserve_xprt - serialize write access to transports ··· 193 118 rpc_sleep_on(&xprt->sending, task, NULL, NULL); 194 119 return 0; 195 120 } 121 + EXPORT_SYMBOL_GPL(xprt_reserve_xprt); 196 122 197 123 static void xprt_clear_locked(struct rpc_xprt *xprt) 198 124 { ··· 243 167 rpc_sleep_on(&xprt->sending, task, NULL, NULL); 244 168 return 0; 245 169 } 170 + EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); 246 171 247 172 static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) 248 173 { ··· 323 246 __xprt_lock_write_next(xprt); 324 247 } 325 248 } 249 + EXPORT_SYMBOL_GPL(xprt_release_xprt); 326 250 327 251 /** 328 252 * xprt_release_xprt_cong - allow other requests to use a transport ··· 340 262 __xprt_lock_write_next_cong(xprt); 341 263 } 342 264 } 265 + EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); 343 266 344 267 static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) 345 268 { ··· 393 
314 { 394 315 __xprt_put_cong(task->tk_xprt, task->tk_rqstp); 395 316 } 317 + EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); 396 318 397 319 /** 398 320 * xprt_adjust_cwnd - adjust transport congestion window ··· 425 345 xprt->cwnd = cwnd; 426 346 __xprt_put_cong(xprt, req); 427 347 } 348 + EXPORT_SYMBOL_GPL(xprt_adjust_cwnd); 428 349 429 350 /** 430 351 * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue ··· 440 359 else 441 360 rpc_wake_up(&xprt->pending); 442 361 } 362 + EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); 443 363 444 364 /** 445 365 * xprt_wait_for_buffer_space - wait for transport output buffer to clear ··· 455 373 task->tk_timeout = req->rq_timeout; 456 374 rpc_sleep_on(&xprt->pending, task, NULL, NULL); 457 375 } 376 + EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); 458 377 459 378 /** 460 379 * xprt_write_space - wake the task waiting for transport output buffer space ··· 476 393 } 477 394 spin_unlock_bh(&xprt->transport_lock); 478 395 } 396 + EXPORT_SYMBOL_GPL(xprt_write_space); 479 397 480 398 /** 481 399 * xprt_set_retrans_timeout_def - set a request's retransmit timeout ··· 490 406 { 491 407 task->tk_timeout = task->tk_rqstp->rq_timeout; 492 408 } 409 + EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_def); 493 410 494 411 /* 495 412 * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout ··· 510 425 if (task->tk_timeout > max_timeout || task->tk_timeout == 0) 511 426 task->tk_timeout = max_timeout; 512 427 } 428 + EXPORT_SYMBOL_GPL(xprt_set_retrans_timeout_rtt); 513 429 514 430 static void xprt_reset_majortimeo(struct rpc_rqst *req) 515 431 { ··· 586 500 xprt_wake_pending_tasks(xprt, -ENOTCONN); 587 501 spin_unlock_bh(&xprt->transport_lock); 588 502 } 503 + EXPORT_SYMBOL_GPL(xprt_disconnect); 589 504 590 505 static void 591 506 xprt_init_autodisconnect(unsigned long data) ··· 697 610 xprt->stat.bad_xids++; 698 611 return NULL; 699 612 } 613 + EXPORT_SYMBOL_GPL(xprt_lookup_rqst); 700 614 701 615 /** 702 616 * xprt_update_rtt - update an RPC client's RTT state after receiving a reply ··· 717 629 rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); 718 630 } 719 631 } 632 + EXPORT_SYMBOL_GPL(xprt_update_rtt); 720 633 721 634 /** 722 635 * xprt_complete_rqst - called when reply processing is complete ··· 742 653 req->rq_received = req->rq_private_buf.len = copied; 743 654 rpc_wake_up_task(task); 744 655 } 656 + EXPORT_SYMBOL_GPL(xprt_complete_rqst); 745 657 746 658 static void xprt_timer(struct rpc_task *task) 747 659 { ··· 979 889 * @args: rpc transport creation arguments 980 890 * 981 891 */ 982 - struct rpc_xprt *xprt_create_transport(struct rpc_xprtsock_create *args) 892 + struct rpc_xprt *xprt_create_transport(struct xprt_create *args) 983 893 { 984 894 struct rpc_xprt *xprt; 985 895 struct rpc_rqst *req; 896 + struct xprt_class *t; 986 897 987 - switch (args->proto) { 988 - case IPPROTO_UDP: 989 - xprt = xs_setup_udp(args); 990 - break; 991 - case IPPROTO_TCP: 992 - xprt = xs_setup_tcp(args); 993 - break; 994 - default: 995 - printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n", 996 - args->proto); 997 - return ERR_PTR(-EIO); 898 + spin_lock(&xprt_list_lock); 899 + list_for_each_entry(t, &xprt_list, list) { 900 + if (t->ident == args->ident) { 901 + spin_unlock(&xprt_list_lock); 902 + goto found; 903 + } 998 904 } 905 + spin_unlock(&xprt_list_lock); 906 + printk(KERN_ERR "RPC: transport (%d) not supported\n", args->ident); 907 + return ERR_PTR(-EIO); 908 + 909 + found: 910 + xprt = t->setup(args); 999 911 if (IS_ERR(xprt)) { 1000 912 
dprintk("RPC: xprt_create_transport: failed, %ld\n", 1001 913 -PTR_ERR(xprt));
+3
net/sunrpc/xprtrdma/Makefile
··· 1 + obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o 2 + 3 + xprtrdma-y := transport.o rpc_rdma.o verbs.o
+868
net/sunrpc/xprtrdma/rpc_rdma.c
··· 1 + /* 2 + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + */ 39 + 40 + /* 41 + * rpc_rdma.c 42 + * 43 + * This file contains the guts of the RPC RDMA protocol, and 44 + * does marshaling/unmarshaling, etc. It is also where interfacing 45 + * to the Linux RPC framework lives. 46 + */ 47 + 48 + #include "xprt_rdma.h" 49 + 50 + #include <linux/highmem.h> 51 + 52 + #ifdef RPC_DEBUG 53 + # define RPCDBG_FACILITY RPCDBG_TRANS 54 + #endif 55 + 56 + enum rpcrdma_chunktype { 57 + rpcrdma_noch = 0, 58 + rpcrdma_readch, 59 + rpcrdma_areadch, 60 + rpcrdma_writech, 61 + rpcrdma_replych 62 + }; 63 + 64 + #ifdef RPC_DEBUG 65 + static const char transfertypes[][12] = { 66 + "pure inline", /* no chunks */ 67 + " read chunk", /* some argument via rdma read */ 68 + "*read chunk", /* entire request via rdma read */ 69 + "write chunk", /* some result via rdma write */ 70 + "reply chunk" /* entire reply via rdma write */ 71 + }; 72 + #endif 73 + 74 + /* 75 + * Chunk assembly from upper layer xdr_buf. 76 + * 77 + * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk 78 + * elements. Segments are then coalesced when registered, if possible 79 + * within the selected memreg mode. 80 + * 81 + * Note, this routine is never called if the connection's memory 82 + * registration strategy is 0 (bounce buffers). 
83 + */ 84 + 85 + static int 86 + rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, int pos, 87 + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) 88 + { 89 + int len, n = 0, p; 90 + 91 + if (pos == 0 && xdrbuf->head[0].iov_len) { 92 + seg[n].mr_page = NULL; 93 + seg[n].mr_offset = xdrbuf->head[0].iov_base; 94 + seg[n].mr_len = xdrbuf->head[0].iov_len; 95 + pos += xdrbuf->head[0].iov_len; 96 + ++n; 97 + } 98 + 99 + if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) { 100 + if (n == nsegs) 101 + return 0; 102 + seg[n].mr_page = xdrbuf->pages[0]; 103 + seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base; 104 + seg[n].mr_len = min_t(u32, 105 + PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len); 106 + len = xdrbuf->page_len - seg[n].mr_len; 107 + pos += len; 108 + ++n; 109 + p = 1; 110 + while (len > 0) { 111 + if (n == nsegs) 112 + return 0; 113 + seg[n].mr_page = xdrbuf->pages[p]; 114 + seg[n].mr_offset = NULL; 115 + seg[n].mr_len = min_t(u32, PAGE_SIZE, len); 116 + len -= seg[n].mr_len; 117 + ++n; 118 + ++p; 119 + } 120 + } 121 + 122 + if (pos < xdrbuf->len && xdrbuf->tail[0].iov_len) { 123 + if (n == nsegs) 124 + return 0; 125 + seg[n].mr_page = NULL; 126 + seg[n].mr_offset = xdrbuf->tail[0].iov_base; 127 + seg[n].mr_len = xdrbuf->tail[0].iov_len; 128 + pos += xdrbuf->tail[0].iov_len; 129 + ++n; 130 + } 131 + 132 + if (pos < xdrbuf->len) 133 + dprintk("RPC: %s: marshaled only %d of %d\n", 134 + __func__, pos, xdrbuf->len); 135 + 136 + return n; 137 + } 138 + 139 + /* 140 + * Create read/write chunk lists, and reply chunks, for RDMA 141 + * 142 + * Assume check against THRESHOLD has been done, and chunks are required. 143 + * Assume only encoding one list entry for read|write chunks. The NFSv3 144 + * protocol is simple enough to allow this as it only has a single "bulk 145 + * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The 146 + * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.) 147 + * 148 + * When used for a single reply chunk (which is a special write 149 + * chunk used for the entire reply, rather than just the data), it 150 + * is used primarily for READDIR and READLINK which would otherwise 151 + * be severely size-limited by a small rdma inline read max. The server 152 + * response will come back as an RDMA Write, followed by a message 153 + * of type RDMA_NOMSG carrying the xid and length. As a result, reply 154 + * chunks do not provide data alignment, however they do not require 155 + * "fixup" (moving the response to the upper layer buffer) either. 156 + * 157 + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 158 + * 159 + * Read chunklist (a linked list): 160 + * N elements, position P (same P for all chunks of same arg!): 161 + * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 162 + * 163 + * Write chunklist (a list of (one) counted array): 164 + * N elements: 165 + * 1 - N - HLOO - HLOO - ... - HLOO - 0 166 + * 167 + * Reply chunk (a counted array): 168 + * N elements: 169 + * 1 - N - HLOO - HLOO - ... 
- HLOO 170 + */ 171 + 172 + static unsigned int 173 + rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, 174 + struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) 175 + { 176 + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 177 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_task->tk_xprt); 178 + int nsegs, nchunks = 0; 179 + int pos; 180 + struct rpcrdma_mr_seg *seg = req->rl_segments; 181 + struct rpcrdma_read_chunk *cur_rchunk = NULL; 182 + struct rpcrdma_write_array *warray = NULL; 183 + struct rpcrdma_write_chunk *cur_wchunk = NULL; 184 + u32 *iptr = headerp->rm_body.rm_chunks; 185 + 186 + if (type == rpcrdma_readch || type == rpcrdma_areadch) { 187 + /* a read chunk - server will RDMA Read our memory */ 188 + cur_rchunk = (struct rpcrdma_read_chunk *) iptr; 189 + } else { 190 + /* a write or reply chunk - server will RDMA Write our memory */ 191 + *iptr++ = xdr_zero; /* encode a NULL read chunk list */ 192 + if (type == rpcrdma_replych) 193 + *iptr++ = xdr_zero; /* a NULL write chunk list */ 194 + warray = (struct rpcrdma_write_array *) iptr; 195 + cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1); 196 + } 197 + 198 + if (type == rpcrdma_replych || type == rpcrdma_areadch) 199 + pos = 0; 200 + else 201 + pos = target->head[0].iov_len; 202 + 203 + nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); 204 + if (nsegs == 0) 205 + return 0; 206 + 207 + do { 208 + /* bind/register the memory, then build chunk from result. */ 209 + int n = rpcrdma_register_external(seg, nsegs, 210 + cur_wchunk != NULL, r_xprt); 211 + if (n <= 0) 212 + goto out; 213 + if (cur_rchunk) { /* read */ 214 + cur_rchunk->rc_discrim = xdr_one; 215 + /* all read chunks have the same "position" */ 216 + cur_rchunk->rc_position = htonl(pos); 217 + cur_rchunk->rc_target.rs_handle = htonl(seg->mr_rkey); 218 + cur_rchunk->rc_target.rs_length = htonl(seg->mr_len); 219 + xdr_encode_hyper( 220 + (u32 *)&cur_rchunk->rc_target.rs_offset, 221 + seg->mr_base); 222 + dprintk("RPC: %s: read chunk " 223 + "elem %d@0x%llx:0x%x pos %d (%s)\n", __func__, 224 + seg->mr_len, seg->mr_base, seg->mr_rkey, pos, 225 + n < nsegs ? "more" : "last"); 226 + cur_rchunk++; 227 + r_xprt->rx_stats.read_chunk_count++; 228 + } else { /* write/reply */ 229 + cur_wchunk->wc_target.rs_handle = htonl(seg->mr_rkey); 230 + cur_wchunk->wc_target.rs_length = htonl(seg->mr_len); 231 + xdr_encode_hyper( 232 + (u32 *)&cur_wchunk->wc_target.rs_offset, 233 + seg->mr_base); 234 + dprintk("RPC: %s: %s chunk " 235 + "elem %d@0x%llx:0x%x (%s)\n", __func__, 236 + (type == rpcrdma_replych) ? "reply" : "write", 237 + seg->mr_len, seg->mr_base, seg->mr_rkey, 238 + n < nsegs ? "more" : "last"); 239 + cur_wchunk++; 240 + if (type == rpcrdma_replych) 241 + r_xprt->rx_stats.reply_chunk_count++; 242 + else 243 + r_xprt->rx_stats.write_chunk_count++; 244 + r_xprt->rx_stats.total_rdma_request += seg->mr_len; 245 + } 246 + nchunks++; 247 + seg += n; 248 + nsegs -= n; 249 + } while (nsegs); 250 + 251 + /* success. all failures return above */ 252 + req->rl_nchunks = nchunks; 253 + 254 + BUG_ON(nchunks == 0); 255 + 256 + /* 257 + * finish off header. If write, marshal discrim and nchunks. 
258 + */ 259 + if (cur_rchunk) { 260 + iptr = (u32 *) cur_rchunk; 261 + *iptr++ = xdr_zero; /* finish the read chunk list */ 262 + *iptr++ = xdr_zero; /* encode a NULL write chunk list */ 263 + *iptr++ = xdr_zero; /* encode a NULL reply chunk */ 264 + } else { 265 + warray->wc_discrim = xdr_one; 266 + warray->wc_nchunks = htonl(nchunks); 267 + iptr = (u32 *) cur_wchunk; 268 + if (type == rpcrdma_writech) { 269 + *iptr++ = xdr_zero; /* finish the write chunk list */ 270 + *iptr++ = xdr_zero; /* encode a NULL reply chunk */ 271 + } 272 + } 273 + 274 + /* 275 + * Return header size. 276 + */ 277 + return (unsigned char *)iptr - (unsigned char *)headerp; 278 + 279 + out: 280 + for (pos = 0; nchunks--;) 281 + pos += rpcrdma_deregister_external( 282 + &req->rl_segments[pos], r_xprt, NULL); 283 + return 0; 284 + } 285 + 286 + /* 287 + * Copy write data inline. 288 + * This function is used for "small" requests. Data which is passed 289 + * to RPC via iovecs (or page list) is copied directly into the 290 + * pre-registered memory buffer for this request. For small amounts 291 + * of data, this is efficient. The cutoff value is tunable. 292 + */ 293 + static int 294 + rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) 295 + { 296 + int i, npages, curlen; 297 + int copy_len; 298 + unsigned char *srcp, *destp; 299 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 300 + 301 + destp = rqst->rq_svec[0].iov_base; 302 + curlen = rqst->rq_svec[0].iov_len; 303 + destp += curlen; 304 + /* 305 + * Do optional padding where it makes sense. Alignment of write 306 + * payload can help the server, if our setting is accurate. 307 + */ 308 + pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/); 309 + if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH) 310 + pad = 0; /* don't pad this request */ 311 + 312 + dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n", 313 + __func__, pad, destp, rqst->rq_slen, curlen); 314 + 315 + copy_len = rqst->rq_snd_buf.page_len; 316 + r_xprt->rx_stats.pullup_copy_count += copy_len; 317 + npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; 318 + for (i = 0; copy_len && i < npages; i++) { 319 + if (i == 0) 320 + curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base; 321 + else 322 + curlen = PAGE_SIZE; 323 + if (curlen > copy_len) 324 + curlen = copy_len; 325 + dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", 326 + __func__, i, destp, copy_len, curlen); 327 + srcp = kmap_atomic(rqst->rq_snd_buf.pages[i], 328 + KM_SKB_SUNRPC_DATA); 329 + if (i == 0) 330 + memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen); 331 + else 332 + memcpy(destp, srcp, curlen); 333 + kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA); 334 + rqst->rq_svec[0].iov_len += curlen; 335 + destp += curlen; 336 + copy_len -= curlen; 337 + } 338 + if (rqst->rq_snd_buf.tail[0].iov_len) { 339 + curlen = rqst->rq_snd_buf.tail[0].iov_len; 340 + if (destp != rqst->rq_snd_buf.tail[0].iov_base) { 341 + memcpy(destp, 342 + rqst->rq_snd_buf.tail[0].iov_base, curlen); 343 + r_xprt->rx_stats.pullup_copy_count += curlen; 344 + } 345 + dprintk("RPC: %s: tail destp 0x%p len %d curlen %d\n", 346 + __func__, destp, copy_len, curlen); 347 + rqst->rq_svec[0].iov_len += curlen; 348 + } 349 + /* header now contains entire send message */ 350 + return pad; 351 + } 352 + 353 + /* 354 + * Marshal a request: the primary job of this routine is to choose 355 + * the transfer modes. See comments below. 
356 + * 357 + * Uses multiple RDMA IOVs for a request: 358 + * [0] -- RPC RDMA header, which uses memory from the *start* of the 359 + * preregistered buffer that already holds the RPC data in 360 + * its middle. 361 + * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. 362 + * [2] -- optional padding. 363 + * [3] -- if padded, header only in [1] and data here. 364 + */ 365 + 366 + int 367 + rpcrdma_marshal_req(struct rpc_rqst *rqst) 368 + { 369 + struct rpc_xprt *xprt = rqst->rq_task->tk_xprt; 370 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 371 + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 372 + char *base; 373 + size_t hdrlen, rpclen, padlen; 374 + enum rpcrdma_chunktype rtype, wtype; 375 + struct rpcrdma_msg *headerp; 376 + 377 + /* 378 + * rpclen gets amount of data in first buffer, which is the 379 + * pre-registered buffer. 380 + */ 381 + base = rqst->rq_svec[0].iov_base; 382 + rpclen = rqst->rq_svec[0].iov_len; 383 + 384 + /* build RDMA header in private area at front */ 385 + headerp = (struct rpcrdma_msg *) req->rl_base; 386 + /* don't htonl XID, it's already done in request */ 387 + headerp->rm_xid = rqst->rq_xid; 388 + headerp->rm_vers = xdr_one; 389 + headerp->rm_credit = htonl(r_xprt->rx_buf.rb_max_requests); 390 + headerp->rm_type = __constant_htonl(RDMA_MSG); 391 + 392 + /* 393 + * Chunks needed for results? 394 + * 395 + * o If the expected result is under the inline threshold, all ops 396 + * return as inline (but see later). 397 + * o Large non-read ops return as a single reply chunk. 398 + * o Large read ops return data as write chunk(s), header as inline. 399 + * 400 + * Note: the NFS code sending down multiple result segments implies 401 + * the op is one of read, readdir[plus], readlink or NFSv4 getacl. 402 + */ 403 + 404 + /* 405 + * This code can handle read chunks, write chunks OR reply 406 + * chunks -- only one type. If the request is too big to fit 407 + * inline, then we will choose read chunks. If the request is 408 + * a READ, then use write chunks to separate the file data 409 + * into pages; otherwise use reply chunks. 410 + */ 411 + if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) 412 + wtype = rpcrdma_noch; 413 + else if (rqst->rq_rcv_buf.page_len == 0) 414 + wtype = rpcrdma_replych; 415 + else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 416 + wtype = rpcrdma_writech; 417 + else 418 + wtype = rpcrdma_replych; 419 + 420 + /* 421 + * Chunks needed for arguments? 422 + * 423 + * o If the total request is under the inline threshold, all ops 424 + * are sent as inline. 425 + * o Large non-write ops are sent with the entire message as a 426 + * single read chunk (protocol 0-position special case). 427 + * o Large write ops transmit data as read chunk(s), header as 428 + * inline. 429 + * 430 + * Note: the NFS code sending down multiple argument segments 431 + * implies the op is a write. 
432 + * TBD check NFSv4 setacl 433 + */ 434 + if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) 435 + rtype = rpcrdma_noch; 436 + else if (rqst->rq_snd_buf.page_len == 0) 437 + rtype = rpcrdma_areadch; 438 + else 439 + rtype = rpcrdma_readch; 440 + 441 + /* The following simplification is not true forever */ 442 + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) 443 + wtype = rpcrdma_noch; 444 + BUG_ON(rtype != rpcrdma_noch && wtype != rpcrdma_noch); 445 + 446 + if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS && 447 + (rtype != rpcrdma_noch || wtype != rpcrdma_noch)) { 448 + /* forced to "pure inline"? */ 449 + dprintk("RPC: %s: too much data (%d/%d) for inline\n", 450 + __func__, rqst->rq_rcv_buf.len, rqst->rq_snd_buf.len); 451 + return -1; 452 + } 453 + 454 + hdrlen = 28; /*sizeof *headerp;*/ 455 + padlen = 0; 456 + 457 + /* 458 + * Pull up any extra send data into the preregistered buffer. 459 + * When padding is in use and applies to the transfer, insert 460 + * it and change the message type. 461 + */ 462 + if (rtype == rpcrdma_noch) { 463 + 464 + padlen = rpcrdma_inline_pullup(rqst, 465 + RPCRDMA_INLINE_PAD_VALUE(rqst)); 466 + 467 + if (padlen) { 468 + headerp->rm_type = __constant_htonl(RDMA_MSGP); 469 + headerp->rm_body.rm_padded.rm_align = 470 + htonl(RPCRDMA_INLINE_PAD_VALUE(rqst)); 471 + headerp->rm_body.rm_padded.rm_thresh = 472 + __constant_htonl(RPCRDMA_INLINE_PAD_THRESH); 473 + headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero; 474 + headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; 475 + headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; 476 + hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ 477 + BUG_ON(wtype != rpcrdma_noch); 478 + 479 + } else { 480 + headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; 481 + headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; 482 + headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; 483 + /* new length after pullup */ 484 + rpclen = rqst->rq_svec[0].iov_len; 485 + /* 486 + * Currently we try to not actually use read inline. 487 + * Reply chunks have the desirable property that 488 + * they land, packed, directly in the target buffers 489 + * without headers, so they require no fixup. The 490 + * additional RDMA Write op sends the same amount 491 + * of data, streams on-the-wire and adds no overhead 492 + * on receive. Therefore, we request a reply chunk 493 + * for non-writes wherever feasible and efficient. 494 + */ 495 + if (wtype == rpcrdma_noch && 496 + r_xprt->rx_ia.ri_memreg_strategy > RPCRDMA_REGISTER) 497 + wtype = rpcrdma_replych; 498 + } 499 + } 500 + 501 + /* 502 + * Marshal chunks. This routine will return the header length 503 + * consumed by marshaling. 
504 + */ 505 + if (rtype != rpcrdma_noch) { 506 + hdrlen = rpcrdma_create_chunks(rqst, 507 + &rqst->rq_snd_buf, headerp, rtype); 508 + wtype = rtype; /* simplify dprintk */ 509 + 510 + } else if (wtype != rpcrdma_noch) { 511 + hdrlen = rpcrdma_create_chunks(rqst, 512 + &rqst->rq_rcv_buf, headerp, wtype); 513 + } 514 + 515 + if (hdrlen == 0) 516 + return -1; 517 + 518 + dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd\n" 519 + " headerp 0x%p base 0x%p lkey 0x%x\n", 520 + __func__, transfertypes[wtype], hdrlen, rpclen, padlen, 521 + headerp, base, req->rl_iov.lkey); 522 + 523 + /* 524 + * initialize send_iov's - normally only two: rdma chunk header and 525 + * single preregistered RPC header buffer, but if padding is present, 526 + * then use a preregistered (and zeroed) pad buffer between the RPC 527 + * header and any write data. In all non-rdma cases, any following 528 + * data has been copied into the RPC header buffer. 529 + */ 530 + req->rl_send_iov[0].addr = req->rl_iov.addr; 531 + req->rl_send_iov[0].length = hdrlen; 532 + req->rl_send_iov[0].lkey = req->rl_iov.lkey; 533 + 534 + req->rl_send_iov[1].addr = req->rl_iov.addr + (base - req->rl_base); 535 + req->rl_send_iov[1].length = rpclen; 536 + req->rl_send_iov[1].lkey = req->rl_iov.lkey; 537 + 538 + req->rl_niovs = 2; 539 + 540 + if (padlen) { 541 + struct rpcrdma_ep *ep = &r_xprt->rx_ep; 542 + 543 + req->rl_send_iov[2].addr = ep->rep_pad.addr; 544 + req->rl_send_iov[2].length = padlen; 545 + req->rl_send_iov[2].lkey = ep->rep_pad.lkey; 546 + 547 + req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen; 548 + req->rl_send_iov[3].length = rqst->rq_slen - rpclen; 549 + req->rl_send_iov[3].lkey = req->rl_iov.lkey; 550 + 551 + req->rl_niovs = 4; 552 + } 553 + 554 + return 0; 555 + } 556 + 557 + /* 558 + * Chase down a received write or reply chunklist to get length 559 + * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) 560 + */ 561 + static int 562 + rpcrdma_count_chunks(struct rpcrdma_rep *rep, int max, int wrchunk, u32 **iptrp) 563 + { 564 + unsigned int i, total_len; 565 + struct rpcrdma_write_chunk *cur_wchunk; 566 + 567 + i = ntohl(**iptrp); /* get array count */ 568 + if (i > max) 569 + return -1; 570 + cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); 571 + total_len = 0; 572 + while (i--) { 573 + struct rpcrdma_segment *seg = &cur_wchunk->wc_target; 574 + ifdebug(FACILITY) { 575 + u64 off; 576 + xdr_decode_hyper((u32 *)&seg->rs_offset, &off); 577 + dprintk("RPC: %s: chunk %d@0x%llx:0x%x\n", 578 + __func__, 579 + ntohl(seg->rs_length), 580 + off, 581 + ntohl(seg->rs_handle)); 582 + } 583 + total_len += ntohl(seg->rs_length); 584 + ++cur_wchunk; 585 + } 586 + /* check and adjust for properly terminated write chunk */ 587 + if (wrchunk) { 588 + u32 *w = (u32 *) cur_wchunk; 589 + if (*w++ != xdr_zero) 590 + return -1; 591 + cur_wchunk = (struct rpcrdma_write_chunk *) w; 592 + } 593 + if ((char *) cur_wchunk > rep->rr_base + rep->rr_len) 594 + return -1; 595 + 596 + *iptrp = (u32 *) cur_wchunk; 597 + return total_len; 598 + } 599 + 600 + /* 601 + * Scatter inline received data back into provided iov's. 
602 + */ 603 + static void 604 + rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len) 605 + { 606 + int i, npages, curlen, olen; 607 + char *destp; 608 + 609 + curlen = rqst->rq_rcv_buf.head[0].iov_len; 610 + if (curlen > copy_len) { /* write chunk header fixup */ 611 + curlen = copy_len; 612 + rqst->rq_rcv_buf.head[0].iov_len = curlen; 613 + } 614 + 615 + dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", 616 + __func__, srcp, copy_len, curlen); 617 + 618 + /* Shift pointer for first receive segment only */ 619 + rqst->rq_rcv_buf.head[0].iov_base = srcp; 620 + srcp += curlen; 621 + copy_len -= curlen; 622 + 623 + olen = copy_len; 624 + i = 0; 625 + rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; 626 + if (copy_len && rqst->rq_rcv_buf.page_len) { 627 + npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base + 628 + rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; 629 + for (; i < npages; i++) { 630 + if (i == 0) 631 + curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base; 632 + else 633 + curlen = PAGE_SIZE; 634 + if (curlen > copy_len) 635 + curlen = copy_len; 636 + dprintk("RPC: %s: page %d" 637 + " srcp 0x%p len %d curlen %d\n", 638 + __func__, i, srcp, copy_len, curlen); 639 + destp = kmap_atomic(rqst->rq_rcv_buf.pages[i], 640 + KM_SKB_SUNRPC_DATA); 641 + if (i == 0) 642 + memcpy(destp + rqst->rq_rcv_buf.page_base, 643 + srcp, curlen); 644 + else 645 + memcpy(destp, srcp, curlen); 646 + flush_dcache_page(rqst->rq_rcv_buf.pages[i]); 647 + kunmap_atomic(destp, KM_SKB_SUNRPC_DATA); 648 + srcp += curlen; 649 + copy_len -= curlen; 650 + if (copy_len == 0) 651 + break; 652 + } 653 + rqst->rq_rcv_buf.page_len = olen - copy_len; 654 + } else 655 + rqst->rq_rcv_buf.page_len = 0; 656 + 657 + if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { 658 + curlen = copy_len; 659 + if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) 660 + curlen = rqst->rq_rcv_buf.tail[0].iov_len; 661 + if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) 662 + memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); 663 + dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", 664 + __func__, srcp, copy_len, curlen); 665 + rqst->rq_rcv_buf.tail[0].iov_len = curlen; 666 + copy_len -= curlen; ++i; 667 + } else 668 + rqst->rq_rcv_buf.tail[0].iov_len = 0; 669 + 670 + if (copy_len) 671 + dprintk("RPC: %s: %d bytes in" 672 + " %d extra segments (%d lost)\n", 673 + __func__, olen, i, copy_len); 674 + 675 + /* TBD avoid a warning from call_decode() */ 676 + rqst->rq_private_buf = rqst->rq_rcv_buf; 677 + } 678 + 679 + /* 680 + * This function is called when an async event is posted to 681 + * the connection which changes the connection state. All it 682 + * does at this point is mark the connection up/down, the rpc 683 + * timers do the rest. 684 + */ 685 + void 686 + rpcrdma_conn_func(struct rpcrdma_ep *ep) 687 + { 688 + struct rpc_xprt *xprt = ep->rep_xprt; 689 + 690 + spin_lock_bh(&xprt->transport_lock); 691 + if (ep->rep_connected > 0) { 692 + if (!xprt_test_and_set_connected(xprt)) 693 + xprt_wake_pending_tasks(xprt, 0); 694 + } else { 695 + if (xprt_test_and_clear_connected(xprt)) 696 + xprt_wake_pending_tasks(xprt, ep->rep_connected); 697 + } 698 + spin_unlock_bh(&xprt->transport_lock); 699 + } 700 + 701 + /* 702 + * This function is called when memory window unbind which we are waiting 703 + * for completes. Just use rr_func (zeroed by upcall) to signal completion. 
704 + */ 705 + static void 706 + rpcrdma_unbind_func(struct rpcrdma_rep *rep) 707 + { 708 + wake_up(&rep->rr_unbind); 709 + } 710 + 711 + /* 712 + * Called as a tasklet to do req/reply match and complete a request 713 + * Errors must result in the RPC task either being awakened, or 714 + * allowed to timeout, to discover the errors at that time. 715 + */ 716 + void 717 + rpcrdma_reply_handler(struct rpcrdma_rep *rep) 718 + { 719 + struct rpcrdma_msg *headerp; 720 + struct rpcrdma_req *req; 721 + struct rpc_rqst *rqst; 722 + struct rpc_xprt *xprt = rep->rr_xprt; 723 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 724 + u32 *iptr; 725 + int i, rdmalen, status; 726 + 727 + /* Check status. If bad, signal disconnect and return rep to pool */ 728 + if (rep->rr_len == ~0U) { 729 + rpcrdma_recv_buffer_put(rep); 730 + if (r_xprt->rx_ep.rep_connected == 1) { 731 + r_xprt->rx_ep.rep_connected = -EIO; 732 + rpcrdma_conn_func(&r_xprt->rx_ep); 733 + } 734 + return; 735 + } 736 + if (rep->rr_len < 28) { 737 + dprintk("RPC: %s: short/invalid reply\n", __func__); 738 + goto repost; 739 + } 740 + headerp = (struct rpcrdma_msg *) rep->rr_base; 741 + if (headerp->rm_vers != xdr_one) { 742 + dprintk("RPC: %s: invalid version %d\n", 743 + __func__, ntohl(headerp->rm_vers)); 744 + goto repost; 745 + } 746 + 747 + /* Get XID and try for a match. */ 748 + spin_lock(&xprt->transport_lock); 749 + rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); 750 + if (rqst == NULL) { 751 + spin_unlock(&xprt->transport_lock); 752 + dprintk("RPC: %s: reply 0x%p failed " 753 + "to match any request xid 0x%08x len %d\n", 754 + __func__, rep, headerp->rm_xid, rep->rr_len); 755 + repost: 756 + r_xprt->rx_stats.bad_reply_count++; 757 + rep->rr_func = rpcrdma_reply_handler; 758 + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) 759 + rpcrdma_recv_buffer_put(rep); 760 + 761 + return; 762 + } 763 + 764 + /* get request object */ 765 + req = rpcr_to_rdmar(rqst); 766 + 767 + dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" 768 + " RPC request 0x%p xid 0x%08x\n", 769 + __func__, rep, req, rqst, headerp->rm_xid); 770 + 771 + BUG_ON(!req || req->rl_reply); 772 + 773 + /* from here on, the reply is no longer an orphan */ 774 + req->rl_reply = rep; 775 + 776 + /* check for expected message types */ 777 + /* The order of some of these tests is important. 
*/ 778 + switch (headerp->rm_type) { 779 + case __constant_htonl(RDMA_MSG): 780 + /* never expect read chunks */ 781 + /* never expect reply chunks (two ways to check) */ 782 + /* never expect write chunks without having offered RDMA */ 783 + if (headerp->rm_body.rm_chunks[0] != xdr_zero || 784 + (headerp->rm_body.rm_chunks[1] == xdr_zero && 785 + headerp->rm_body.rm_chunks[2] != xdr_zero) || 786 + (headerp->rm_body.rm_chunks[1] != xdr_zero && 787 + req->rl_nchunks == 0)) 788 + goto badheader; 789 + if (headerp->rm_body.rm_chunks[1] != xdr_zero) { 790 + /* count any expected write chunks in read reply */ 791 + /* start at write chunk array count */ 792 + iptr = &headerp->rm_body.rm_chunks[2]; 793 + rdmalen = rpcrdma_count_chunks(rep, 794 + req->rl_nchunks, 1, &iptr); 795 + /* check for validity, and no reply chunk after */ 796 + if (rdmalen < 0 || *iptr++ != xdr_zero) 797 + goto badheader; 798 + rep->rr_len -= 799 + ((unsigned char *)iptr - (unsigned char *)headerp); 800 + status = rep->rr_len + rdmalen; 801 + r_xprt->rx_stats.total_rdma_reply += rdmalen; 802 + } else { 803 + /* else ordinary inline */ 804 + iptr = (u32 *)((unsigned char *)headerp + 28); 805 + rep->rr_len -= 28; /*sizeof *headerp;*/ 806 + status = rep->rr_len; 807 + } 808 + /* Fix up the rpc results for upper layer */ 809 + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len); 810 + break; 811 + 812 + case __constant_htonl(RDMA_NOMSG): 813 + /* never expect read or write chunks, always reply chunks */ 814 + if (headerp->rm_body.rm_chunks[0] != xdr_zero || 815 + headerp->rm_body.rm_chunks[1] != xdr_zero || 816 + headerp->rm_body.rm_chunks[2] != xdr_one || 817 + req->rl_nchunks == 0) 818 + goto badheader; 819 + iptr = (u32 *)((unsigned char *)headerp + 28); 820 + rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); 821 + if (rdmalen < 0) 822 + goto badheader; 823 + r_xprt->rx_stats.total_rdma_reply += rdmalen; 824 + /* Reply chunk buffer already is the reply vector - no fixup. */ 825 + status = rdmalen; 826 + break; 827 + 828 + badheader: 829 + default: 830 + dprintk("%s: invalid rpcrdma reply header (type %d):" 831 + " chunks[012] == %d %d %d" 832 + " expected chunks <= %d\n", 833 + __func__, ntohl(headerp->rm_type), 834 + headerp->rm_body.rm_chunks[0], 835 + headerp->rm_body.rm_chunks[1], 836 + headerp->rm_body.rm_chunks[2], 837 + req->rl_nchunks); 838 + status = -EIO; 839 + r_xprt->rx_stats.bad_reply_count++; 840 + break; 841 + } 842 + 843 + /* If using mw bind, start the deregister process now. */ 844 + /* (Note: if mr_free(), cannot perform it here, in tasklet context) */ 845 + if (req->rl_nchunks) switch (r_xprt->rx_ia.ri_memreg_strategy) { 846 + case RPCRDMA_MEMWINDOWS: 847 + for (i = 0; req->rl_nchunks-- > 1;) 848 + i += rpcrdma_deregister_external( 849 + &req->rl_segments[i], r_xprt, NULL); 850 + /* Optionally wait (not here) for unbinds to complete */ 851 + rep->rr_func = rpcrdma_unbind_func; 852 + (void) rpcrdma_deregister_external(&req->rl_segments[i], 853 + r_xprt, rep); 854 + break; 855 + case RPCRDMA_MEMWINDOWS_ASYNC: 856 + for (i = 0; req->rl_nchunks--;) 857 + i += rpcrdma_deregister_external(&req->rl_segments[i], 858 + r_xprt, NULL); 859 + break; 860 + default: 861 + break; 862 + } 863 + 864 + dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", 865 + __func__, xprt, rqst, status); 866 + xprt_complete_rqst(rqst->rq_task, status); 867 + spin_unlock(&xprt->transport_lock); 868 + }
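[Annotation] The comment block above rpcrdma_create_chunks() defines the wire layout for single-list chunks (HLOO = Handle32 Length32 Offset64). As a hedged illustration of that encoding only — a userspace sketch with made-up handle/length/offset values, not the kernel routine itself — packing one read-chunk element followed by the terminating NULL lists looks roughly like this:

/*
 * Illustrative layout of a single-entry read chunk list
 * ("1 - PHLOO - 0"), followed by the NULL write chunk list and NULL
 * reply chunk, as emitted by rpcrdma_create_chunks() for a read chunk.
 * Userspace sketch; all values are invented.
 */
#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>		/* htonl() */

/* XDR hyper: most significant 32-bit word first (cf. xdr_encode_hyper) */
static uint32_t *encode_hyper(uint32_t *p, uint64_t v)
{
	*p++ = htonl((uint32_t)(v >> 32));
	*p++ = htonl((uint32_t)(v & 0xffffffff));
	return p;
}

int main(void)
{
	uint32_t buf[16], *p = buf;
	uint32_t pos = 36;		/* XDR offset of the chunk data (illustrative) */
	uint32_t handle = 0x1234;	/* H: rkey (illustrative) */
	uint32_t length = 8192;		/* L: segment length (illustrative) */
	uint64_t offset = 0x7f00deadbeefULL;	/* OO: registered offset (illustrative) */

	*p++ = htonl(1);		/* xdr_one: a read chunk follows */
	*p++ = htonl(pos);		/* P: same position for all segs of one arg */
	*p++ = htonl(handle);		/* H */
	*p++ = htonl(length);		/* L */
	p = encode_hyper(p, offset);	/* OO */
	*p++ = 0;			/* xdr_zero: end of read chunk list */
	*p++ = 0;			/* NULL write chunk list */
	*p++ = 0;			/* NULL reply chunk */

	printf("chunk list body: %zu XDR words\n", (size_t)(p - buf));
	return 0;
}

A write or reply chunk differs only in that it is a counted array (discriminator, then an element count, then the HLOO entries) rather than a linked list of positioned entries.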
+800
net/sunrpc/xprtrdma/transport.c
··· 1 + /* 2 + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + */ 39 + 40 + /* 41 + * transport.c 42 + * 43 + * This file contains the top-level implementation of an RPC RDMA 44 + * transport. 45 + * 46 + * Naming convention: functions beginning with xprt_ are part of the 47 + * transport switch. All others are RPC RDMA internal. 48 + */ 49 + 50 + #include <linux/module.h> 51 + #include <linux/init.h> 52 + #include <linux/seq_file.h> 53 + 54 + #include "xprt_rdma.h" 55 + 56 + #ifdef RPC_DEBUG 57 + # define RPCDBG_FACILITY RPCDBG_TRANS 58 + #endif 59 + 60 + MODULE_LICENSE("Dual BSD/GPL"); 61 + 62 + MODULE_DESCRIPTION("RPC/RDMA Transport for Linux kernel NFS"); 63 + MODULE_AUTHOR("Network Appliance, Inc."); 64 + 65 + /* 66 + * tunables 67 + */ 68 + 69 + static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; 70 + static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; 71 + static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; 72 + static unsigned int xprt_rdma_inline_write_padding; 73 + #if !RPCRDMA_PERSISTENT_REGISTRATION 74 + static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_REGISTER; /* FMR? 
*/ 75 + #else 76 + static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_ALLPHYSICAL; 77 + #endif 78 + 79 + #ifdef RPC_DEBUG 80 + 81 + static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; 82 + static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; 83 + static unsigned int zero; 84 + static unsigned int max_padding = PAGE_SIZE; 85 + static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; 86 + static unsigned int max_memreg = RPCRDMA_LAST - 1; 87 + 88 + static struct ctl_table_header *sunrpc_table_header; 89 + 90 + static ctl_table xr_tunables_table[] = { 91 + { 92 + .ctl_name = CTL_SLOTTABLE_RDMA, 93 + .procname = "rdma_slot_table_entries", 94 + .data = &xprt_rdma_slot_table_entries, 95 + .maxlen = sizeof(unsigned int), 96 + .mode = 0644, 97 + .proc_handler = &proc_dointvec_minmax, 98 + .strategy = &sysctl_intvec, 99 + .extra1 = &min_slot_table_size, 100 + .extra2 = &max_slot_table_size 101 + }, 102 + { 103 + .ctl_name = CTL_RDMA_MAXINLINEREAD, 104 + .procname = "rdma_max_inline_read", 105 + .data = &xprt_rdma_max_inline_read, 106 + .maxlen = sizeof(unsigned int), 107 + .mode = 0644, 108 + .proc_handler = &proc_dointvec, 109 + .strategy = &sysctl_intvec, 110 + }, 111 + { 112 + .ctl_name = CTL_RDMA_MAXINLINEWRITE, 113 + .procname = "rdma_max_inline_write", 114 + .data = &xprt_rdma_max_inline_write, 115 + .maxlen = sizeof(unsigned int), 116 + .mode = 0644, 117 + .proc_handler = &proc_dointvec, 118 + .strategy = &sysctl_intvec, 119 + }, 120 + { 121 + .ctl_name = CTL_RDMA_WRITEPADDING, 122 + .procname = "rdma_inline_write_padding", 123 + .data = &xprt_rdma_inline_write_padding, 124 + .maxlen = sizeof(unsigned int), 125 + .mode = 0644, 126 + .proc_handler = &proc_dointvec_minmax, 127 + .strategy = &sysctl_intvec, 128 + .extra1 = &zero, 129 + .extra2 = &max_padding, 130 + }, 131 + { 132 + .ctl_name = CTL_RDMA_MEMREG, 133 + .procname = "rdma_memreg_strategy", 134 + .data = &xprt_rdma_memreg_strategy, 135 + .maxlen = sizeof(unsigned int), 136 + .mode = 0644, 137 + .proc_handler = &proc_dointvec_minmax, 138 + .strategy = &sysctl_intvec, 139 + .extra1 = &min_memreg, 140 + .extra2 = &max_memreg, 141 + }, 142 + { 143 + .ctl_name = 0, 144 + }, 145 + }; 146 + 147 + static ctl_table sunrpc_table[] = { 148 + { 149 + .ctl_name = CTL_SUNRPC, 150 + .procname = "sunrpc", 151 + .mode = 0555, 152 + .child = xr_tunables_table 153 + }, 154 + { 155 + .ctl_name = 0, 156 + }, 157 + }; 158 + 159 + #endif 160 + 161 + static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ 162 + 163 + static void 164 + xprt_rdma_format_addresses(struct rpc_xprt *xprt) 165 + { 166 + struct sockaddr_in *addr = (struct sockaddr_in *) 167 + &rpcx_to_rdmad(xprt).addr; 168 + char *buf; 169 + 170 + buf = kzalloc(20, GFP_KERNEL); 171 + if (buf) 172 + snprintf(buf, 20, NIPQUAD_FMT, NIPQUAD(addr->sin_addr.s_addr)); 173 + xprt->address_strings[RPC_DISPLAY_ADDR] = buf; 174 + 175 + buf = kzalloc(8, GFP_KERNEL); 176 + if (buf) 177 + snprintf(buf, 8, "%u", ntohs(addr->sin_port)); 178 + xprt->address_strings[RPC_DISPLAY_PORT] = buf; 179 + 180 + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; 181 + 182 + buf = kzalloc(48, GFP_KERNEL); 183 + if (buf) 184 + snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s", 185 + NIPQUAD(addr->sin_addr.s_addr), 186 + ntohs(addr->sin_port), "rdma"); 187 + xprt->address_strings[RPC_DISPLAY_ALL] = buf; 188 + 189 + buf = kzalloc(10, GFP_KERNEL); 190 + if (buf) 191 + snprintf(buf, 10, "%02x%02x%02x%02x", 192 + NIPQUAD(addr->sin_addr.s_addr)); 193 + 
xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf; 194 + 195 + buf = kzalloc(8, GFP_KERNEL); 196 + if (buf) 197 + snprintf(buf, 8, "%4hx", ntohs(addr->sin_port)); 198 + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf; 199 + 200 + buf = kzalloc(30, GFP_KERNEL); 201 + if (buf) 202 + snprintf(buf, 30, NIPQUAD_FMT".%u.%u", 203 + NIPQUAD(addr->sin_addr.s_addr), 204 + ntohs(addr->sin_port) >> 8, 205 + ntohs(addr->sin_port) & 0xff); 206 + xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf; 207 + 208 + /* netid */ 209 + xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; 210 + } 211 + 212 + static void 213 + xprt_rdma_free_addresses(struct rpc_xprt *xprt) 214 + { 215 + kfree(xprt->address_strings[RPC_DISPLAY_ADDR]); 216 + kfree(xprt->address_strings[RPC_DISPLAY_PORT]); 217 + kfree(xprt->address_strings[RPC_DISPLAY_ALL]); 218 + kfree(xprt->address_strings[RPC_DISPLAY_HEX_ADDR]); 219 + kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]); 220 + kfree(xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR]); 221 + } 222 + 223 + static void 224 + xprt_rdma_connect_worker(struct work_struct *work) 225 + { 226 + struct rpcrdma_xprt *r_xprt = 227 + container_of(work, struct rpcrdma_xprt, rdma_connect.work); 228 + struct rpc_xprt *xprt = &r_xprt->xprt; 229 + int rc = 0; 230 + 231 + if (!xprt->shutdown) { 232 + xprt_clear_connected(xprt); 233 + 234 + dprintk("RPC: %s: %sconnect\n", __func__, 235 + r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); 236 + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); 237 + if (rc) 238 + goto out; 239 + } 240 + goto out_clear; 241 + 242 + out: 243 + xprt_wake_pending_tasks(xprt, rc); 244 + 245 + out_clear: 246 + dprintk("RPC: %s: exit\n", __func__); 247 + xprt_clear_connecting(xprt); 248 + } 249 + 250 + /* 251 + * xprt_rdma_destroy 252 + * 253 + * Destroy the xprt. 254 + * Free all memory associated with the object, including its own. 255 + * NOTE: none of the *destroy methods free memory for their top-level 256 + * objects, even though they may have allocated it (they do free 257 + * private memory). It's up to the caller to handle it. In this 258 + * case (RDMA transport), all structure memory is inlined with the 259 + * struct rpcrdma_xprt. 
260 + */ 261 + static void 262 + xprt_rdma_destroy(struct rpc_xprt *xprt) 263 + { 264 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 265 + int rc; 266 + 267 + dprintk("RPC: %s: called\n", __func__); 268 + 269 + cancel_delayed_work(&r_xprt->rdma_connect); 270 + flush_scheduled_work(); 271 + 272 + xprt_clear_connected(xprt); 273 + 274 + rpcrdma_buffer_destroy(&r_xprt->rx_buf); 275 + rc = rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); 276 + if (rc) 277 + dprintk("RPC: %s: rpcrdma_ep_destroy returned %i\n", 278 + __func__, rc); 279 + rpcrdma_ia_close(&r_xprt->rx_ia); 280 + 281 + xprt_rdma_free_addresses(xprt); 282 + 283 + kfree(xprt->slot); 284 + xprt->slot = NULL; 285 + kfree(xprt); 286 + 287 + dprintk("RPC: %s: returning\n", __func__); 288 + 289 + module_put(THIS_MODULE); 290 + } 291 + 292 + /** 293 + * xprt_setup_rdma - Set up transport to use RDMA 294 + * 295 + * @args: rpc transport arguments 296 + */ 297 + static struct rpc_xprt * 298 + xprt_setup_rdma(struct xprt_create *args) 299 + { 300 + struct rpcrdma_create_data_internal cdata; 301 + struct rpc_xprt *xprt; 302 + struct rpcrdma_xprt *new_xprt; 303 + struct rpcrdma_ep *new_ep; 304 + struct sockaddr_in *sin; 305 + int rc; 306 + 307 + if (args->addrlen > sizeof(xprt->addr)) { 308 + dprintk("RPC: %s: address too large\n", __func__); 309 + return ERR_PTR(-EBADF); 310 + } 311 + 312 + xprt = kzalloc(sizeof(struct rpcrdma_xprt), GFP_KERNEL); 313 + if (xprt == NULL) { 314 + dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", 315 + __func__); 316 + return ERR_PTR(-ENOMEM); 317 + } 318 + 319 + xprt->max_reqs = xprt_rdma_slot_table_entries; 320 + xprt->slot = kcalloc(xprt->max_reqs, 321 + sizeof(struct rpc_rqst), GFP_KERNEL); 322 + if (xprt->slot == NULL) { 323 + kfree(xprt); 324 + dprintk("RPC: %s: couldn't allocate %d slots\n", 325 + __func__, xprt->max_reqs); 326 + return ERR_PTR(-ENOMEM); 327 + } 328 + 329 + /* 60 second timeout, no retries */ 330 + xprt_set_timeout(&xprt->timeout, 0, 60UL * HZ); 331 + xprt->bind_timeout = (60U * HZ); 332 + xprt->connect_timeout = (60U * HZ); 333 + xprt->reestablish_timeout = (5U * HZ); 334 + xprt->idle_timeout = (5U * 60 * HZ); 335 + 336 + xprt->resvport = 0; /* privileged port not needed */ 337 + xprt->tsh_size = 0; /* RPC-RDMA handles framing */ 338 + xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE; 339 + xprt->ops = &xprt_rdma_procs; 340 + 341 + /* 342 + * Set up RDMA-specific connect data. 
343 + */ 344 + 345 + /* Put server RDMA address in local cdata */ 346 + memcpy(&cdata.addr, args->dstaddr, args->addrlen); 347 + 348 + /* Ensure xprt->addr holds valid server TCP (not RDMA) 349 + * address, for any side protocols which peek at it */ 350 + xprt->prot = IPPROTO_TCP; 351 + xprt->addrlen = args->addrlen; 352 + memcpy(&xprt->addr, &cdata.addr, xprt->addrlen); 353 + 354 + sin = (struct sockaddr_in *)&cdata.addr; 355 + if (ntohs(sin->sin_port) != 0) 356 + xprt_set_bound(xprt); 357 + 358 + dprintk("RPC: %s: %u.%u.%u.%u:%u\n", __func__, 359 + NIPQUAD(sin->sin_addr.s_addr), ntohs(sin->sin_port)); 360 + 361 + /* Set max requests */ 362 + cdata.max_requests = xprt->max_reqs; 363 + 364 + /* Set some length limits */ 365 + cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */ 366 + cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */ 367 + 368 + cdata.inline_wsize = xprt_rdma_max_inline_write; 369 + if (cdata.inline_wsize > cdata.wsize) 370 + cdata.inline_wsize = cdata.wsize; 371 + 372 + cdata.inline_rsize = xprt_rdma_max_inline_read; 373 + if (cdata.inline_rsize > cdata.rsize) 374 + cdata.inline_rsize = cdata.rsize; 375 + 376 + cdata.padding = xprt_rdma_inline_write_padding; 377 + 378 + /* 379 + * Create new transport instance, which includes initialized 380 + * o ia 381 + * o endpoint 382 + * o buffers 383 + */ 384 + 385 + new_xprt = rpcx_to_rdmax(xprt); 386 + 387 + rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr, 388 + xprt_rdma_memreg_strategy); 389 + if (rc) 390 + goto out1; 391 + 392 + /* 393 + * initialize and create ep 394 + */ 395 + new_xprt->rx_data = cdata; 396 + new_ep = &new_xprt->rx_ep; 397 + new_ep->rep_remote_addr = cdata.addr; 398 + 399 + rc = rpcrdma_ep_create(&new_xprt->rx_ep, 400 + &new_xprt->rx_ia, &new_xprt->rx_data); 401 + if (rc) 402 + goto out2; 403 + 404 + /* 405 + * Allocate pre-registered send and receive buffers for headers and 406 + * any inline data. Also specify any padding which will be provided 407 + * from a preregistered zero buffer. 408 + */ 409 + rc = rpcrdma_buffer_create(&new_xprt->rx_buf, new_ep, &new_xprt->rx_ia, 410 + &new_xprt->rx_data); 411 + if (rc) 412 + goto out3; 413 + 414 + /* 415 + * Register a callback for connection events. This is necessary because 416 + * connection loss notification is async. We also catch connection loss 417 + * when reaping receives. 
418 + */ 419 + INIT_DELAYED_WORK(&new_xprt->rdma_connect, xprt_rdma_connect_worker); 420 + new_ep->rep_func = rpcrdma_conn_func; 421 + new_ep->rep_xprt = xprt; 422 + 423 + xprt_rdma_format_addresses(xprt); 424 + 425 + if (!try_module_get(THIS_MODULE)) 426 + goto out4; 427 + 428 + return xprt; 429 + 430 + out4: 431 + xprt_rdma_free_addresses(xprt); 432 + rc = -EINVAL; 433 + out3: 434 + (void) rpcrdma_ep_destroy(new_ep, &new_xprt->rx_ia); 435 + out2: 436 + rpcrdma_ia_close(&new_xprt->rx_ia); 437 + out1: 438 + kfree(xprt->slot); 439 + kfree(xprt); 440 + return ERR_PTR(rc); 441 + } 442 + 443 + /* 444 + * Close a connection, during shutdown or timeout/reconnect 445 + */ 446 + static void 447 + xprt_rdma_close(struct rpc_xprt *xprt) 448 + { 449 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 450 + 451 + dprintk("RPC: %s: closing\n", __func__); 452 + xprt_disconnect(xprt); 453 + (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); 454 + } 455 + 456 + static void 457 + xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) 458 + { 459 + struct sockaddr_in *sap; 460 + 461 + sap = (struct sockaddr_in *)&xprt->addr; 462 + sap->sin_port = htons(port); 463 + sap = (struct sockaddr_in *)&rpcx_to_rdmad(xprt).addr; 464 + sap->sin_port = htons(port); 465 + dprintk("RPC: %s: %u\n", __func__, port); 466 + } 467 + 468 + static void 469 + xprt_rdma_connect(struct rpc_task *task) 470 + { 471 + struct rpc_xprt *xprt = (struct rpc_xprt *)task->tk_xprt; 472 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 473 + 474 + if (!xprt_test_and_set_connecting(xprt)) { 475 + if (r_xprt->rx_ep.rep_connected != 0) { 476 + /* Reconnect */ 477 + schedule_delayed_work(&r_xprt->rdma_connect, 478 + xprt->reestablish_timeout); 479 + } else { 480 + schedule_delayed_work(&r_xprt->rdma_connect, 0); 481 + if (!RPC_IS_ASYNC(task)) 482 + flush_scheduled_work(); 483 + } 484 + } 485 + } 486 + 487 + static int 488 + xprt_rdma_reserve_xprt(struct rpc_task *task) 489 + { 490 + struct rpc_xprt *xprt = task->tk_xprt; 491 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 492 + int credits = atomic_read(&r_xprt->rx_buf.rb_credits); 493 + 494 + /* == RPC_CWNDSCALE @ init, but *after* setup */ 495 + if (r_xprt->rx_buf.rb_cwndscale == 0UL) { 496 + r_xprt->rx_buf.rb_cwndscale = xprt->cwnd; 497 + dprintk("RPC: %s: cwndscale %lu\n", __func__, 498 + r_xprt->rx_buf.rb_cwndscale); 499 + BUG_ON(r_xprt->rx_buf.rb_cwndscale <= 0); 500 + } 501 + xprt->cwnd = credits * r_xprt->rx_buf.rb_cwndscale; 502 + return xprt_reserve_xprt_cong(task); 503 + } 504 + 505 + /* 506 + * The RDMA allocate/free functions need the task structure as a place 507 + * to hide the struct rpcrdma_req, which is necessary for the actual send/recv 508 + * sequence. For this reason, the recv buffers are attached to send 509 + * buffers for portions of the RPC. Note that the RPC layer allocates 510 + * both send and receive buffers in the same call. We may register 511 + * the receive buffer portion when using reply chunks. 
512 + */ 513 + static void * 514 + xprt_rdma_allocate(struct rpc_task *task, size_t size) 515 + { 516 + struct rpc_xprt *xprt = task->tk_xprt; 517 + struct rpcrdma_req *req, *nreq; 518 + 519 + req = rpcrdma_buffer_get(&rpcx_to_rdmax(xprt)->rx_buf); 520 + BUG_ON(NULL == req); 521 + 522 + if (size > req->rl_size) { 523 + dprintk("RPC: %s: size %zd too large for buffer[%zd]: " 524 + "prog %d vers %d proc %d\n", 525 + __func__, size, req->rl_size, 526 + task->tk_client->cl_prog, task->tk_client->cl_vers, 527 + task->tk_msg.rpc_proc->p_proc); 528 + /* 529 + * Outgoing length shortage. Our inline write max must have 530 + * been configured to perform direct i/o. 531 + * 532 + * This is therefore a large metadata operation, and the 533 + * allocate call was made on the maximum possible message, 534 + * e.g. containing long filename(s) or symlink data. In 535 + * fact, while these metadata operations *might* carry 536 + * large outgoing payloads, they rarely *do*. However, we 537 + * have to commit to the request here, so reallocate and 538 + * register it now. The data path will never require this 539 + * reallocation. 540 + * 541 + * If the allocation or registration fails, the RPC framework 542 + * will (doggedly) retry. 543 + */ 544 + if (rpcx_to_rdmax(xprt)->rx_ia.ri_memreg_strategy == 545 + RPCRDMA_BOUNCEBUFFERS) { 546 + /* forced to "pure inline" */ 547 + dprintk("RPC: %s: too much data (%zd) for inline " 548 + "(r/w max %d/%d)\n", __func__, size, 549 + rpcx_to_rdmad(xprt).inline_rsize, 550 + rpcx_to_rdmad(xprt).inline_wsize); 551 + size = req->rl_size; 552 + rpc_exit(task, -EIO); /* fail the operation */ 553 + rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; 554 + goto out; 555 + } 556 + if (task->tk_flags & RPC_TASK_SWAPPER) 557 + nreq = kmalloc(sizeof *req + size, GFP_ATOMIC); 558 + else 559 + nreq = kmalloc(sizeof *req + size, GFP_NOFS); 560 + if (nreq == NULL) 561 + goto outfail; 562 + 563 + if (rpcrdma_register_internal(&rpcx_to_rdmax(xprt)->rx_ia, 564 + nreq->rl_base, size + sizeof(struct rpcrdma_req) 565 + - offsetof(struct rpcrdma_req, rl_base), 566 + &nreq->rl_handle, &nreq->rl_iov)) { 567 + kfree(nreq); 568 + goto outfail; 569 + } 570 + rpcx_to_rdmax(xprt)->rx_stats.hardway_register_count += size; 571 + nreq->rl_size = size; 572 + nreq->rl_niovs = 0; 573 + nreq->rl_nchunks = 0; 574 + nreq->rl_buffer = (struct rpcrdma_buffer *)req; 575 + nreq->rl_reply = req->rl_reply; 576 + memcpy(nreq->rl_segments, 577 + req->rl_segments, sizeof nreq->rl_segments); 578 + /* flag the swap with an unused field */ 579 + nreq->rl_iov.length = 0; 580 + req->rl_reply = NULL; 581 + req = nreq; 582 + } 583 + dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); 584 + out: 585 + return req->rl_xdr_buf; 586 + 587 + outfail: 588 + rpcrdma_buffer_put(req); 589 + rpcx_to_rdmax(xprt)->rx_stats.failed_marshal_count++; 590 + return NULL; 591 + } 592 + 593 + /* 594 + * This function returns all RDMA resources to the pool. 595 + */ 596 + static void 597 + xprt_rdma_free(void *buffer) 598 + { 599 + struct rpcrdma_req *req; 600 + struct rpcrdma_xprt *r_xprt; 601 + struct rpcrdma_rep *rep; 602 + int i; 603 + 604 + if (buffer == NULL) 605 + return; 606 + 607 + req = container_of(buffer, struct rpcrdma_req, rl_xdr_buf[0]); 608 + r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf); 609 + rep = req->rl_reply; 610 + 611 + dprintk("RPC: %s: called on 0x%p%s\n", 612 + __func__, rep, (rep && rep->rr_func) ? " (with waiter)" : ""); 613 + 614 + /* 615 + * Finish the deregistration. 
When using mw bind, this was 616 + * begun in rpcrdma_reply_handler(). In all other modes, we 617 + * do it here, in thread context. The process is considered 618 + * complete when the rr_func vector becomes NULL - this 619 + * was put in place during rpcrdma_reply_handler() - the wait 620 + * call below will not block if the dereg is "done". If 621 + * interrupted, our framework will clean up. 622 + */ 623 + for (i = 0; req->rl_nchunks;) { 624 + --req->rl_nchunks; 625 + i += rpcrdma_deregister_external( 626 + &req->rl_segments[i], r_xprt, NULL); 627 + } 628 + 629 + if (rep && wait_event_interruptible(rep->rr_unbind, !rep->rr_func)) { 630 + rep->rr_func = NULL; /* abandon the callback */ 631 + req->rl_reply = NULL; 632 + } 633 + 634 + if (req->rl_iov.length == 0) { /* see allocate above */ 635 + struct rpcrdma_req *oreq = (struct rpcrdma_req *)req->rl_buffer; 636 + oreq->rl_reply = req->rl_reply; 637 + (void) rpcrdma_deregister_internal(&r_xprt->rx_ia, 638 + req->rl_handle, 639 + &req->rl_iov); 640 + kfree(req); 641 + req = oreq; 642 + } 643 + 644 + /* Put back request+reply buffers */ 645 + rpcrdma_buffer_put(req); 646 + } 647 + 648 + /* 649 + * send_request invokes the meat of RPC RDMA. It must do the following: 650 + * 1. Marshal the RPC request into an RPC RDMA request, which means 651 + * putting a header in front of data, and creating IOVs for RDMA 652 + * from those in the request. 653 + * 2. In marshaling, detect opportunities for RDMA, and use them. 654 + * 3. Post a recv message to set up asynch completion, then send 655 + * the request (rpcrdma_ep_post). 656 + * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). 657 + */ 658 + 659 + static int 660 + xprt_rdma_send_request(struct rpc_task *task) 661 + { 662 + struct rpc_rqst *rqst = task->tk_rqstp; 663 + struct rpc_xprt *xprt = task->tk_xprt; 664 + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 665 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 666 + 667 + /* marshal the send itself */ 668 + if (req->rl_niovs == 0 && rpcrdma_marshal_req(rqst) != 0) { 669 + r_xprt->rx_stats.failed_marshal_count++; 670 + dprintk("RPC: %s: rpcrdma_marshal_req failed\n", 671 + __func__); 672 + return -EIO; 673 + } 674 + 675 + if (req->rl_reply == NULL) /* e.g. reconnection */ 676 + rpcrdma_recv_buffer_get(req); 677 + 678 + if (req->rl_reply) { 679 + req->rl_reply->rr_func = rpcrdma_reply_handler; 680 + /* this need only be done once, but... */ 681 + req->rl_reply->rr_xprt = xprt; 682 + } 683 + 684 + if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) { 685 + xprt_disconnect(xprt); 686 + return -ENOTCONN; /* implies disconnect */ 687 + } 688 + 689 + rqst->rq_bytes_sent = 0; 690 + return 0; 691 + } 692 + 693 + static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) 694 + { 695 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 696 + long idle_time = 0; 697 + 698 + if (xprt_connected(xprt)) 699 + idle_time = (long)(jiffies - xprt->last_used) / HZ; 700 + 701 + seq_printf(seq, 702 + "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu " 703 + "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n", 704 + 705 + 0, /* need a local port? 
*/ 706 + xprt->stat.bind_count, 707 + xprt->stat.connect_count, 708 + xprt->stat.connect_time, 709 + idle_time, 710 + xprt->stat.sends, 711 + xprt->stat.recvs, 712 + xprt->stat.bad_xids, 713 + xprt->stat.req_u, 714 + xprt->stat.bklog_u, 715 + 716 + r_xprt->rx_stats.read_chunk_count, 717 + r_xprt->rx_stats.write_chunk_count, 718 + r_xprt->rx_stats.reply_chunk_count, 719 + r_xprt->rx_stats.total_rdma_request, 720 + r_xprt->rx_stats.total_rdma_reply, 721 + r_xprt->rx_stats.pullup_copy_count, 722 + r_xprt->rx_stats.fixup_copy_count, 723 + r_xprt->rx_stats.hardway_register_count, 724 + r_xprt->rx_stats.failed_marshal_count, 725 + r_xprt->rx_stats.bad_reply_count); 726 + } 727 + 728 + /* 729 + * Plumbing for rpc transport switch and kernel module 730 + */ 731 + 732 + static struct rpc_xprt_ops xprt_rdma_procs = { 733 + .reserve_xprt = xprt_rdma_reserve_xprt, 734 + .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ 735 + .release_request = xprt_release_rqst_cong, /* ditto */ 736 + .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ 737 + .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ 738 + .set_port = xprt_rdma_set_port, 739 + .connect = xprt_rdma_connect, 740 + .buf_alloc = xprt_rdma_allocate, 741 + .buf_free = xprt_rdma_free, 742 + .send_request = xprt_rdma_send_request, 743 + .close = xprt_rdma_close, 744 + .destroy = xprt_rdma_destroy, 745 + .print_stats = xprt_rdma_print_stats 746 + }; 747 + 748 + static struct xprt_class xprt_rdma = { 749 + .list = LIST_HEAD_INIT(xprt_rdma.list), 750 + .name = "rdma", 751 + .owner = THIS_MODULE, 752 + .ident = XPRT_TRANSPORT_RDMA, 753 + .setup = xprt_setup_rdma, 754 + }; 755 + 756 + static void __exit xprt_rdma_cleanup(void) 757 + { 758 + int rc; 759 + 760 + dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); 761 + #ifdef RPC_DEBUG 762 + if (sunrpc_table_header) { 763 + unregister_sysctl_table(sunrpc_table_header); 764 + sunrpc_table_header = NULL; 765 + } 766 + #endif 767 + rc = xprt_unregister_transport(&xprt_rdma); 768 + if (rc) 769 + dprintk("RPC: %s: xprt_unregister returned %i\n", 770 + __func__, rc); 771 + } 772 + 773 + static int __init xprt_rdma_init(void) 774 + { 775 + int rc; 776 + 777 + rc = xprt_register_transport(&xprt_rdma); 778 + 779 + if (rc) 780 + return rc; 781 + 782 + dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n"); 783 + 784 + dprintk(KERN_INFO "Defaults:\n"); 785 + dprintk(KERN_INFO "\tSlots %d\n" 786 + "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", 787 + xprt_rdma_slot_table_entries, 788 + xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); 789 + dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n", 790 + xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); 791 + 792 + #ifdef RPC_DEBUG 793 + if (!sunrpc_table_header) 794 + sunrpc_table_header = register_sysctl_table(sunrpc_table); 795 + #endif 796 + return 0; 797 + } 798 + 799 + module_init(xprt_rdma_init); 800 + module_exit(xprt_rdma_cleanup);
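[Annotation] The sysctl table in transport.c publishes five RPC/RDMA tunables under the sunrpc directory, but only when the kernel is built with RPC_DEBUG, since the registration sits inside that #ifdef. A hedged userspace sketch that reads the resulting files back; the /proc/sys/sunrpc/ paths assume the standard sysctl-to-procfs mapping and that the xprtrdma module is loaded.

/*
 * Userspace sketch: dump the RPC/RDMA tunables registered by
 * xprt_rdma_init().  File names come from the ctl_table above.
 */
#include <stdio.h>

static const char *tunables[] = {
	"rdma_slot_table_entries",
	"rdma_max_inline_read",
	"rdma_max_inline_write",
	"rdma_inline_write_padding",
	"rdma_memreg_strategy",
};

int main(void)
{
	char path[128], val[64];
	size_t i;

	for (i = 0; i < sizeof(tunables) / sizeof(tunables[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/sys/sunrpc/%s",
			 tunables[i]);
		f = fopen(path, "r");
		if (!f) {
			perror(path);	/* absent if RPC_DEBUG is off */
			continue;
		}
		if (fgets(val, sizeof(val), f))
			printf("%s = %s", tunables[i], val);
		fclose(f);
	}
	return 0;
}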
+1626
net/sunrpc/xprtrdma/verbs.c
··· 1 + /* 2 + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
38 + */ 39 + 40 + /* 41 + * verbs.c 42 + * 43 + * Encapsulates the major functions managing: 44 + * o adapters 45 + * o endpoints 46 + * o connections 47 + * o buffer memory 48 + */ 49 + 50 + #include <linux/pci.h> /* for Tavor hack below */ 51 + 52 + #include "xprt_rdma.h" 53 + 54 + /* 55 + * Globals/Macros 56 + */ 57 + 58 + #ifdef RPC_DEBUG 59 + # define RPCDBG_FACILITY RPCDBG_TRANS 60 + #endif 61 + 62 + /* 63 + * internal functions 64 + */ 65 + 66 + /* 67 + * handle replies in tasklet context, using a single, global list 68 + * rdma tasklet function -- just turn around and call the func 69 + * for all replies on the list 70 + */ 71 + 72 + static DEFINE_SPINLOCK(rpcrdma_tk_lock_g); 73 + static LIST_HEAD(rpcrdma_tasklets_g); 74 + 75 + static void 76 + rpcrdma_run_tasklet(unsigned long data) 77 + { 78 + struct rpcrdma_rep *rep; 79 + void (*func)(struct rpcrdma_rep *); 80 + unsigned long flags; 81 + 82 + data = data; 83 + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); 84 + while (!list_empty(&rpcrdma_tasklets_g)) { 85 + rep = list_entry(rpcrdma_tasklets_g.next, 86 + struct rpcrdma_rep, rr_list); 87 + list_del(&rep->rr_list); 88 + func = rep->rr_func; 89 + rep->rr_func = NULL; 90 + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); 91 + 92 + if (func) 93 + func(rep); 94 + else 95 + rpcrdma_recv_buffer_put(rep); 96 + 97 + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); 98 + } 99 + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); 100 + } 101 + 102 + static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); 103 + 104 + static inline void 105 + rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep) 106 + { 107 + unsigned long flags; 108 + 109 + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); 110 + list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g); 111 + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); 112 + tasklet_schedule(&rpcrdma_tasklet_g); 113 + } 114 + 115 + static void 116 + rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) 117 + { 118 + struct rpcrdma_ep *ep = context; 119 + 120 + dprintk("RPC: %s: QP error %X on device %s ep %p\n", 121 + __func__, event->event, event->device->name, context); 122 + if (ep->rep_connected == 1) { 123 + ep->rep_connected = -EIO; 124 + ep->rep_func(ep); 125 + wake_up_all(&ep->rep_connect_wait); 126 + } 127 + } 128 + 129 + static void 130 + rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) 131 + { 132 + struct rpcrdma_ep *ep = context; 133 + 134 + dprintk("RPC: %s: CQ error %X on device %s ep %p\n", 135 + __func__, event->event, event->device->name, context); 136 + if (ep->rep_connected == 1) { 137 + ep->rep_connected = -EIO; 138 + ep->rep_func(ep); 139 + wake_up_all(&ep->rep_connect_wait); 140 + } 141 + } 142 + 143 + static inline 144 + void rpcrdma_event_process(struct ib_wc *wc) 145 + { 146 + struct rpcrdma_rep *rep = 147 + (struct rpcrdma_rep *)(unsigned long) wc->wr_id; 148 + 149 + dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n", 150 + __func__, rep, wc->status, wc->opcode, wc->byte_len); 151 + 152 + if (!rep) /* send or bind completion that we don't care about */ 153 + return; 154 + 155 + if (IB_WC_SUCCESS != wc->status) { 156 + dprintk("RPC: %s: %s WC status %X, connection lost\n", 157 + __func__, (wc->opcode & IB_WC_RECV) ? 
"recv" : "send", 158 + wc->status); 159 + rep->rr_len = ~0U; 160 + rpcrdma_schedule_tasklet(rep); 161 + return; 162 + } 163 + 164 + switch (wc->opcode) { 165 + case IB_WC_RECV: 166 + rep->rr_len = wc->byte_len; 167 + ib_dma_sync_single_for_cpu( 168 + rdmab_to_ia(rep->rr_buffer)->ri_id->device, 169 + rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE); 170 + /* Keep (only) the most recent credits, after check validity */ 171 + if (rep->rr_len >= 16) { 172 + struct rpcrdma_msg *p = 173 + (struct rpcrdma_msg *) rep->rr_base; 174 + unsigned int credits = ntohl(p->rm_credit); 175 + if (credits == 0) { 176 + dprintk("RPC: %s: server" 177 + " dropped credits to 0!\n", __func__); 178 + /* don't deadlock */ 179 + credits = 1; 180 + } else if (credits > rep->rr_buffer->rb_max_requests) { 181 + dprintk("RPC: %s: server" 182 + " over-crediting: %d (%d)\n", 183 + __func__, credits, 184 + rep->rr_buffer->rb_max_requests); 185 + credits = rep->rr_buffer->rb_max_requests; 186 + } 187 + atomic_set(&rep->rr_buffer->rb_credits, credits); 188 + } 189 + /* fall through */ 190 + case IB_WC_BIND_MW: 191 + rpcrdma_schedule_tasklet(rep); 192 + break; 193 + default: 194 + dprintk("RPC: %s: unexpected WC event %X\n", 195 + __func__, wc->opcode); 196 + break; 197 + } 198 + } 199 + 200 + static inline int 201 + rpcrdma_cq_poll(struct ib_cq *cq) 202 + { 203 + struct ib_wc wc; 204 + int rc; 205 + 206 + for (;;) { 207 + rc = ib_poll_cq(cq, 1, &wc); 208 + if (rc < 0) { 209 + dprintk("RPC: %s: ib_poll_cq failed %i\n", 210 + __func__, rc); 211 + return rc; 212 + } 213 + if (rc == 0) 214 + break; 215 + 216 + rpcrdma_event_process(&wc); 217 + } 218 + 219 + return 0; 220 + } 221 + 222 + /* 223 + * rpcrdma_cq_event_upcall 224 + * 225 + * This upcall handles recv, send, bind and unbind events. 226 + * It is reentrant but processes single events in order to maintain 227 + * ordering of receives to keep server credits. 228 + * 229 + * It is the responsibility of the scheduled tasklet to return 230 + * recv buffers to the pool. NOTE: this affects synchronization of 231 + * connection shutdown. That is, the structures required for 232 + * the completion of the reply handler must remain intact until 233 + * all memory has been reclaimed. 234 + * 235 + * Note that send events are suppressed and do not result in an upcall. 
236 + */ 237 + static void 238 + rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context) 239 + { 240 + int rc; 241 + 242 + rc = rpcrdma_cq_poll(cq); 243 + if (rc) 244 + return; 245 + 246 + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); 247 + if (rc) { 248 + dprintk("RPC: %s: ib_req_notify_cq failed %i\n", 249 + __func__, rc); 250 + return; 251 + } 252 + 253 + rpcrdma_cq_poll(cq); 254 + } 255 + 256 + #ifdef RPC_DEBUG 257 + static const char * const conn[] = { 258 + "address resolved", 259 + "address error", 260 + "route resolved", 261 + "route error", 262 + "connect request", 263 + "connect response", 264 + "connect error", 265 + "unreachable", 266 + "rejected", 267 + "established", 268 + "disconnected", 269 + "device removal" 270 + }; 271 + #endif 272 + 273 + static int 274 + rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) 275 + { 276 + struct rpcrdma_xprt *xprt = id->context; 277 + struct rpcrdma_ia *ia = &xprt->rx_ia; 278 + struct rpcrdma_ep *ep = &xprt->rx_ep; 279 + struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; 280 + struct ib_qp_attr attr; 281 + struct ib_qp_init_attr iattr; 282 + int connstate = 0; 283 + 284 + switch (event->event) { 285 + case RDMA_CM_EVENT_ADDR_RESOLVED: 286 + case RDMA_CM_EVENT_ROUTE_RESOLVED: 287 + complete(&ia->ri_done); 288 + break; 289 + case RDMA_CM_EVENT_ADDR_ERROR: 290 + ia->ri_async_rc = -EHOSTUNREACH; 291 + dprintk("RPC: %s: CM address resolution error, ep 0x%p\n", 292 + __func__, ep); 293 + complete(&ia->ri_done); 294 + break; 295 + case RDMA_CM_EVENT_ROUTE_ERROR: 296 + ia->ri_async_rc = -ENETUNREACH; 297 + dprintk("RPC: %s: CM route resolution error, ep 0x%p\n", 298 + __func__, ep); 299 + complete(&ia->ri_done); 300 + break; 301 + case RDMA_CM_EVENT_ESTABLISHED: 302 + connstate = 1; 303 + ib_query_qp(ia->ri_id->qp, &attr, 304 + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, 305 + &iattr); 306 + dprintk("RPC: %s: %d responder resources" 307 + " (%d initiator)\n", 308 + __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic); 309 + goto connected; 310 + case RDMA_CM_EVENT_CONNECT_ERROR: 311 + connstate = -ENOTCONN; 312 + goto connected; 313 + case RDMA_CM_EVENT_UNREACHABLE: 314 + connstate = -ENETDOWN; 315 + goto connected; 316 + case RDMA_CM_EVENT_REJECTED: 317 + connstate = -ECONNREFUSED; 318 + goto connected; 319 + case RDMA_CM_EVENT_DISCONNECTED: 320 + connstate = -ECONNABORTED; 321 + goto connected; 322 + case RDMA_CM_EVENT_DEVICE_REMOVAL: 323 + connstate = -ENODEV; 324 + connected: 325 + dprintk("RPC: %s: %s: %u.%u.%u.%u:%u" 326 + " (ep 0x%p event 0x%x)\n", 327 + __func__, 328 + (event->event <= 11) ? conn[event->event] : 329 + "unknown connection error", 330 + NIPQUAD(addr->sin_addr.s_addr), 331 + ntohs(addr->sin_port), 332 + ep, event->event); 333 + atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); 334 + dprintk("RPC: %s: %sconnected\n", 335 + __func__, connstate > 0 ? 
"" : "dis"); 336 + ep->rep_connected = connstate; 337 + ep->rep_func(ep); 338 + wake_up_all(&ep->rep_connect_wait); 339 + break; 340 + default: 341 + ia->ri_async_rc = -EINVAL; 342 + dprintk("RPC: %s: unexpected CM event %X\n", 343 + __func__, event->event); 344 + complete(&ia->ri_done); 345 + break; 346 + } 347 + 348 + return 0; 349 + } 350 + 351 + static struct rdma_cm_id * 352 + rpcrdma_create_id(struct rpcrdma_xprt *xprt, 353 + struct rpcrdma_ia *ia, struct sockaddr *addr) 354 + { 355 + struct rdma_cm_id *id; 356 + int rc; 357 + 358 + id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP); 359 + if (IS_ERR(id)) { 360 + rc = PTR_ERR(id); 361 + dprintk("RPC: %s: rdma_create_id() failed %i\n", 362 + __func__, rc); 363 + return id; 364 + } 365 + 366 + ia->ri_async_rc = 0; 367 + rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT); 368 + if (rc) { 369 + dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", 370 + __func__, rc); 371 + goto out; 372 + } 373 + wait_for_completion(&ia->ri_done); 374 + rc = ia->ri_async_rc; 375 + if (rc) 376 + goto out; 377 + 378 + ia->ri_async_rc = 0; 379 + rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); 380 + if (rc) { 381 + dprintk("RPC: %s: rdma_resolve_route() failed %i\n", 382 + __func__, rc); 383 + goto out; 384 + } 385 + wait_for_completion(&ia->ri_done); 386 + rc = ia->ri_async_rc; 387 + if (rc) 388 + goto out; 389 + 390 + return id; 391 + 392 + out: 393 + rdma_destroy_id(id); 394 + return ERR_PTR(rc); 395 + } 396 + 397 + /* 398 + * Drain any cq, prior to teardown. 399 + */ 400 + static void 401 + rpcrdma_clean_cq(struct ib_cq *cq) 402 + { 403 + struct ib_wc wc; 404 + int count = 0; 405 + 406 + while (1 == ib_poll_cq(cq, 1, &wc)) 407 + ++count; 408 + 409 + if (count) 410 + dprintk("RPC: %s: flushed %d events (last 0x%x)\n", 411 + __func__, count, wc.opcode); 412 + } 413 + 414 + /* 415 + * Exported functions. 416 + */ 417 + 418 + /* 419 + * Open and initialize an Interface Adapter. 420 + * o initializes fields of struct rpcrdma_ia, including 421 + * interface and provider attributes and protection zone. 422 + */ 423 + int 424 + rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) 425 + { 426 + int rc; 427 + struct rpcrdma_ia *ia = &xprt->rx_ia; 428 + 429 + init_completion(&ia->ri_done); 430 + 431 + ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 432 + if (IS_ERR(ia->ri_id)) { 433 + rc = PTR_ERR(ia->ri_id); 434 + goto out1; 435 + } 436 + 437 + ia->ri_pd = ib_alloc_pd(ia->ri_id->device); 438 + if (IS_ERR(ia->ri_pd)) { 439 + rc = PTR_ERR(ia->ri_pd); 440 + dprintk("RPC: %s: ib_alloc_pd() failed %i\n", 441 + __func__, rc); 442 + goto out2; 443 + } 444 + 445 + /* 446 + * Optionally obtain an underlying physical identity mapping in 447 + * order to do a memory window-based bind. This base registration 448 + * is protected from remote access - that is enabled only by binding 449 + * for the specific bytes targeted during each RPC operation, and 450 + * revoked after the corresponding completion similar to a storage 451 + * adapter. 
452 + */ 453 + if (memreg > RPCRDMA_REGISTER) { 454 + int mem_priv = IB_ACCESS_LOCAL_WRITE; 455 + switch (memreg) { 456 + #if RPCRDMA_PERSISTENT_REGISTRATION 457 + case RPCRDMA_ALLPHYSICAL: 458 + mem_priv |= IB_ACCESS_REMOTE_WRITE; 459 + mem_priv |= IB_ACCESS_REMOTE_READ; 460 + break; 461 + #endif 462 + case RPCRDMA_MEMWINDOWS_ASYNC: 463 + case RPCRDMA_MEMWINDOWS: 464 + mem_priv |= IB_ACCESS_MW_BIND; 465 + break; 466 + default: 467 + break; 468 + } 469 + ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); 470 + if (IS_ERR(ia->ri_bind_mem)) { 471 + printk(KERN_ALERT "%s: ib_get_dma_mr for " 472 + "phys register failed with %lX\n\t" 473 + "Will continue with degraded performance\n", 474 + __func__, PTR_ERR(ia->ri_bind_mem)); 475 + memreg = RPCRDMA_REGISTER; 476 + ia->ri_bind_mem = NULL; 477 + } 478 + } 479 + 480 + /* Else will do memory reg/dereg for each chunk */ 481 + ia->ri_memreg_strategy = memreg; 482 + 483 + return 0; 484 + out2: 485 + rdma_destroy_id(ia->ri_id); 486 + out1: 487 + return rc; 488 + } 489 + 490 + /* 491 + * Clean up/close an IA. 492 + * o if event handles and PD have been initialized, free them. 493 + * o close the IA 494 + */ 495 + void 496 + rpcrdma_ia_close(struct rpcrdma_ia *ia) 497 + { 498 + int rc; 499 + 500 + dprintk("RPC: %s: entering\n", __func__); 501 + if (ia->ri_bind_mem != NULL) { 502 + rc = ib_dereg_mr(ia->ri_bind_mem); 503 + dprintk("RPC: %s: ib_dereg_mr returned %i\n", 504 + __func__, rc); 505 + } 506 + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp) 507 + rdma_destroy_qp(ia->ri_id); 508 + if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) { 509 + rc = ib_dealloc_pd(ia->ri_pd); 510 + dprintk("RPC: %s: ib_dealloc_pd returned %i\n", 511 + __func__, rc); 512 + } 513 + if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) 514 + rdma_destroy_id(ia->ri_id); 515 + } 516 + 517 + /* 518 + * Create unconnected endpoint. 519 + */ 520 + int 521 + rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 522 + struct rpcrdma_create_data_internal *cdata) 523 + { 524 + struct ib_device_attr devattr; 525 + int rc; 526 + 527 + rc = ib_query_device(ia->ri_id->device, &devattr); 528 + if (rc) { 529 + dprintk("RPC: %s: ib_query_device failed %d\n", 530 + __func__, rc); 531 + return rc; 532 + } 533 + 534 + /* check provider's send/recv wr limits */ 535 + if (cdata->max_requests > devattr.max_qp_wr) 536 + cdata->max_requests = devattr.max_qp_wr; 537 + 538 + ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; 539 + ep->rep_attr.qp_context = ep; 540 + /* send_cq and recv_cq initialized below */ 541 + ep->rep_attr.srq = NULL; 542 + ep->rep_attr.cap.max_send_wr = cdata->max_requests; 543 + switch (ia->ri_memreg_strategy) { 544 + case RPCRDMA_MEMWINDOWS_ASYNC: 545 + case RPCRDMA_MEMWINDOWS: 546 + /* Add room for mw_binds+unbinds - overkill! */ 547 + ep->rep_attr.cap.max_send_wr++; 548 + ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS); 549 + if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) 550 + return -EINVAL; 551 + break; 552 + default: 553 + break; 554 + } 555 + ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 556 + ep->rep_attr.cap.max_send_sge = (cdata->padding ? 
4 : 2); 557 + ep->rep_attr.cap.max_recv_sge = 1; 558 + ep->rep_attr.cap.max_inline_data = 0; 559 + ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 560 + ep->rep_attr.qp_type = IB_QPT_RC; 561 + ep->rep_attr.port_num = ~0; 562 + 563 + dprintk("RPC: %s: requested max: dtos: send %d recv %d; " 564 + "iovs: send %d recv %d\n", 565 + __func__, 566 + ep->rep_attr.cap.max_send_wr, 567 + ep->rep_attr.cap.max_recv_wr, 568 + ep->rep_attr.cap.max_send_sge, 569 + ep->rep_attr.cap.max_recv_sge); 570 + 571 + /* set trigger for requesting send completion */ 572 + ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/; 573 + switch (ia->ri_memreg_strategy) { 574 + case RPCRDMA_MEMWINDOWS_ASYNC: 575 + case RPCRDMA_MEMWINDOWS: 576 + ep->rep_cqinit -= RPCRDMA_MAX_SEGS; 577 + break; 578 + default: 579 + break; 580 + } 581 + if (ep->rep_cqinit <= 2) 582 + ep->rep_cqinit = 0; 583 + INIT_CQCOUNT(ep); 584 + ep->rep_ia = ia; 585 + init_waitqueue_head(&ep->rep_connect_wait); 586 + 587 + /* 588 + * Create a single cq for receive dto and mw_bind (only ever 589 + * care about unbind, really). Send completions are suppressed. 590 + * Use single threaded tasklet upcalls to maintain ordering. 591 + */ 592 + ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall, 593 + rpcrdma_cq_async_error_upcall, NULL, 594 + ep->rep_attr.cap.max_recv_wr + 595 + ep->rep_attr.cap.max_send_wr + 1, 0); 596 + if (IS_ERR(ep->rep_cq)) { 597 + rc = PTR_ERR(ep->rep_cq); 598 + dprintk("RPC: %s: ib_create_cq failed: %i\n", 599 + __func__, rc); 600 + goto out1; 601 + } 602 + 603 + rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP); 604 + if (rc) { 605 + dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", 606 + __func__, rc); 607 + goto out2; 608 + } 609 + 610 + ep->rep_attr.send_cq = ep->rep_cq; 611 + ep->rep_attr.recv_cq = ep->rep_cq; 612 + 613 + /* Initialize cma parameters */ 614 + 615 + /* RPC/RDMA does not use private data */ 616 + ep->rep_remote_cma.private_data = NULL; 617 + ep->rep_remote_cma.private_data_len = 0; 618 + 619 + /* Client offers RDMA Read but does not initiate */ 620 + switch (ia->ri_memreg_strategy) { 621 + case RPCRDMA_BOUNCEBUFFERS: 622 + ep->rep_remote_cma.responder_resources = 0; 623 + break; 624 + case RPCRDMA_MTHCAFMR: 625 + case RPCRDMA_REGISTER: 626 + ep->rep_remote_cma.responder_resources = cdata->max_requests * 627 + (RPCRDMA_MAX_DATA_SEGS / 8); 628 + break; 629 + case RPCRDMA_MEMWINDOWS: 630 + case RPCRDMA_MEMWINDOWS_ASYNC: 631 + #if RPCRDMA_PERSISTENT_REGISTRATION 632 + case RPCRDMA_ALLPHYSICAL: 633 + #endif 634 + ep->rep_remote_cma.responder_resources = cdata->max_requests * 635 + (RPCRDMA_MAX_DATA_SEGS / 2); 636 + break; 637 + default: 638 + break; 639 + } 640 + if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom) 641 + ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom; 642 + ep->rep_remote_cma.initiator_depth = 0; 643 + 644 + ep->rep_remote_cma.retry_count = 7; 645 + ep->rep_remote_cma.flow_control = 0; 646 + ep->rep_remote_cma.rnr_retry_count = 0; 647 + 648 + return 0; 649 + 650 + out2: 651 + if (ib_destroy_cq(ep->rep_cq)) 652 + ; 653 + out1: 654 + return rc; 655 + } 656 + 657 + /* 658 + * rpcrdma_ep_destroy 659 + * 660 + * Disconnect and destroy endpoint. After this, the only 661 + * valid operations on the ep are to free it (if dynamically 662 + * allocated) or re-create it. 663 + * 664 + * The caller's error handling must be sure to not leak the endpoint 665 + * if this function fails. 
666 + */ 667 + int 668 + rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 669 + { 670 + int rc; 671 + 672 + dprintk("RPC: %s: entering, connected is %d\n", 673 + __func__, ep->rep_connected); 674 + 675 + if (ia->ri_id->qp) { 676 + rc = rpcrdma_ep_disconnect(ep, ia); 677 + if (rc) 678 + dprintk("RPC: %s: rpcrdma_ep_disconnect" 679 + " returned %i\n", __func__, rc); 680 + } 681 + 682 + ep->rep_func = NULL; 683 + 684 + /* padding - could be done in rpcrdma_buffer_destroy... */ 685 + if (ep->rep_pad_mr) { 686 + rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad); 687 + ep->rep_pad_mr = NULL; 688 + } 689 + 690 + if (ia->ri_id->qp) { 691 + rdma_destroy_qp(ia->ri_id); 692 + ia->ri_id->qp = NULL; 693 + } 694 + 695 + rpcrdma_clean_cq(ep->rep_cq); 696 + rc = ib_destroy_cq(ep->rep_cq); 697 + if (rc) 698 + dprintk("RPC: %s: ib_destroy_cq returned %i\n", 699 + __func__, rc); 700 + 701 + return rc; 702 + } 703 + 704 + /* 705 + * Connect unconnected endpoint. 706 + */ 707 + int 708 + rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 709 + { 710 + struct rdma_cm_id *id; 711 + int rc = 0; 712 + int retry_count = 0; 713 + int reconnect = (ep->rep_connected != 0); 714 + 715 + if (reconnect) { 716 + struct rpcrdma_xprt *xprt; 717 + retry: 718 + rc = rpcrdma_ep_disconnect(ep, ia); 719 + if (rc && rc != -ENOTCONN) 720 + dprintk("RPC: %s: rpcrdma_ep_disconnect" 721 + " status %i\n", __func__, rc); 722 + rpcrdma_clean_cq(ep->rep_cq); 723 + 724 + xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); 725 + id = rpcrdma_create_id(xprt, ia, 726 + (struct sockaddr *)&xprt->rx_data.addr); 727 + if (IS_ERR(id)) { 728 + rc = PTR_ERR(id); 729 + goto out; 730 + } 731 + /* TEMP TEMP TEMP - fail if new device: 732 + * Deregister/remarshal *all* requests! 733 + * Close and recreate adapter, pd, etc! 734 + * Re-determine all attributes still sane! 735 + * More stuff I haven't thought of! 736 + * Rrrgh! 737 + */ 738 + if (ia->ri_id->device != id->device) { 739 + printk("RPC: %s: can't reconnect on " 740 + "different device!\n", __func__); 741 + rdma_destroy_id(id); 742 + rc = -ENETDOWN; 743 + goto out; 744 + } 745 + /* END TEMP */ 746 + rdma_destroy_id(ia->ri_id); 747 + ia->ri_id = id; 748 + } 749 + 750 + rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); 751 + if (rc) { 752 + dprintk("RPC: %s: rdma_create_qp failed %i\n", 753 + __func__, rc); 754 + goto out; 755 + } 756 + 757 + /* XXX Tavor device performs badly with 2K MTU! */ 758 + if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) { 759 + struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device); 760 + if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR && 761 + (pcid->vendor == PCI_VENDOR_ID_MELLANOX || 762 + pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) { 763 + struct ib_qp_attr attr = { 764 + .path_mtu = IB_MTU_1024 765 + }; 766 + rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU); 767 + } 768 + } 769 + 770 + /* Theoretically a client initiator_depth > 0 is not needed, 771 + * but many peers fail to complete the connection unless they 772 + * == responder_resources! 
*/ 773 + if (ep->rep_remote_cma.initiator_depth != 774 + ep->rep_remote_cma.responder_resources) 775 + ep->rep_remote_cma.initiator_depth = 776 + ep->rep_remote_cma.responder_resources; 777 + 778 + ep->rep_connected = 0; 779 + 780 + rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); 781 + if (rc) { 782 + dprintk("RPC: %s: rdma_connect() failed with %i\n", 783 + __func__, rc); 784 + goto out; 785 + } 786 + 787 + if (reconnect) 788 + return 0; 789 + 790 + wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); 791 + 792 + /* 793 + * Check state. A non-peer reject indicates no listener 794 + * (ECONNREFUSED), which may be a transient state. All 795 + * others indicate a transport condition which has already 796 + * undergone a best-effort. 797 + */ 798 + if (ep->rep_connected == -ECONNREFUSED 799 + && ++retry_count <= RDMA_CONNECT_RETRY_MAX) { 800 + dprintk("RPC: %s: non-peer_reject, retry\n", __func__); 801 + goto retry; 802 + } 803 + if (ep->rep_connected <= 0) { 804 + /* Sometimes, the only way to reliably connect to remote 805 + * CMs is to use same nonzero values for ORD and IRD. */ 806 + ep->rep_remote_cma.initiator_depth = 807 + ep->rep_remote_cma.responder_resources; 808 + if (ep->rep_remote_cma.initiator_depth == 0) 809 + ++ep->rep_remote_cma.initiator_depth; 810 + if (ep->rep_remote_cma.responder_resources == 0) 811 + ++ep->rep_remote_cma.responder_resources; 812 + if (retry_count++ == 0) 813 + goto retry; 814 + rc = ep->rep_connected; 815 + } else { 816 + dprintk("RPC: %s: connected\n", __func__); 817 + } 818 + 819 + out: 820 + if (rc) 821 + ep->rep_connected = rc; 822 + return rc; 823 + } 824 + 825 + /* 826 + * rpcrdma_ep_disconnect 827 + * 828 + * This is separate from destroy to facilitate the ability 829 + * to reconnect without recreating the endpoint. 830 + * 831 + * This call is not reentrant, and must not be made in parallel 832 + * on the same endpoint. 833 + */ 834 + int 835 + rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 836 + { 837 + int rc; 838 + 839 + rpcrdma_clean_cq(ep->rep_cq); 840 + rc = rdma_disconnect(ia->ri_id); 841 + if (!rc) { 842 + /* returns without wait if not connected */ 843 + wait_event_interruptible(ep->rep_connect_wait, 844 + ep->rep_connected != 1); 845 + dprintk("RPC: %s: after wait, %sconnected\n", __func__, 846 + (ep->rep_connected == 1) ? "still " : "dis"); 847 + } else { 848 + dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); 849 + ep->rep_connected = rc; 850 + } 851 + return rc; 852 + } 853 + 854 + /* 855 + * Initialize buffer memory 856 + */ 857 + int 858 + rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, 859 + struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) 860 + { 861 + char *p; 862 + size_t len; 863 + int i, rc; 864 + 865 + buf->rb_max_requests = cdata->max_requests; 866 + spin_lock_init(&buf->rb_lock); 867 + atomic_set(&buf->rb_credits, 1); 868 + 869 + /* Need to allocate: 870 + * 1. arrays for send and recv pointers 871 + * 2. arrays of struct rpcrdma_req to fill in pointers 872 + * 3. array of struct rpcrdma_rep for replies 873 + * 4. padding, if any 874 + * 5. 
mw's, if any 875 + * Send/recv buffers in req/rep need to be registered 876 + */ 877 + 878 + len = buf->rb_max_requests * 879 + (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); 880 + len += cdata->padding; 881 + switch (ia->ri_memreg_strategy) { 882 + case RPCRDMA_MTHCAFMR: 883 + /* TBD we are perhaps overallocating here */ 884 + len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * 885 + sizeof(struct rpcrdma_mw); 886 + break; 887 + case RPCRDMA_MEMWINDOWS_ASYNC: 888 + case RPCRDMA_MEMWINDOWS: 889 + len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * 890 + sizeof(struct rpcrdma_mw); 891 + break; 892 + default: 893 + break; 894 + } 895 + 896 + /* allocate 1, 4 and 5 in one shot */ 897 + p = kzalloc(len, GFP_KERNEL); 898 + if (p == NULL) { 899 + dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", 900 + __func__, len); 901 + rc = -ENOMEM; 902 + goto out; 903 + } 904 + buf->rb_pool = p; /* for freeing it later */ 905 + 906 + buf->rb_send_bufs = (struct rpcrdma_req **) p; 907 + p = (char *) &buf->rb_send_bufs[buf->rb_max_requests]; 908 + buf->rb_recv_bufs = (struct rpcrdma_rep **) p; 909 + p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; 910 + 911 + /* 912 + * Register the zeroed pad buffer, if any. 913 + */ 914 + if (cdata->padding) { 915 + rc = rpcrdma_register_internal(ia, p, cdata->padding, 916 + &ep->rep_pad_mr, &ep->rep_pad); 917 + if (rc) 918 + goto out; 919 + } 920 + p += cdata->padding; 921 + 922 + /* 923 + * Allocate the fmr's, or mw's for mw_bind chunk registration. 924 + * We "cycle" the mw's in order to minimize rkey reuse, 925 + * and also reduce unbind-to-bind collision. 926 + */ 927 + INIT_LIST_HEAD(&buf->rb_mws); 928 + switch (ia->ri_memreg_strategy) { 929 + case RPCRDMA_MTHCAFMR: 930 + { 931 + struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; 932 + struct ib_fmr_attr fa = { 933 + RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT 934 + }; 935 + /* TBD we are perhaps overallocating here */ 936 + for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { 937 + r->r.fmr = ib_alloc_fmr(ia->ri_pd, 938 + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, 939 + &fa); 940 + if (IS_ERR(r->r.fmr)) { 941 + rc = PTR_ERR(r->r.fmr); 942 + dprintk("RPC: %s: ib_alloc_fmr" 943 + " failed %i\n", __func__, rc); 944 + goto out; 945 + } 946 + list_add(&r->mw_list, &buf->rb_mws); 947 + ++r; 948 + } 949 + } 950 + break; 951 + case RPCRDMA_MEMWINDOWS_ASYNC: 952 + case RPCRDMA_MEMWINDOWS: 953 + { 954 + struct rpcrdma_mw *r = (struct rpcrdma_mw *)p; 955 + /* Allocate one extra request's worth, for full cycling */ 956 + for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { 957 + r->r.mw = ib_alloc_mw(ia->ri_pd); 958 + if (IS_ERR(r->r.mw)) { 959 + rc = PTR_ERR(r->r.mw); 960 + dprintk("RPC: %s: ib_alloc_mw" 961 + " failed %i\n", __func__, rc); 962 + goto out; 963 + } 964 + list_add(&r->mw_list, &buf->rb_mws); 965 + ++r; 966 + } 967 + } 968 + break; 969 + default: 970 + break; 971 + } 972 + 973 + /* 974 + * Allocate/init the request/reply buffers. Doing this 975 + * using kmalloc for now -- one for each buf. 976 + */ 977 + for (i = 0; i < buf->rb_max_requests; i++) { 978 + struct rpcrdma_req *req; 979 + struct rpcrdma_rep *rep; 980 + 981 + len = cdata->inline_wsize + sizeof(struct rpcrdma_req); 982 + /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! 
*/ 983 + /* Typical ~2400b, so rounding up saves work later */ 984 + if (len < 4096) 985 + len = 4096; 986 + req = kmalloc(len, GFP_KERNEL); 987 + if (req == NULL) { 988 + dprintk("RPC: %s: request buffer %d alloc" 989 + " failed\n", __func__, i); 990 + rc = -ENOMEM; 991 + goto out; 992 + } 993 + memset(req, 0, sizeof(struct rpcrdma_req)); 994 + buf->rb_send_bufs[i] = req; 995 + buf->rb_send_bufs[i]->rl_buffer = buf; 996 + 997 + rc = rpcrdma_register_internal(ia, req->rl_base, 998 + len - offsetof(struct rpcrdma_req, rl_base), 999 + &buf->rb_send_bufs[i]->rl_handle, 1000 + &buf->rb_send_bufs[i]->rl_iov); 1001 + if (rc) 1002 + goto out; 1003 + 1004 + buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req); 1005 + 1006 + len = cdata->inline_rsize + sizeof(struct rpcrdma_rep); 1007 + rep = kmalloc(len, GFP_KERNEL); 1008 + if (rep == NULL) { 1009 + dprintk("RPC: %s: reply buffer %d alloc failed\n", 1010 + __func__, i); 1011 + rc = -ENOMEM; 1012 + goto out; 1013 + } 1014 + memset(rep, 0, sizeof(struct rpcrdma_rep)); 1015 + buf->rb_recv_bufs[i] = rep; 1016 + buf->rb_recv_bufs[i]->rr_buffer = buf; 1017 + init_waitqueue_head(&rep->rr_unbind); 1018 + 1019 + rc = rpcrdma_register_internal(ia, rep->rr_base, 1020 + len - offsetof(struct rpcrdma_rep, rr_base), 1021 + &buf->rb_recv_bufs[i]->rr_handle, 1022 + &buf->rb_recv_bufs[i]->rr_iov); 1023 + if (rc) 1024 + goto out; 1025 + 1026 + } 1027 + dprintk("RPC: %s: max_requests %d\n", 1028 + __func__, buf->rb_max_requests); 1029 + /* done */ 1030 + return 0; 1031 + out: 1032 + rpcrdma_buffer_destroy(buf); 1033 + return rc; 1034 + } 1035 + 1036 + /* 1037 + * Unregister and destroy buffer memory. Need to deal with 1038 + * partial initialization, so it's callable from failed create. 1039 + * Must be called before destroying endpoint, as registrations 1040 + * reference it. 1041 + */ 1042 + void 1043 + rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1044 + { 1045 + int rc, i; 1046 + struct rpcrdma_ia *ia = rdmab_to_ia(buf); 1047 + 1048 + /* clean up in reverse order from create 1049 + * 1. recv mr memory (mr free, then kfree) 1050 + * 1a. bind mw memory 1051 + * 2. send mr memory (mr free, then kfree) 1052 + * 3. padding (if any) [moved to rpcrdma_ep_destroy] 1053 + * 4. 
arrays 1054 + */ 1055 + dprintk("RPC: %s: entering\n", __func__); 1056 + 1057 + for (i = 0; i < buf->rb_max_requests; i++) { 1058 + if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) { 1059 + rpcrdma_deregister_internal(ia, 1060 + buf->rb_recv_bufs[i]->rr_handle, 1061 + &buf->rb_recv_bufs[i]->rr_iov); 1062 + kfree(buf->rb_recv_bufs[i]); 1063 + } 1064 + if (buf->rb_send_bufs && buf->rb_send_bufs[i]) { 1065 + while (!list_empty(&buf->rb_mws)) { 1066 + struct rpcrdma_mw *r; 1067 + r = list_entry(buf->rb_mws.next, 1068 + struct rpcrdma_mw, mw_list); 1069 + list_del(&r->mw_list); 1070 + switch (ia->ri_memreg_strategy) { 1071 + case RPCRDMA_MTHCAFMR: 1072 + rc = ib_dealloc_fmr(r->r.fmr); 1073 + if (rc) 1074 + dprintk("RPC: %s:" 1075 + " ib_dealloc_fmr" 1076 + " failed %i\n", 1077 + __func__, rc); 1078 + break; 1079 + case RPCRDMA_MEMWINDOWS_ASYNC: 1080 + case RPCRDMA_MEMWINDOWS: 1081 + rc = ib_dealloc_mw(r->r.mw); 1082 + if (rc) 1083 + dprintk("RPC: %s:" 1084 + " ib_dealloc_mw" 1085 + " failed %i\n", 1086 + __func__, rc); 1087 + break; 1088 + default: 1089 + break; 1090 + } 1091 + } 1092 + rpcrdma_deregister_internal(ia, 1093 + buf->rb_send_bufs[i]->rl_handle, 1094 + &buf->rb_send_bufs[i]->rl_iov); 1095 + kfree(buf->rb_send_bufs[i]); 1096 + } 1097 + } 1098 + 1099 + kfree(buf->rb_pool); 1100 + } 1101 + 1102 + /* 1103 + * Get a set of request/reply buffers. 1104 + * 1105 + * Reply buffer (if needed) is attached to send buffer upon return. 1106 + * Rule: 1107 + * rb_send_index and rb_recv_index MUST always be pointing to the 1108 + * *next* available buffer (non-NULL). They are incremented after 1109 + * removing buffers, and decremented *before* returning them. 1110 + */ 1111 + struct rpcrdma_req * 1112 + rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) 1113 + { 1114 + struct rpcrdma_req *req; 1115 + unsigned long flags; 1116 + 1117 + spin_lock_irqsave(&buffers->rb_lock, flags); 1118 + if (buffers->rb_send_index == buffers->rb_max_requests) { 1119 + spin_unlock_irqrestore(&buffers->rb_lock, flags); 1120 + dprintk("RPC: %s: out of request buffers\n", __func__); 1121 + return ((struct rpcrdma_req *)NULL); 1122 + } 1123 + 1124 + req = buffers->rb_send_bufs[buffers->rb_send_index]; 1125 + if (buffers->rb_send_index < buffers->rb_recv_index) { 1126 + dprintk("RPC: %s: %d extra receives outstanding (ok)\n", 1127 + __func__, 1128 + buffers->rb_recv_index - buffers->rb_send_index); 1129 + req->rl_reply = NULL; 1130 + } else { 1131 + req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; 1132 + buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; 1133 + } 1134 + buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; 1135 + if (!list_empty(&buffers->rb_mws)) { 1136 + int i = RPCRDMA_MAX_SEGS - 1; 1137 + do { 1138 + struct rpcrdma_mw *r; 1139 + r = list_entry(buffers->rb_mws.next, 1140 + struct rpcrdma_mw, mw_list); 1141 + list_del(&r->mw_list); 1142 + req->rl_segments[i].mr_chunk.rl_mw = r; 1143 + } while (--i >= 0); 1144 + } 1145 + spin_unlock_irqrestore(&buffers->rb_lock, flags); 1146 + return req; 1147 + } 1148 + 1149 + /* 1150 + * Put request/reply buffers back into pool. 1151 + * Pre-decrement counter/array index. 
1152 + */ 1153 + void 1154 + rpcrdma_buffer_put(struct rpcrdma_req *req) 1155 + { 1156 + struct rpcrdma_buffer *buffers = req->rl_buffer; 1157 + struct rpcrdma_ia *ia = rdmab_to_ia(buffers); 1158 + int i; 1159 + unsigned long flags; 1160 + 1161 + BUG_ON(req->rl_nchunks != 0); 1162 + spin_lock_irqsave(&buffers->rb_lock, flags); 1163 + buffers->rb_send_bufs[--buffers->rb_send_index] = req; 1164 + req->rl_niovs = 0; 1165 + if (req->rl_reply) { 1166 + buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; 1167 + init_waitqueue_head(&req->rl_reply->rr_unbind); 1168 + req->rl_reply->rr_func = NULL; 1169 + req->rl_reply = NULL; 1170 + } 1171 + switch (ia->ri_memreg_strategy) { 1172 + case RPCRDMA_MTHCAFMR: 1173 + case RPCRDMA_MEMWINDOWS_ASYNC: 1174 + case RPCRDMA_MEMWINDOWS: 1175 + /* 1176 + * Cycle mw's back in reverse order, and "spin" them. 1177 + * This delays and scrambles reuse as much as possible. 1178 + */ 1179 + i = 1; 1180 + do { 1181 + struct rpcrdma_mw **mw; 1182 + mw = &req->rl_segments[i].mr_chunk.rl_mw; 1183 + list_add_tail(&(*mw)->mw_list, &buffers->rb_mws); 1184 + *mw = NULL; 1185 + } while (++i < RPCRDMA_MAX_SEGS); 1186 + list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list, 1187 + &buffers->rb_mws); 1188 + req->rl_segments[0].mr_chunk.rl_mw = NULL; 1189 + break; 1190 + default: 1191 + break; 1192 + } 1193 + spin_unlock_irqrestore(&buffers->rb_lock, flags); 1194 + } 1195 + 1196 + /* 1197 + * Recover reply buffers from pool. 1198 + * This happens when recovering from error conditions. 1199 + * Post-increment counter/array index. 1200 + */ 1201 + void 1202 + rpcrdma_recv_buffer_get(struct rpcrdma_req *req) 1203 + { 1204 + struct rpcrdma_buffer *buffers = req->rl_buffer; 1205 + unsigned long flags; 1206 + 1207 + if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */ 1208 + buffers = ((struct rpcrdma_req *) buffers)->rl_buffer; 1209 + spin_lock_irqsave(&buffers->rb_lock, flags); 1210 + if (buffers->rb_recv_index < buffers->rb_max_requests) { 1211 + req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; 1212 + buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; 1213 + } 1214 + spin_unlock_irqrestore(&buffers->rb_lock, flags); 1215 + } 1216 + 1217 + /* 1218 + * Put reply buffers back into pool when not attached to 1219 + * request. This happens in error conditions, and when 1220 + * aborting unbinds. Pre-decrement counter/array index. 1221 + */ 1222 + void 1223 + rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) 1224 + { 1225 + struct rpcrdma_buffer *buffers = rep->rr_buffer; 1226 + unsigned long flags; 1227 + 1228 + rep->rr_func = NULL; 1229 + spin_lock_irqsave(&buffers->rb_lock, flags); 1230 + buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep; 1231 + spin_unlock_irqrestore(&buffers->rb_lock, flags); 1232 + } 1233 + 1234 + /* 1235 + * Wrappers for internal-use kmalloc memory registration, used by buffer code. 1236 + */ 1237 + 1238 + int 1239 + rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, 1240 + struct ib_mr **mrp, struct ib_sge *iov) 1241 + { 1242 + struct ib_phys_buf ipb; 1243 + struct ib_mr *mr; 1244 + int rc; 1245 + 1246 + /* 1247 + * All memory passed here was kmalloc'ed, therefore phys-contiguous. 
1248 + */ 1249 + iov->addr = ib_dma_map_single(ia->ri_id->device, 1250 + va, len, DMA_BIDIRECTIONAL); 1251 + iov->length = len; 1252 + 1253 + if (ia->ri_bind_mem != NULL) { 1254 + *mrp = NULL; 1255 + iov->lkey = ia->ri_bind_mem->lkey; 1256 + return 0; 1257 + } 1258 + 1259 + ipb.addr = iov->addr; 1260 + ipb.size = iov->length; 1261 + mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1, 1262 + IB_ACCESS_LOCAL_WRITE, &iov->addr); 1263 + 1264 + dprintk("RPC: %s: phys convert: 0x%llx " 1265 + "registered 0x%llx length %d\n", 1266 + __func__, ipb.addr, iov->addr, len); 1267 + 1268 + if (IS_ERR(mr)) { 1269 + *mrp = NULL; 1270 + rc = PTR_ERR(mr); 1271 + dprintk("RPC: %s: failed with %i\n", __func__, rc); 1272 + } else { 1273 + *mrp = mr; 1274 + iov->lkey = mr->lkey; 1275 + rc = 0; 1276 + } 1277 + 1278 + return rc; 1279 + } 1280 + 1281 + int 1282 + rpcrdma_deregister_internal(struct rpcrdma_ia *ia, 1283 + struct ib_mr *mr, struct ib_sge *iov) 1284 + { 1285 + int rc; 1286 + 1287 + ib_dma_unmap_single(ia->ri_id->device, 1288 + iov->addr, iov->length, DMA_BIDIRECTIONAL); 1289 + 1290 + if (NULL == mr) 1291 + return 0; 1292 + 1293 + rc = ib_dereg_mr(mr); 1294 + if (rc) 1295 + dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc); 1296 + return rc; 1297 + } 1298 + 1299 + /* 1300 + * Wrappers for chunk registration, shared by read/write chunk code. 1301 + */ 1302 + 1303 + static void 1304 + rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) 1305 + { 1306 + seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; 1307 + seg->mr_dmalen = seg->mr_len; 1308 + if (seg->mr_page) 1309 + seg->mr_dma = ib_dma_map_page(ia->ri_id->device, 1310 + seg->mr_page, offset_in_page(seg->mr_offset), 1311 + seg->mr_dmalen, seg->mr_dir); 1312 + else 1313 + seg->mr_dma = ib_dma_map_single(ia->ri_id->device, 1314 + seg->mr_offset, 1315 + seg->mr_dmalen, seg->mr_dir); 1316 + } 1317 + 1318 + static void 1319 + rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) 1320 + { 1321 + if (seg->mr_page) 1322 + ib_dma_unmap_page(ia->ri_id->device, 1323 + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); 1324 + else 1325 + ib_dma_unmap_single(ia->ri_id->device, 1326 + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); 1327 + } 1328 + 1329 + int 1330 + rpcrdma_register_external(struct rpcrdma_mr_seg *seg, 1331 + int nsegs, int writing, struct rpcrdma_xprt *r_xprt) 1332 + { 1333 + struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1334 + int mem_priv = (writing ? 
IB_ACCESS_REMOTE_WRITE : 1335 + IB_ACCESS_REMOTE_READ); 1336 + struct rpcrdma_mr_seg *seg1 = seg; 1337 + int i; 1338 + int rc = 0; 1339 + 1340 + switch (ia->ri_memreg_strategy) { 1341 + 1342 + #if RPCRDMA_PERSISTENT_REGISTRATION 1343 + case RPCRDMA_ALLPHYSICAL: 1344 + rpcrdma_map_one(ia, seg, writing); 1345 + seg->mr_rkey = ia->ri_bind_mem->rkey; 1346 + seg->mr_base = seg->mr_dma; 1347 + seg->mr_nsegs = 1; 1348 + nsegs = 1; 1349 + break; 1350 + #endif 1351 + 1352 + /* Registration using fast memory registration */ 1353 + case RPCRDMA_MTHCAFMR: 1354 + { 1355 + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; 1356 + int len, pageoff = offset_in_page(seg->mr_offset); 1357 + seg1->mr_offset -= pageoff; /* start of page */ 1358 + seg1->mr_len += pageoff; 1359 + len = -pageoff; 1360 + if (nsegs > RPCRDMA_MAX_DATA_SEGS) 1361 + nsegs = RPCRDMA_MAX_DATA_SEGS; 1362 + for (i = 0; i < nsegs;) { 1363 + rpcrdma_map_one(ia, seg, writing); 1364 + physaddrs[i] = seg->mr_dma; 1365 + len += seg->mr_len; 1366 + ++seg; 1367 + ++i; 1368 + /* Check for holes */ 1369 + if ((i < nsegs && offset_in_page(seg->mr_offset)) || 1370 + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) 1371 + break; 1372 + } 1373 + nsegs = i; 1374 + rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr, 1375 + physaddrs, nsegs, seg1->mr_dma); 1376 + if (rc) { 1377 + dprintk("RPC: %s: failed ib_map_phys_fmr " 1378 + "%u@0x%llx+%i (%d)... status %i\n", __func__, 1379 + len, (unsigned long long)seg1->mr_dma, 1380 + pageoff, nsegs, rc); 1381 + while (nsegs--) 1382 + rpcrdma_unmap_one(ia, --seg); 1383 + } else { 1384 + seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey; 1385 + seg1->mr_base = seg1->mr_dma + pageoff; 1386 + seg1->mr_nsegs = nsegs; 1387 + seg1->mr_len = len; 1388 + } 1389 + } 1390 + break; 1391 + 1392 + /* Registration using memory windows */ 1393 + case RPCRDMA_MEMWINDOWS_ASYNC: 1394 + case RPCRDMA_MEMWINDOWS: 1395 + { 1396 + struct ib_mw_bind param; 1397 + rpcrdma_map_one(ia, seg, writing); 1398 + param.mr = ia->ri_bind_mem; 1399 + param.wr_id = 0ULL; /* no send cookie */ 1400 + param.addr = seg->mr_dma; 1401 + param.length = seg->mr_len; 1402 + param.send_flags = 0; 1403 + param.mw_access_flags = mem_priv; 1404 + 1405 + DECR_CQCOUNT(&r_xprt->rx_ep); 1406 + rc = ib_bind_mw(ia->ri_id->qp, 1407 + seg->mr_chunk.rl_mw->r.mw, &param); 1408 + if (rc) { 1409 + dprintk("RPC: %s: failed ib_bind_mw " 1410 + "%u@0x%llx status %i\n", 1411 + __func__, seg->mr_len, 1412 + (unsigned long long)seg->mr_dma, rc); 1413 + rpcrdma_unmap_one(ia, seg); 1414 + } else { 1415 + seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey; 1416 + seg->mr_base = param.addr; 1417 + seg->mr_nsegs = 1; 1418 + nsegs = 1; 1419 + } 1420 + } 1421 + break; 1422 + 1423 + /* Default registration each time */ 1424 + default: 1425 + { 1426 + struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS]; 1427 + int len = 0; 1428 + if (nsegs > RPCRDMA_MAX_DATA_SEGS) 1429 + nsegs = RPCRDMA_MAX_DATA_SEGS; 1430 + for (i = 0; i < nsegs;) { 1431 + rpcrdma_map_one(ia, seg, writing); 1432 + ipb[i].addr = seg->mr_dma; 1433 + ipb[i].size = seg->mr_len; 1434 + len += seg->mr_len; 1435 + ++seg; 1436 + ++i; 1437 + /* Check for holes */ 1438 + if ((i < nsegs && offset_in_page(seg->mr_offset)) || 1439 + offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len)) 1440 + break; 1441 + } 1442 + nsegs = i; 1443 + seg1->mr_base = seg1->mr_dma; 1444 + seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd, 1445 + ipb, nsegs, mem_priv, &seg1->mr_base); 1446 + if (IS_ERR(seg1->mr_chunk.rl_mr)) { 1447 + rc = PTR_ERR(seg1->mr_chunk.rl_mr); 1448 + 
dprintk("RPC: %s: failed ib_reg_phys_mr " 1449 + "%u@0x%llx (%d)... status %i\n", 1450 + __func__, len, 1451 + (unsigned long long)seg1->mr_dma, nsegs, rc); 1452 + while (nsegs--) 1453 + rpcrdma_unmap_one(ia, --seg); 1454 + } else { 1455 + seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey; 1456 + seg1->mr_nsegs = nsegs; 1457 + seg1->mr_len = len; 1458 + } 1459 + } 1460 + break; 1461 + } 1462 + if (rc) 1463 + return -1; 1464 + 1465 + return nsegs; 1466 + } 1467 + 1468 + int 1469 + rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, 1470 + struct rpcrdma_xprt *r_xprt, void *r) 1471 + { 1472 + struct rpcrdma_ia *ia = &r_xprt->rx_ia; 1473 + struct rpcrdma_mr_seg *seg1 = seg; 1474 + int nsegs = seg->mr_nsegs, rc; 1475 + 1476 + switch (ia->ri_memreg_strategy) { 1477 + 1478 + #if RPCRDMA_PERSISTENT_REGISTRATION 1479 + case RPCRDMA_ALLPHYSICAL: 1480 + BUG_ON(nsegs != 1); 1481 + rpcrdma_unmap_one(ia, seg); 1482 + rc = 0; 1483 + break; 1484 + #endif 1485 + 1486 + case RPCRDMA_MTHCAFMR: 1487 + { 1488 + LIST_HEAD(l); 1489 + list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l); 1490 + rc = ib_unmap_fmr(&l); 1491 + while (seg1->mr_nsegs--) 1492 + rpcrdma_unmap_one(ia, seg++); 1493 + } 1494 + if (rc) 1495 + dprintk("RPC: %s: failed ib_unmap_fmr," 1496 + " status %i\n", __func__, rc); 1497 + break; 1498 + 1499 + case RPCRDMA_MEMWINDOWS_ASYNC: 1500 + case RPCRDMA_MEMWINDOWS: 1501 + { 1502 + struct ib_mw_bind param; 1503 + BUG_ON(nsegs != 1); 1504 + param.mr = ia->ri_bind_mem; 1505 + param.addr = 0ULL; /* unbind */ 1506 + param.length = 0; 1507 + param.mw_access_flags = 0; 1508 + if (r) { 1509 + param.wr_id = (u64) (unsigned long) r; 1510 + param.send_flags = IB_SEND_SIGNALED; 1511 + INIT_CQCOUNT(&r_xprt->rx_ep); 1512 + } else { 1513 + param.wr_id = 0ULL; 1514 + param.send_flags = 0; 1515 + DECR_CQCOUNT(&r_xprt->rx_ep); 1516 + } 1517 + rc = ib_bind_mw(ia->ri_id->qp, 1518 + seg->mr_chunk.rl_mw->r.mw, &param); 1519 + rpcrdma_unmap_one(ia, seg); 1520 + } 1521 + if (rc) 1522 + dprintk("RPC: %s: failed ib_(un)bind_mw," 1523 + " status %i\n", __func__, rc); 1524 + else 1525 + r = NULL; /* will upcall on completion */ 1526 + break; 1527 + 1528 + default: 1529 + rc = ib_dereg_mr(seg1->mr_chunk.rl_mr); 1530 + seg1->mr_chunk.rl_mr = NULL; 1531 + while (seg1->mr_nsegs--) 1532 + rpcrdma_unmap_one(ia, seg++); 1533 + if (rc) 1534 + dprintk("RPC: %s: failed ib_dereg_mr," 1535 + " status %i\n", __func__, rc); 1536 + break; 1537 + } 1538 + if (r) { 1539 + struct rpcrdma_rep *rep = r; 1540 + void (*func)(struct rpcrdma_rep *) = rep->rr_func; 1541 + rep->rr_func = NULL; 1542 + func(rep); /* dereg done, callback now */ 1543 + } 1544 + return nsegs; 1545 + } 1546 + 1547 + /* 1548 + * Prepost any receive buffer, then post send. 1549 + * 1550 + * Receive buffer is donated to hardware, reclaimed upon recv completion. 
1551 + */ 1552 + int 1553 + rpcrdma_ep_post(struct rpcrdma_ia *ia, 1554 + struct rpcrdma_ep *ep, 1555 + struct rpcrdma_req *req) 1556 + { 1557 + struct ib_send_wr send_wr, *send_wr_fail; 1558 + struct rpcrdma_rep *rep = req->rl_reply; 1559 + int rc; 1560 + 1561 + if (rep) { 1562 + rc = rpcrdma_ep_post_recv(ia, ep, rep); 1563 + if (rc) 1564 + goto out; 1565 + req->rl_reply = NULL; 1566 + } 1567 + 1568 + send_wr.next = NULL; 1569 + send_wr.wr_id = 0ULL; /* no send cookie */ 1570 + send_wr.sg_list = req->rl_send_iov; 1571 + send_wr.num_sge = req->rl_niovs; 1572 + send_wr.opcode = IB_WR_SEND; 1573 + send_wr.imm_data = 0; 1574 + if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */ 1575 + ib_dma_sync_single_for_device(ia->ri_id->device, 1576 + req->rl_send_iov[3].addr, req->rl_send_iov[3].length, 1577 + DMA_TO_DEVICE); 1578 + ib_dma_sync_single_for_device(ia->ri_id->device, 1579 + req->rl_send_iov[1].addr, req->rl_send_iov[1].length, 1580 + DMA_TO_DEVICE); 1581 + ib_dma_sync_single_for_device(ia->ri_id->device, 1582 + req->rl_send_iov[0].addr, req->rl_send_iov[0].length, 1583 + DMA_TO_DEVICE); 1584 + 1585 + if (DECR_CQCOUNT(ep) > 0) 1586 + send_wr.send_flags = 0; 1587 + else { /* Provider must take a send completion every now and then */ 1588 + INIT_CQCOUNT(ep); 1589 + send_wr.send_flags = IB_SEND_SIGNALED; 1590 + } 1591 + 1592 + rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); 1593 + if (rc) 1594 + dprintk("RPC: %s: ib_post_send returned %i\n", __func__, 1595 + rc); 1596 + out: 1597 + return rc; 1598 + } 1599 + 1600 + /* 1601 + * (Re)post a receive buffer. 1602 + */ 1603 + int 1604 + rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, 1605 + struct rpcrdma_ep *ep, 1606 + struct rpcrdma_rep *rep) 1607 + { 1608 + struct ib_recv_wr recv_wr, *recv_wr_fail; 1609 + int rc; 1610 + 1611 + recv_wr.next = NULL; 1612 + recv_wr.wr_id = (u64) (unsigned long) rep; 1613 + recv_wr.sg_list = &rep->rr_iov; 1614 + recv_wr.num_sge = 1; 1615 + 1616 + ib_dma_sync_single_for_cpu(ia->ri_id->device, 1617 + rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL); 1618 + 1619 + DECR_CQCOUNT(ep); 1620 + rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); 1621 + 1622 + if (rc) 1623 + dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, 1624 + rc); 1625 + return rc; 1626 + }
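
rpcrdma_ep_post() above suppresses most send completions and only sets IB_SEND_SIGNALED when DECR_CQCOUNT() drops to zero, so the provider still gets to retire its send queue periodically. The following stand-alone sketch (not kernel code; the demo_* names are invented for illustration) models just that counter discipline:

/*
 * Editor's sketch of the INIT_CQCOUNT/DECR_CQCOUNT throttle: sends are
 * normally unsignaled, but every cqinit-th send requests a completion.
 */
#include <stdio.h>
#include <stdatomic.h>
#include <stdbool.h>

struct demo_ep {
	int cqinit;		/* one signaled send per this many posts */
	atomic_int cqcount;	/* counts down toward the next signaled send */
};

static void demo_init_cqcount(struct demo_ep *ep)
{
	atomic_store(&ep->cqcount, ep->cqinit);
}

/* returns true when this send should carry the "signaled" flag */
static bool demo_post_send(struct demo_ep *ep)
{
	/* post-decrement value, like atomic_sub_return(1, ...) */
	if (atomic_fetch_sub(&ep->cqcount, 1) - 1 > 0)
		return false;		/* suppress the completion */
	demo_init_cqcount(ep);		/* reset, and ask for a completion */
	return true;
}

int main(void)
{
	struct demo_ep ep = { .cqinit = 4 };
	demo_init_cqcount(&ep);

	for (int i = 1; i <= 10; i++)
		printf("send %2d: %s\n", i,
		       demo_post_send(&ep) ? "SIGNALED" : "unsignaled");
	return 0;
}

With rep_cqinit derived from half the send-queue depth, as rpcrdma_ep_create() does above, roughly one send in every rep_cqinit requests a completion; a rep_cqinit of zero (the small-queue case above) degenerates to signaling every send.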
+330
net/sunrpc/xprtrdma/xprt_rdma.h
··· 1 + /* 2 + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 3 + * 4 + * This software is available to you under a choice of one of two 5 + * licenses. You may choose to be licensed under the terms of the GNU 6 + * General Public License (GPL) Version 2, available from the file 7 + * COPYING in the main directory of this source tree, or the BSD-type 8 + * license below: 9 + * 10 + * Redistribution and use in source and binary forms, with or without 11 + * modification, are permitted provided that the following conditions 12 + * are met: 13 + * 14 + * Redistributions of source code must retain the above copyright 15 + * notice, this list of conditions and the following disclaimer. 16 + * 17 + * Redistributions in binary form must reproduce the above 18 + * copyright notice, this list of conditions and the following 19 + * disclaimer in the documentation and/or other materials provided 20 + * with the distribution. 21 + * 22 + * Neither the name of the Network Appliance, Inc. nor the names of 23 + * its contributors may be used to endorse or promote products 24 + * derived from this software without specific prior written 25 + * permission. 26 + * 27 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
38 + */ 39 + 40 + #ifndef _LINUX_SUNRPC_XPRT_RDMA_H 41 + #define _LINUX_SUNRPC_XPRT_RDMA_H 42 + 43 + #include <linux/wait.h> /* wait_queue_head_t, etc */ 44 + #include <linux/spinlock.h> /* spinlock_t, etc */ 45 + #include <asm/atomic.h> /* atomic_t, etc */ 46 + 47 + #include <rdma/rdma_cm.h> /* RDMA connection api */ 48 + #include <rdma/ib_verbs.h> /* RDMA verbs api */ 49 + 50 + #include <linux/sunrpc/clnt.h> /* rpc_xprt */ 51 + #include <linux/sunrpc/rpc_rdma.h> /* RPC/RDMA protocol */ 52 + #include <linux/sunrpc/xprtrdma.h> /* xprt parameters */ 53 + 54 + /* 55 + * Interface Adapter -- one per transport instance 56 + */ 57 + struct rpcrdma_ia { 58 + struct rdma_cm_id *ri_id; 59 + struct ib_pd *ri_pd; 60 + struct ib_mr *ri_bind_mem; 61 + struct completion ri_done; 62 + int ri_async_rc; 63 + enum rpcrdma_memreg ri_memreg_strategy; 64 + }; 65 + 66 + /* 67 + * RDMA Endpoint -- one per transport instance 68 + */ 69 + 70 + struct rpcrdma_ep { 71 + atomic_t rep_cqcount; 72 + int rep_cqinit; 73 + int rep_connected; 74 + struct rpcrdma_ia *rep_ia; 75 + struct ib_cq *rep_cq; 76 + struct ib_qp_init_attr rep_attr; 77 + wait_queue_head_t rep_connect_wait; 78 + struct ib_sge rep_pad; /* holds zeroed pad */ 79 + struct ib_mr *rep_pad_mr; /* holds zeroed pad */ 80 + void (*rep_func)(struct rpcrdma_ep *); 81 + struct rpc_xprt *rep_xprt; /* for rep_func */ 82 + struct rdma_conn_param rep_remote_cma; 83 + struct sockaddr_storage rep_remote_addr; 84 + }; 85 + 86 + #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) 87 + #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) 88 + 89 + /* 90 + * struct rpcrdma_rep -- this structure encapsulates state required to recv 91 + * and complete a reply, asychronously. It needs several pieces of 92 + * state: 93 + * o recv buffer (posted to provider) 94 + * o ib_sge (also donated to provider) 95 + * o status of reply (length, success or not) 96 + * o bookkeeping state to get run by tasklet (list, etc) 97 + * 98 + * These are allocated during initialization, per-transport instance; 99 + * however, the tasklet execution list itself is global, as it should 100 + * always be pretty short. 101 + * 102 + * N of these are associated with a transport instance, and stored in 103 + * struct rpcrdma_buffer. N is the max number of outstanding requests. 104 + */ 105 + 106 + /* temporary static scatter/gather max */ 107 + #define RPCRDMA_MAX_DATA_SEGS (8) /* max scatter/gather */ 108 + #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ 109 + #define MAX_RPCRDMAHDR (\ 110 + /* max supported RPC/RDMA header */ \ 111 + sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \ 112 + (sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32)) 113 + 114 + struct rpcrdma_buffer; 115 + 116 + struct rpcrdma_rep { 117 + unsigned int rr_len; /* actual received reply length */ 118 + struct rpcrdma_buffer *rr_buffer; /* home base for this structure */ 119 + struct rpc_xprt *rr_xprt; /* needed for request/reply matching */ 120 + void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */ 121 + struct list_head rr_list; /* tasklet list */ 122 + wait_queue_head_t rr_unbind; /* optional unbind wait */ 123 + struct ib_sge rr_iov; /* for posting */ 124 + struct ib_mr *rr_handle; /* handle for mem in rr_iov */ 125 + char rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */ 126 + }; 127 + 128 + /* 129 + * struct rpcrdma_req -- structure central to the request/reply sequence. 
130 + * 131 + * N of these are associated with a transport instance, and stored in 132 + * struct rpcrdma_buffer. N is the max number of outstanding requests. 133 + * 134 + * It includes pre-registered buffer memory for send AND recv. 135 + * The recv buffer, however, is not owned by this structure, and 136 + * is "donated" to the hardware when a recv is posted. When a 137 + * reply is handled, the recv buffer used is given back to the 138 + * struct rpcrdma_req associated with the request. 139 + * 140 + * In addition to the basic memory, this structure includes an array 141 + * of iovs for send operations. The reason is that the iovs passed to 142 + * ib_post_{send,recv} must not be modified until the work request 143 + * completes. 144 + * 145 + * NOTES: 146 + * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we 147 + * marshal. The number needed varies depending on the iov lists that 148 + * are passed to us, the memory registration mode we are in, and if 149 + * physical addressing is used, the layout. 150 + */ 151 + 152 + struct rpcrdma_mr_seg { /* chunk descriptors */ 153 + union { /* chunk memory handles */ 154 + struct ib_mr *rl_mr; /* if registered directly */ 155 + struct rpcrdma_mw { /* if registered from region */ 156 + union { 157 + struct ib_mw *mw; 158 + struct ib_fmr *fmr; 159 + } r; 160 + struct list_head mw_list; 161 + } *rl_mw; 162 + } mr_chunk; 163 + u64 mr_base; /* registration result */ 164 + u32 mr_rkey; /* registration result */ 165 + u32 mr_len; /* length of chunk or segment */ 166 + int mr_nsegs; /* number of segments in chunk or 0 */ 167 + enum dma_data_direction mr_dir; /* segment mapping direction */ 168 + dma_addr_t mr_dma; /* segment mapping address */ 169 + size_t mr_dmalen; /* segment mapping length */ 170 + struct page *mr_page; /* owning page, if any */ 171 + char *mr_offset; /* kva if no page, else offset */ 172 + }; 173 + 174 + struct rpcrdma_req { 175 + size_t rl_size; /* actual length of buffer */ 176 + unsigned int rl_niovs; /* 0, 2 or 4 */ 177 + unsigned int rl_nchunks; /* non-zero if chunks */ 178 + struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ 179 + struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ 180 + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ 181 + struct ib_sge rl_send_iov[4]; /* for active requests */ 182 + struct ib_sge rl_iov; /* for posting */ 183 + struct ib_mr *rl_handle; /* handle for mem in rl_iov */ 184 + char rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */ 185 + __u32 rl_xdr_buf[0]; /* start of returned rpc rq_buffer */ 186 + }; 187 + #define rpcr_to_rdmar(r) \ 188 + container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0]) 189 + 190 + /* 191 + * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for 192 + * inline requests/replies, and client/server credits. 
193 + *
194 + * One of these is associated with a transport instance
195 + */
196 + struct rpcrdma_buffer {
197 + spinlock_t rb_lock; /* protects indexes */
198 + atomic_t rb_credits; /* most recent server credits */
199 + unsigned long rb_cwndscale; /* cached framework rpc_cwndscale */
200 + int rb_max_requests;/* client max requests */
201 + struct list_head rb_mws; /* optional memory windows/fmrs */
202 + int rb_send_index;
203 + struct rpcrdma_req **rb_send_bufs;
204 + int rb_recv_index;
205 + struct rpcrdma_rep **rb_recv_bufs;
206 + char *rb_pool;
207 + };
208 + #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
209 +
210 + /*
211 + * Internal structure for transport instance creation. This
212 + * exists primarily for modularity.
213 + *
214 + * This data should be set with mount options
215 + */
216 + struct rpcrdma_create_data_internal {
217 + struct sockaddr_storage addr; /* RDMA server address */
218 + unsigned int max_requests; /* max requests (slots) in flight */
219 + unsigned int rsize; /* mount rsize - max read hdr+data */
220 + unsigned int wsize; /* mount wsize - max write hdr+data */
221 + unsigned int inline_rsize; /* max non-rdma read data payload */
222 + unsigned int inline_wsize; /* max non-rdma write data payload */
223 + unsigned int padding; /* non-rdma write header padding */
224 + };
225 +
226 + #define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
227 + (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize)
228 +
229 + #define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
230 + (rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize)
231 +
232 + #define RPCRDMA_INLINE_PAD_VALUE(rq)\
233 + rpcx_to_rdmad(rq->rq_task->tk_xprt).padding
234 +
235 + /*
236 + * Statistics for RPCRDMA
237 + */
238 + struct rpcrdma_stats {
239 + unsigned long read_chunk_count;
240 + unsigned long write_chunk_count;
241 + unsigned long reply_chunk_count;
242 +
243 + unsigned long long total_rdma_request;
244 + unsigned long long total_rdma_reply;
245 +
246 + unsigned long long pullup_copy_count;
247 + unsigned long long fixup_copy_count;
248 + unsigned long hardway_register_count;
249 + unsigned long failed_marshal_count;
250 + unsigned long bad_reply_count;
251 + };
252 +
253 + /*
254 + * RPCRDMA transport -- encapsulates the structures above for
255 + * integration with RPC.
256 + *
257 + * The contained structures are embedded, not pointers,
258 + * for convenience. This structure need not be visible externally.
259 + *
260 + * It is allocated and initialized during mount, and released
261 + * during unmount.
262 + */
263 + struct rpcrdma_xprt {
264 + struct rpc_xprt xprt;
265 + struct rpcrdma_ia rx_ia;
266 + struct rpcrdma_ep rx_ep;
267 + struct rpcrdma_buffer rx_buf;
268 + struct rpcrdma_create_data_internal rx_data;
269 + struct delayed_work rdma_connect;
270 + struct rpcrdma_stats rx_stats;
271 + };
272 +
273 + #define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
274 + #define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)
275 +
276 + /*
277 + * Interface Adapter calls - xprtrdma/verbs.c
278 + */
279 + int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
280 + void rpcrdma_ia_close(struct rpcrdma_ia *);
281 +
282 + /*
283 + * Endpoint calls - xprtrdma/verbs.c
284 + */
285 + int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
286 + struct rpcrdma_create_data_internal *);
287 + int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
288 + int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
289 + int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
290 +
291 + int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
292 + struct rpcrdma_req *);
293 + int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
294 + struct rpcrdma_rep *);
295 +
296 + /*
297 + * Buffer calls - xprtrdma/verbs.c
298 + */
299 + int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
300 + struct rpcrdma_ia *,
301 + struct rpcrdma_create_data_internal *);
302 + void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
303 +
304 + struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
305 + void rpcrdma_buffer_put(struct rpcrdma_req *);
306 + void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
307 + void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
308 +
309 + int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
310 + struct ib_mr **, struct ib_sge *);
311 + int rpcrdma_deregister_internal(struct rpcrdma_ia *,
312 + struct ib_mr *, struct ib_sge *);
313 +
314 + int rpcrdma_register_external(struct rpcrdma_mr_seg *,
315 + int, int, struct rpcrdma_xprt *);
316 + int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
317 + struct rpcrdma_xprt *, void *);
318 +
319 + /*
320 + * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
321 + */
322 + void rpcrdma_conn_func(struct rpcrdma_ep *);
323 + void rpcrdma_reply_handler(struct rpcrdma_rep *);
324 +
325 + /*
326 + * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
327 + */
328 + int rpcrdma_marshal_req(struct rpc_rqst *);
329 +
330 + #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */
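
The rl_xdr_buf[0] member at the tail of struct rpcrdma_req is the buffer the generic RPC layer sees as rq_buffer; rpcr_to_rdmar() (and, for the buffer pool, rdmab_to_ia()) maps that pointer back to the enclosing transport-private structure with container_of(). A minimal userspace sketch of the same idiom follows; the struct name, scaffolding and local container_of definition are illustrative, not the kernel code:

/*
 * Userspace illustration only (hypothetical names, local container_of):
 * shows how rpcr_to_rdmar() recovers the transport-private request from
 * the rq_buffer pointer that starts at the trailing rl_xdr_buf[] member.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct fake_rdma_req {			/* stand-in for struct rpcrdma_req */
	size_t rl_size;
	unsigned int rl_niovs;
	unsigned int rl_xdr_buf[];	/* buffer handed to the RPC layer */
};

int main(void)
{
	struct fake_rdma_req *req = malloc(sizeof(*req) + 1024);
	void *rq_buffer;

	if (req == NULL)
		return 1;
	rq_buffer = req->rl_xdr_buf;	/* what rpc_rqst->rq_buffer would hold */

	/* Equivalent of rpcr_to_rdmar(): map the buffer back to its owner. */
	struct fake_rdma_req *owner =
		container_of(rq_buffer, struct fake_rdma_req, rl_xdr_buf);

	printf("recovered owning request: %s\n", owner == req ? "yes" : "no");
	free(req);
	return 0;
}

Handing out only the embedded buffer keeps the RPC core unaware of the RDMA bookkeeping while still letting the transport recover its request in constant time.
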
+468 -123
net/sunrpc/xprtsock.c
··· 13 13 * (C) 1999 Trond Myklebust <trond.myklebust@fys.uio.no> 14 14 * 15 15 * IP socket transport implementation, (C) 2005 Chuck Lever <cel@netapp.com> 16 + * 17 + * IPv6 support contributed by Gilles Quillard, Bull Open Source, 2005. 18 + * <gilles.quillard@bull.net> 16 19 */ 17 20 18 21 #include <linux/types.h> 19 22 #include <linux/slab.h> 23 + #include <linux/module.h> 20 24 #include <linux/capability.h> 21 25 #include <linux/pagemap.h> 22 26 #include <linux/errno.h> ··· 32 28 #include <linux/tcp.h> 33 29 #include <linux/sunrpc/clnt.h> 34 30 #include <linux/sunrpc/sched.h> 31 + #include <linux/sunrpc/xprtsock.h> 35 32 #include <linux/file.h> 36 33 37 34 #include <net/sock.h> ··· 265 260 #define TCP_RCV_COPY_XID (1UL << 2) 266 261 #define TCP_RCV_COPY_DATA (1UL << 3) 267 262 268 - static void xs_format_peer_addresses(struct rpc_xprt *xprt) 263 + static inline struct sockaddr *xs_addr(struct rpc_xprt *xprt) 269 264 { 270 - struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr; 265 + return (struct sockaddr *) &xprt->addr; 266 + } 267 + 268 + static inline struct sockaddr_in *xs_addr_in(struct rpc_xprt *xprt) 269 + { 270 + return (struct sockaddr_in *) &xprt->addr; 271 + } 272 + 273 + static inline struct sockaddr_in6 *xs_addr_in6(struct rpc_xprt *xprt) 274 + { 275 + return (struct sockaddr_in6 *) &xprt->addr; 276 + } 277 + 278 + static void xs_format_ipv4_peer_addresses(struct rpc_xprt *xprt) 279 + { 280 + struct sockaddr_in *addr = xs_addr_in(xprt); 271 281 char *buf; 272 282 273 283 buf = kzalloc(20, GFP_KERNEL); 274 284 if (buf) { 275 - snprintf(buf, 20, "%u.%u.%u.%u", 285 + snprintf(buf, 20, NIPQUAD_FMT, 276 286 NIPQUAD(addr->sin_addr.s_addr)); 277 287 } 278 288 xprt->address_strings[RPC_DISPLAY_ADDR] = buf; ··· 299 279 } 300 280 xprt->address_strings[RPC_DISPLAY_PORT] = buf; 301 281 302 - if (xprt->prot == IPPROTO_UDP) 303 - xprt->address_strings[RPC_DISPLAY_PROTO] = "udp"; 304 - else 305 - xprt->address_strings[RPC_DISPLAY_PROTO] = "tcp"; 282 + buf = kzalloc(8, GFP_KERNEL); 283 + if (buf) { 284 + if (xprt->prot == IPPROTO_UDP) 285 + snprintf(buf, 8, "udp"); 286 + else 287 + snprintf(buf, 8, "tcp"); 288 + } 289 + xprt->address_strings[RPC_DISPLAY_PROTO] = buf; 306 290 307 291 buf = kzalloc(48, GFP_KERNEL); 308 292 if (buf) { 309 - snprintf(buf, 48, "addr=%u.%u.%u.%u port=%u proto=%s", 293 + snprintf(buf, 48, "addr="NIPQUAD_FMT" port=%u proto=%s", 310 294 NIPQUAD(addr->sin_addr.s_addr), 311 295 ntohs(addr->sin_port), 312 296 xprt->prot == IPPROTO_UDP ? "udp" : "tcp"); 313 297 } 314 298 xprt->address_strings[RPC_DISPLAY_ALL] = buf; 299 + 300 + buf = kzalloc(10, GFP_KERNEL); 301 + if (buf) { 302 + snprintf(buf, 10, "%02x%02x%02x%02x", 303 + NIPQUAD(addr->sin_addr.s_addr)); 304 + } 305 + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf; 306 + 307 + buf = kzalloc(8, GFP_KERNEL); 308 + if (buf) { 309 + snprintf(buf, 8, "%4hx", 310 + ntohs(addr->sin_port)); 311 + } 312 + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf; 313 + 314 + buf = kzalloc(30, GFP_KERNEL); 315 + if (buf) { 316 + snprintf(buf, 30, NIPQUAD_FMT".%u.%u", 317 + NIPQUAD(addr->sin_addr.s_addr), 318 + ntohs(addr->sin_port) >> 8, 319 + ntohs(addr->sin_port) & 0xff); 320 + } 321 + xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf; 322 + 323 + xprt->address_strings[RPC_DISPLAY_NETID] = 324 + kstrdup(xprt->prot == IPPROTO_UDP ? 
325 + RPCBIND_NETID_UDP : RPCBIND_NETID_TCP, GFP_KERNEL); 326 + } 327 + 328 + static void xs_format_ipv6_peer_addresses(struct rpc_xprt *xprt) 329 + { 330 + struct sockaddr_in6 *addr = xs_addr_in6(xprt); 331 + char *buf; 332 + 333 + buf = kzalloc(40, GFP_KERNEL); 334 + if (buf) { 335 + snprintf(buf, 40, NIP6_FMT, 336 + NIP6(addr->sin6_addr)); 337 + } 338 + xprt->address_strings[RPC_DISPLAY_ADDR] = buf; 339 + 340 + buf = kzalloc(8, GFP_KERNEL); 341 + if (buf) { 342 + snprintf(buf, 8, "%u", 343 + ntohs(addr->sin6_port)); 344 + } 345 + xprt->address_strings[RPC_DISPLAY_PORT] = buf; 346 + 347 + buf = kzalloc(8, GFP_KERNEL); 348 + if (buf) { 349 + if (xprt->prot == IPPROTO_UDP) 350 + snprintf(buf, 8, "udp"); 351 + else 352 + snprintf(buf, 8, "tcp"); 353 + } 354 + xprt->address_strings[RPC_DISPLAY_PROTO] = buf; 355 + 356 + buf = kzalloc(64, GFP_KERNEL); 357 + if (buf) { 358 + snprintf(buf, 64, "addr="NIP6_FMT" port=%u proto=%s", 359 + NIP6(addr->sin6_addr), 360 + ntohs(addr->sin6_port), 361 + xprt->prot == IPPROTO_UDP ? "udp" : "tcp"); 362 + } 363 + xprt->address_strings[RPC_DISPLAY_ALL] = buf; 364 + 365 + buf = kzalloc(36, GFP_KERNEL); 366 + if (buf) { 367 + snprintf(buf, 36, NIP6_SEQFMT, 368 + NIP6(addr->sin6_addr)); 369 + } 370 + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = buf; 371 + 372 + buf = kzalloc(8, GFP_KERNEL); 373 + if (buf) { 374 + snprintf(buf, 8, "%4hx", 375 + ntohs(addr->sin6_port)); 376 + } 377 + xprt->address_strings[RPC_DISPLAY_HEX_PORT] = buf; 378 + 379 + buf = kzalloc(50, GFP_KERNEL); 380 + if (buf) { 381 + snprintf(buf, 50, NIP6_FMT".%u.%u", 382 + NIP6(addr->sin6_addr), 383 + ntohs(addr->sin6_port) >> 8, 384 + ntohs(addr->sin6_port) & 0xff); 385 + } 386 + xprt->address_strings[RPC_DISPLAY_UNIVERSAL_ADDR] = buf; 387 + 388 + xprt->address_strings[RPC_DISPLAY_NETID] = 389 + kstrdup(xprt->prot == IPPROTO_UDP ? 390 + RPCBIND_NETID_UDP6 : RPCBIND_NETID_TCP6, GFP_KERNEL); 315 391 } 316 392 317 393 static void xs_free_peer_addresses(struct rpc_xprt *xprt) 318 394 { 319 - kfree(xprt->address_strings[RPC_DISPLAY_ADDR]); 320 - kfree(xprt->address_strings[RPC_DISPLAY_PORT]); 321 - kfree(xprt->address_strings[RPC_DISPLAY_ALL]); 395 + int i; 396 + 397 + for (i = 0; i < RPC_DISPLAY_MAX; i++) 398 + kfree(xprt->address_strings[i]); 322 399 } 323 400 324 401 #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) ··· 580 463 581 464 req->rq_xtime = jiffies; 582 465 status = xs_sendpages(transport->sock, 583 - (struct sockaddr *) &xprt->addr, 466 + xs_addr(xprt), 584 467 xprt->addrlen, xdr, 585 468 req->rq_bytes_sent); 586 469 587 470 dprintk("RPC: xs_udp_send_request(%u) = %d\n", 588 471 xdr->len - req->rq_bytes_sent, status); 589 472 590 - if (likely(status >= (int) req->rq_slen)) 591 - return 0; 592 - 593 - /* Still some bytes left; set up for a retry later. */ 594 - if (status > 0) 473 + if (status >= 0) { 474 + task->tk_bytes_sent += status; 475 + if (status >= req->rq_slen) 476 + return 0; 477 + /* Still some bytes left; set up for a retry later. 
*/ 595 478 status = -EAGAIN; 479 + } 596 480 597 481 switch (status) { 598 482 case -ENETUNREACH: ··· 641 523 struct rpc_xprt *xprt = req->rq_xprt; 642 524 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 643 525 struct xdr_buf *xdr = &req->rq_snd_buf; 644 - int status, retry = 0; 526 + int status; 527 + unsigned int retry = 0; 645 528 646 529 xs_encode_tcp_record_marker(&req->rq_snd_buf); 647 530 ··· 780 661 xs_free_peer_addresses(xprt); 781 662 kfree(xprt->slot); 782 663 kfree(xprt); 664 + module_put(THIS_MODULE); 783 665 } 784 666 785 667 static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) ··· 1259 1139 */ 1260 1140 static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) 1261 1141 { 1262 - struct sockaddr_in *sap = (struct sockaddr_in *) &xprt->addr; 1142 + struct sockaddr *addr = xs_addr(xprt); 1263 1143 1264 1144 dprintk("RPC: setting port for xprt %p to %u\n", xprt, port); 1265 1145 1266 - sap->sin_port = htons(port); 1146 + switch (addr->sa_family) { 1147 + case AF_INET: 1148 + ((struct sockaddr_in *)addr)->sin_port = htons(port); 1149 + break; 1150 + case AF_INET6: 1151 + ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); 1152 + break; 1153 + default: 1154 + BUG(); 1155 + } 1267 1156 } 1268 1157 1269 - static int xs_bind(struct sock_xprt *transport, struct socket *sock) 1158 + static int xs_bind4(struct sock_xprt *transport, struct socket *sock) 1270 1159 { 1271 1160 struct sockaddr_in myaddr = { 1272 1161 .sin_family = AF_INET, ··· 1303 1174 else 1304 1175 port--; 1305 1176 } while (err == -EADDRINUSE && port != transport->port); 1306 - dprintk("RPC: xs_bind "NIPQUAD_FMT":%u: %s (%d)\n", 1307 - NIPQUAD(myaddr.sin_addr), port, err ? "failed" : "ok", err); 1177 + dprintk("RPC: %s "NIPQUAD_FMT":%u: %s (%d)\n", 1178 + __FUNCTION__, NIPQUAD(myaddr.sin_addr), 1179 + port, err ? "failed" : "ok", err); 1180 + return err; 1181 + } 1182 + 1183 + static int xs_bind6(struct sock_xprt *transport, struct socket *sock) 1184 + { 1185 + struct sockaddr_in6 myaddr = { 1186 + .sin6_family = AF_INET6, 1187 + }; 1188 + struct sockaddr_in6 *sa; 1189 + int err; 1190 + unsigned short port = transport->port; 1191 + 1192 + if (!transport->xprt.resvport) 1193 + port = 0; 1194 + sa = (struct sockaddr_in6 *)&transport->addr; 1195 + myaddr.sin6_addr = sa->sin6_addr; 1196 + do { 1197 + myaddr.sin6_port = htons(port); 1198 + err = kernel_bind(sock, (struct sockaddr *) &myaddr, 1199 + sizeof(myaddr)); 1200 + if (!transport->xprt.resvport) 1201 + break; 1202 + if (err == 0) { 1203 + transport->port = port; 1204 + break; 1205 + } 1206 + if (port <= xprt_min_resvport) 1207 + port = xprt_max_resvport; 1208 + else 1209 + port--; 1210 + } while (err == -EADDRINUSE && port != transport->port); 1211 + dprintk("RPC: xs_bind6 "NIP6_FMT":%u: %s (%d)\n", 1212 + NIP6(myaddr.sin6_addr), port, err ? 
"failed" : "ok", err); 1308 1213 return err; 1309 1214 } 1310 1215 ··· 1346 1183 static struct lock_class_key xs_key[2]; 1347 1184 static struct lock_class_key xs_slock_key[2]; 1348 1185 1349 - static inline void xs_reclassify_socket(struct socket *sock) 1186 + static inline void xs_reclassify_socket4(struct socket *sock) 1350 1187 { 1351 1188 struct sock *sk = sock->sk; 1189 + 1352 1190 BUG_ON(sock_owned_by_user(sk)); 1353 - switch (sk->sk_family) { 1354 - case AF_INET: 1355 - sock_lock_init_class_and_name(sk, "slock-AF_INET-NFS", 1356 - &xs_slock_key[0], "sk_lock-AF_INET-NFS", &xs_key[0]); 1357 - break; 1191 + sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC", 1192 + &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]); 1193 + } 1358 1194 1359 - case AF_INET6: 1360 - sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFS", 1361 - &xs_slock_key[1], "sk_lock-AF_INET6-NFS", &xs_key[1]); 1362 - break; 1195 + static inline void xs_reclassify_socket6(struct socket *sock) 1196 + { 1197 + struct sock *sk = sock->sk; 1363 1198 1364 - default: 1365 - BUG(); 1366 - } 1199 + BUG_ON(sock_owned_by_user(sk)); 1200 + sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", 1201 + &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]); 1367 1202 } 1368 1203 #else 1369 - static inline void xs_reclassify_socket(struct socket *sock) 1204 + static inline void xs_reclassify_socket4(struct socket *sock) 1205 + { 1206 + } 1207 + 1208 + static inline void xs_reclassify_socket6(struct socket *sock) 1370 1209 { 1371 1210 } 1372 1211 #endif 1373 1212 1374 - /** 1375 - * xs_udp_connect_worker - set up a UDP socket 1376 - * @work: RPC transport to connect 1377 - * 1378 - * Invoked by a work queue tasklet. 1379 - */ 1380 - static void xs_udp_connect_worker(struct work_struct *work) 1213 + static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 1381 1214 { 1382 - struct sock_xprt *transport = 1383 - container_of(work, struct sock_xprt, connect_worker.work); 1384 - struct rpc_xprt *xprt = &transport->xprt; 1385 - struct socket *sock = transport->sock; 1386 - int err, status = -EIO; 1387 - 1388 - if (xprt->shutdown || !xprt_bound(xprt)) 1389 - goto out; 1390 - 1391 - /* Start by resetting any existing state */ 1392 - xs_close(xprt); 1393 - 1394 - if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { 1395 - dprintk("RPC: can't create UDP transport socket (%d).\n", -err); 1396 - goto out; 1397 - } 1398 - xs_reclassify_socket(sock); 1399 - 1400 - if (xs_bind(transport, sock)) { 1401 - sock_release(sock); 1402 - goto out; 1403 - } 1404 - 1405 - dprintk("RPC: worker connecting xprt %p to address: %s\n", 1406 - xprt, xprt->address_strings[RPC_DISPLAY_ALL]); 1215 + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 1407 1216 1408 1217 if (!transport->inet) { 1409 1218 struct sock *sk = sock->sk; ··· 1400 1265 write_unlock_bh(&sk->sk_callback_lock); 1401 1266 } 1402 1267 xs_udp_do_set_buffer_size(xprt); 1268 + } 1269 + 1270 + /** 1271 + * xs_udp_connect_worker4 - set up a UDP socket 1272 + * @work: RPC transport to connect 1273 + * 1274 + * Invoked by a work queue tasklet. 
1275 + */ 1276 + static void xs_udp_connect_worker4(struct work_struct *work) 1277 + { 1278 + struct sock_xprt *transport = 1279 + container_of(work, struct sock_xprt, connect_worker.work); 1280 + struct rpc_xprt *xprt = &transport->xprt; 1281 + struct socket *sock = transport->sock; 1282 + int err, status = -EIO; 1283 + 1284 + if (xprt->shutdown || !xprt_bound(xprt)) 1285 + goto out; 1286 + 1287 + /* Start by resetting any existing state */ 1288 + xs_close(xprt); 1289 + 1290 + if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { 1291 + dprintk("RPC: can't create UDP transport socket (%d).\n", -err); 1292 + goto out; 1293 + } 1294 + xs_reclassify_socket4(sock); 1295 + 1296 + if (xs_bind4(transport, sock)) { 1297 + sock_release(sock); 1298 + goto out; 1299 + } 1300 + 1301 + dprintk("RPC: worker connecting xprt %p to address: %s\n", 1302 + xprt, xprt->address_strings[RPC_DISPLAY_ALL]); 1303 + 1304 + xs_udp_finish_connecting(xprt, sock); 1305 + status = 0; 1306 + out: 1307 + xprt_wake_pending_tasks(xprt, status); 1308 + xprt_clear_connecting(xprt); 1309 + } 1310 + 1311 + /** 1312 + * xs_udp_connect_worker6 - set up a UDP socket 1313 + * @work: RPC transport to connect 1314 + * 1315 + * Invoked by a work queue tasklet. 1316 + */ 1317 + static void xs_udp_connect_worker6(struct work_struct *work) 1318 + { 1319 + struct sock_xprt *transport = 1320 + container_of(work, struct sock_xprt, connect_worker.work); 1321 + struct rpc_xprt *xprt = &transport->xprt; 1322 + struct socket *sock = transport->sock; 1323 + int err, status = -EIO; 1324 + 1325 + if (xprt->shutdown || !xprt_bound(xprt)) 1326 + goto out; 1327 + 1328 + /* Start by resetting any existing state */ 1329 + xs_close(xprt); 1330 + 1331 + if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { 1332 + dprintk("RPC: can't create UDP transport socket (%d).\n", -err); 1333 + goto out; 1334 + } 1335 + xs_reclassify_socket6(sock); 1336 + 1337 + if (xs_bind6(transport, sock) < 0) { 1338 + sock_release(sock); 1339 + goto out; 1340 + } 1341 + 1342 + dprintk("RPC: worker connecting xprt %p to address: %s\n", 1343 + xprt, xprt->address_strings[RPC_DISPLAY_ALL]); 1344 + 1345 + xs_udp_finish_connecting(xprt, sock); 1403 1346 status = 0; 1404 1347 out: 1405 1348 xprt_wake_pending_tasks(xprt, status); ··· 1508 1295 result); 1509 1296 } 1510 1297 1511 - /** 1512 - * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint 1513 - * @work: RPC transport to connect 1514 - * 1515 - * Invoked by a work queue tasklet. 
1516 - */ 1517 - static void xs_tcp_connect_worker(struct work_struct *work) 1298 + static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) 1518 1299 { 1519 - struct sock_xprt *transport = 1520 - container_of(work, struct sock_xprt, connect_worker.work); 1521 - struct rpc_xprt *xprt = &transport->xprt; 1522 - struct socket *sock = transport->sock; 1523 - int err, status = -EIO; 1524 - 1525 - if (xprt->shutdown || !xprt_bound(xprt)) 1526 - goto out; 1527 - 1528 - if (!sock) { 1529 - /* start from scratch */ 1530 - if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { 1531 - dprintk("RPC: can't create TCP transport " 1532 - "socket (%d).\n", -err); 1533 - goto out; 1534 - } 1535 - xs_reclassify_socket(sock); 1536 - 1537 - if (xs_bind(transport, sock)) { 1538 - sock_release(sock); 1539 - goto out; 1540 - } 1541 - } else 1542 - /* "close" the socket, preserving the local port */ 1543 - xs_tcp_reuse_connection(xprt); 1544 - 1545 - dprintk("RPC: worker connecting xprt %p to address: %s\n", 1546 - xprt, xprt->address_strings[RPC_DISPLAY_ALL]); 1300 + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 1547 1301 1548 1302 if (!transport->inet) { 1549 1303 struct sock *sk = sock->sk; ··· 1544 1364 /* Tell the socket layer to start connecting... */ 1545 1365 xprt->stat.connect_count++; 1546 1366 xprt->stat.connect_start = jiffies; 1547 - status = kernel_connect(sock, (struct sockaddr *) &xprt->addr, 1548 - xprt->addrlen, O_NONBLOCK); 1367 + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); 1368 + } 1369 + 1370 + /** 1371 + * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint 1372 + * @work: RPC transport to connect 1373 + * 1374 + * Invoked by a work queue tasklet. 
1375 + */ 1376 + static void xs_tcp_connect_worker4(struct work_struct *work) 1377 + { 1378 + struct sock_xprt *transport = 1379 + container_of(work, struct sock_xprt, connect_worker.work); 1380 + struct rpc_xprt *xprt = &transport->xprt; 1381 + struct socket *sock = transport->sock; 1382 + int err, status = -EIO; 1383 + 1384 + if (xprt->shutdown || !xprt_bound(xprt)) 1385 + goto out; 1386 + 1387 + if (!sock) { 1388 + /* start from scratch */ 1389 + if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { 1390 + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); 1391 + goto out; 1392 + } 1393 + xs_reclassify_socket4(sock); 1394 + 1395 + if (xs_bind4(transport, sock) < 0) { 1396 + sock_release(sock); 1397 + goto out; 1398 + } 1399 + } else 1400 + /* "close" the socket, preserving the local port */ 1401 + xs_tcp_reuse_connection(xprt); 1402 + 1403 + dprintk("RPC: worker connecting xprt %p to address: %s\n", 1404 + xprt, xprt->address_strings[RPC_DISPLAY_ALL]); 1405 + 1406 + status = xs_tcp_finish_connecting(xprt, sock); 1549 1407 dprintk("RPC: %p connect status %d connected %d sock state %d\n", 1550 1408 xprt, -status, xprt_connected(xprt), 1551 1409 sock->sk->sk_state); 1410 + if (status < 0) { 1411 + switch (status) { 1412 + case -EINPROGRESS: 1413 + case -EALREADY: 1414 + goto out_clear; 1415 + case -ECONNREFUSED: 1416 + case -ECONNRESET: 1417 + /* retry with existing socket, after a delay */ 1418 + break; 1419 + default: 1420 + /* get rid of existing socket, and retry */ 1421 + xs_close(xprt); 1422 + break; 1423 + } 1424 + } 1425 + out: 1426 + xprt_wake_pending_tasks(xprt, status); 1427 + out_clear: 1428 + xprt_clear_connecting(xprt); 1429 + } 1430 + 1431 + /** 1432 + * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint 1433 + * @work: RPC transport to connect 1434 + * 1435 + * Invoked by a work queue tasklet. 
1436 + */ 1437 + static void xs_tcp_connect_worker6(struct work_struct *work) 1438 + { 1439 + struct sock_xprt *transport = 1440 + container_of(work, struct sock_xprt, connect_worker.work); 1441 + struct rpc_xprt *xprt = &transport->xprt; 1442 + struct socket *sock = transport->sock; 1443 + int err, status = -EIO; 1444 + 1445 + if (xprt->shutdown || !xprt_bound(xprt)) 1446 + goto out; 1447 + 1448 + if (!sock) { 1449 + /* start from scratch */ 1450 + if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { 1451 + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); 1452 + goto out; 1453 + } 1454 + xs_reclassify_socket6(sock); 1455 + 1456 + if (xs_bind6(transport, sock) < 0) { 1457 + sock_release(sock); 1458 + goto out; 1459 + } 1460 + } else 1461 + /* "close" the socket, preserving the local port */ 1462 + xs_tcp_reuse_connection(xprt); 1463 + 1464 + dprintk("RPC: worker connecting xprt %p to address: %s\n", 1465 + xprt, xprt->address_strings[RPC_DISPLAY_ALL]); 1466 + 1467 + status = xs_tcp_finish_connecting(xprt, sock); 1468 + dprintk("RPC: %p connect status %d connected %d sock state %d\n", 1469 + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); 1552 1470 if (status < 0) { 1553 1471 switch (status) { 1554 1472 case -EINPROGRESS: ··· 1786 1508 .print_stats = xs_tcp_print_stats, 1787 1509 }; 1788 1510 1789 - static struct rpc_xprt *xs_setup_xprt(struct rpc_xprtsock_create *args, unsigned int slot_table_size) 1511 + static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, 1512 + unsigned int slot_table_size) 1790 1513 { 1791 1514 struct rpc_xprt *xprt; 1792 1515 struct sock_xprt *new; ··· 1828 1549 * @args: rpc transport creation arguments 1829 1550 * 1830 1551 */ 1831 - struct rpc_xprt *xs_setup_udp(struct rpc_xprtsock_create *args) 1552 + struct rpc_xprt *xs_setup_udp(struct xprt_create *args) 1832 1553 { 1554 + struct sockaddr *addr = args->dstaddr; 1833 1555 struct rpc_xprt *xprt; 1834 1556 struct sock_xprt *transport; 1835 1557 ··· 1839 1559 return xprt; 1840 1560 transport = container_of(xprt, struct sock_xprt, xprt); 1841 1561 1842 - if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0) 1843 - xprt_set_bound(xprt); 1844 - 1845 1562 xprt->prot = IPPROTO_UDP; 1846 1563 xprt->tsh_size = 0; 1847 1564 /* XXX: header size can vary due to auth type, IPv6, etc. 
*/ 1848 1565 xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); 1849 1566 1850 - INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_connect_worker); 1851 1567 xprt->bind_timeout = XS_BIND_TO; 1852 1568 xprt->connect_timeout = XS_UDP_CONN_TO; 1853 1569 xprt->reestablish_timeout = XS_UDP_REEST_TO; ··· 1856 1580 else 1857 1581 xprt_set_timeout(&xprt->timeout, 5, 5 * HZ); 1858 1582 1859 - xs_format_peer_addresses(xprt); 1583 + switch (addr->sa_family) { 1584 + case AF_INET: 1585 + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) 1586 + xprt_set_bound(xprt); 1587 + 1588 + INIT_DELAYED_WORK(&transport->connect_worker, 1589 + xs_udp_connect_worker4); 1590 + xs_format_ipv4_peer_addresses(xprt); 1591 + break; 1592 + case AF_INET6: 1593 + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) 1594 + xprt_set_bound(xprt); 1595 + 1596 + INIT_DELAYED_WORK(&transport->connect_worker, 1597 + xs_udp_connect_worker6); 1598 + xs_format_ipv6_peer_addresses(xprt); 1599 + break; 1600 + default: 1601 + kfree(xprt); 1602 + return ERR_PTR(-EAFNOSUPPORT); 1603 + } 1604 + 1860 1605 dprintk("RPC: set up transport to address %s\n", 1861 1606 xprt->address_strings[RPC_DISPLAY_ALL]); 1862 1607 1863 - return xprt; 1608 + if (try_module_get(THIS_MODULE)) 1609 + return xprt; 1610 + 1611 + kfree(xprt->slot); 1612 + kfree(xprt); 1613 + return ERR_PTR(-EINVAL); 1864 1614 } 1865 1615 1866 1616 /** ··· 1894 1592 * @args: rpc transport creation arguments 1895 1593 * 1896 1594 */ 1897 - struct rpc_xprt *xs_setup_tcp(struct rpc_xprtsock_create *args) 1595 + struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) 1898 1596 { 1597 + struct sockaddr *addr = args->dstaddr; 1899 1598 struct rpc_xprt *xprt; 1900 1599 struct sock_xprt *transport; 1901 1600 ··· 1905 1602 return xprt; 1906 1603 transport = container_of(xprt, struct sock_xprt, xprt); 1907 1604 1908 - if (ntohs(((struct sockaddr_in *)args->dstaddr)->sin_port) != 0) 1909 - xprt_set_bound(xprt); 1910 - 1911 1605 xprt->prot = IPPROTO_TCP; 1912 1606 xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); 1913 1607 xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; 1914 1608 1915 - INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker); 1916 1609 xprt->bind_timeout = XS_BIND_TO; 1917 1610 xprt->connect_timeout = XS_TCP_CONN_TO; 1918 1611 xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; ··· 1921 1622 else 1922 1623 xprt_set_timeout(&xprt->timeout, 2, 60 * HZ); 1923 1624 1924 - xs_format_peer_addresses(xprt); 1625 + switch (addr->sa_family) { 1626 + case AF_INET: 1627 + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) 1628 + xprt_set_bound(xprt); 1629 + 1630 + INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker4); 1631 + xs_format_ipv4_peer_addresses(xprt); 1632 + break; 1633 + case AF_INET6: 1634 + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) 1635 + xprt_set_bound(xprt); 1636 + 1637 + INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_connect_worker6); 1638 + xs_format_ipv6_peer_addresses(xprt); 1639 + break; 1640 + default: 1641 + kfree(xprt); 1642 + return ERR_PTR(-EAFNOSUPPORT); 1643 + } 1644 + 1925 1645 dprintk("RPC: set up transport to address %s\n", 1926 1646 xprt->address_strings[RPC_DISPLAY_ALL]); 1927 1647 1928 - return xprt; 1648 + if (try_module_get(THIS_MODULE)) 1649 + return xprt; 1650 + 1651 + kfree(xprt->slot); 1652 + kfree(xprt); 1653 + return ERR_PTR(-EINVAL); 1929 1654 } 1930 1655 1656 + static struct xprt_class xs_udp_transport = { 1657 + .list = LIST_HEAD_INIT(xs_udp_transport.list), 1658 + .name = "udp", 
1659 + .owner = THIS_MODULE, 1660 + .ident = IPPROTO_UDP, 1661 + .setup = xs_setup_udp, 1662 + }; 1663 + 1664 + static struct xprt_class xs_tcp_transport = { 1665 + .list = LIST_HEAD_INIT(xs_tcp_transport.list), 1666 + .name = "tcp", 1667 + .owner = THIS_MODULE, 1668 + .ident = IPPROTO_TCP, 1669 + .setup = xs_setup_tcp, 1670 + }; 1671 + 1931 1672 /** 1932 - * init_socket_xprt - set up xprtsock's sysctls 1673 + * init_socket_xprt - set up xprtsock's sysctls, register with RPC client 1933 1674 * 1934 1675 */ 1935 1676 int init_socket_xprt(void) ··· 1979 1640 sunrpc_table_header = register_sysctl_table(sunrpc_table); 1980 1641 #endif 1981 1642 1643 + xprt_register_transport(&xs_udp_transport); 1644 + xprt_register_transport(&xs_tcp_transport); 1645 + 1982 1646 return 0; 1983 1647 } 1984 1648 1985 1649 /** 1986 - * cleanup_socket_xprt - remove xprtsock's sysctls 1650 + * cleanup_socket_xprt - remove xprtsock's sysctls, unregister 1987 1651 * 1988 1652 */ 1989 1653 void cleanup_socket_xprt(void) ··· 1997 1655 sunrpc_table_header = NULL; 1998 1656 } 1999 1657 #endif 1658 + 1659 + xprt_unregister_transport(&xs_udp_transport); 1660 + xprt_unregister_transport(&xs_tcp_transport); 2000 1661 }
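
Registering xs_udp_transport and xs_tcp_transport in init_socket_xprt() moves the socket transports onto the same pluggable transport switch used by the new RDMA module: each transport supplies a struct xprt_class keyed by an ident, and the generic code (xprt_create_transport(), not shown in this hunk) resolves the ident to the class's setup routine, which is why xs_setup_udp() and xs_setup_tcp() now take a struct xprt_create. A rough standalone model of that register/lookup pattern, with illustrative names and scaffolding rather than the actual sunrpc implementation:

/*
 * Standalone model only (hypothetical names): mimics the xprt_class
 * registration pattern above, where a transport is registered under an
 * ident and later looked up so its setup callback can build the xprt.
 */
#include <stdio.h>

struct fake_xprt_class {		/* models struct xprt_class */
	const char *name;
	int ident;			/* e.g. IPPROTO_UDP (17), IPPROTO_TCP (6) */
	void (*setup)(void);
	struct fake_xprt_class *next;
};

static struct fake_xprt_class *registry;

static void register_transport(struct fake_xprt_class *c)
{
	c->next = registry;		/* like xprt_register_transport() */
	registry = c;
}

static struct fake_xprt_class *find_transport(int ident)
{
	struct fake_xprt_class *c;

	for (c = registry; c != NULL; c = c->next)
		if (c->ident == ident)
			return c;
	return NULL;
}

static void setup_udp(void) { puts("setting up a UDP transport"); }
static void setup_tcp(void) { puts("setting up a TCP transport"); }

static struct fake_xprt_class udp_class = { "udp", 17, setup_udp, NULL };
static struct fake_xprt_class tcp_class = { "tcp", 6, setup_tcp, NULL };

int main(void)
{
	register_transport(&udp_class);	/* done once at module init */
	register_transport(&tcp_class);

	struct fake_xprt_class *c = find_transport(6);	/* resolves to "tcp" */
	if (c != NULL)
		c->setup();
	return 0;
}
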