Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'for-3.20' of git://linux-nfs.org/~bfields/linux

Pull nfsd updates from Bruce Fields:
"The main change is the pNFS block server support from Christoph, which
allows an NFS client connected to a shared disk to do block IO to the
shared disk in place of NFS reads and writes. This also requires xfs
patches, which should arrive soon through the xfs tree, barring
unexpected problems. Support for other filesystems is also possible
if there's interest.

Thanks also to Chuck Lever for continuing work to get NFS/RDMA into
shape"

* 'for-3.20' of git://linux-nfs.org/~bfields/linux: (32 commits)
nfsd: default NFSv4.2 to on
nfsd: pNFS block layout driver
exportfs: add methods for block layout exports
nfsd: add trace events
nfsd: update documentation for pNFS support
nfsd: implement pNFS layout recalls
nfsd: implement pNFS operations
nfsd: make find_any_file available outside nfs4state.c
nfsd: make find/get/put file available outside nfs4state.c
nfsd: make lookup/alloc/unhash_stid available outside nfs4state.c
nfsd: add fh_fsid_match helper
nfsd: move nfsd_fh_match to nfsfh.h
fs: add FL_LAYOUT lease type
fs: track fl_owner for leases
nfs: add LAYOUT_TYPE_MAX enum value
nfsd: factor out a helper to decode nfstime4 values
sunrpc/lockd: fix references to the BKL
nfsd: fix year-2038 nfs4 state problem
svcrdma: Handle additional inline content
svcrdma: Move read list XDR round-up logic
...

+2562 -254
+8 -15
Documentation/filesystems/nfs/nfs41-server.txt
··· 24 24 "exactly once" semantics and better control and throttling of the 25 25 resources allocated for each client. 26 26 27 - Other NFSv4.1 features, Parallel NFS operations in particular, 28 - are still under development out of tree. 29 - See http://wiki.linux-nfs.org/wiki/index.php/PNFS_prototype_design 30 - for more information. 31 - 32 27 The table below, taken from the NFSv4.1 document, lists 33 28 the operations that are mandatory to implement (REQ), optional 34 29 (OPT), and NFSv4.0 operations that are required not to implement (MNI) ··· 38 43 The following abbreviations indicate the linux server implementation status. 39 44 I Implemented NFSv4.1 operations. 40 45 NS Not Supported. 41 - NS* unimplemented optional feature. 42 - P pNFS features implemented out of tree. 43 - PNS pNFS features that are not supported yet (out of tree). 46 + NS* Unimplemented optional feature. 44 47 45 48 Operations 46 49 ··· 63 70 I | EXCHANGE_ID | REQ | | Section 18.35 | 64 71 I | FREE_STATEID | REQ | | Section 18.38 | 65 72 | GETATTR | REQ | | Section 18.7 | 66 - P | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | 67 - P | GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | 73 + I | GETDEVICEINFO | OPT | pNFS (REQ) | Section 18.40 | 74 + NS*| GETDEVICELIST | OPT | pNFS (OPT) | Section 18.41 | 68 75 | GETFH | REQ | | Section 18.8 | 69 76 NS*| GET_DIR_DELEGATION | OPT | DDELG (REQ) | Section 18.39 | 70 - P | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | 71 - P | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | 72 - P | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | 77 + I | LAYOUTCOMMIT | OPT | pNFS (REQ) | Section 18.42 | 78 + I | LAYOUTGET | OPT | pNFS (REQ) | Section 18.43 | 79 + I | LAYOUTRETURN | OPT | pNFS (REQ) | Section 18.44 | 73 80 | LINK | OPT | | Section 18.9 | 74 81 | LOCK | REQ | | Section 18.10 | 75 82 | LOCKT | REQ | | Section 18.11 | ··· 115 122 | | MNI | or OPT) | | 116 123 +-------------------------+-----------+-------------+---------------+ 117 124 
| CB_GETATTR | OPT | FDELG (REQ) | Section 20.1 | 118 - P | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | 125 + I | CB_LAYOUTRECALL | OPT | pNFS (REQ) | Section 20.3 | 119 126 NS*| CB_NOTIFY | OPT | DDELG (REQ) | Section 20.4 | 120 - P | CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | 127 + NS*| CB_NOTIFY_DEVICEID | OPT | pNFS (OPT) | Section 20.12 | 121 128 NS*| CB_NOTIFY_LOCK | OPT | | Section 20.11 | 122 129 NS*| CB_PUSH_DELEG | OPT | FDELG (OPT) | Section 20.5 | 123 130 | CB_RECALL | OPT | FDELG, | Section 20.2 |
+37
Documentation/filesystems/nfs/pnfs-block-server.txt
··· 1 + pNFS block layout server user guide 2 + 3 + The Linux NFS server now supports the pNFS block layout extension. In this 4 + case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition 5 + to handling all the metadata access to the NFS export also hands out layouts 6 + to the clients to directly access the underlying block devices that are 7 + shared with the client. 8 + 9 + To use pNFS block layouts with the Linux NFS server, the exported file 10 + system needs to support the pNFS block layouts (currently just XFS), and the 11 + file system must sit on shared storage (typically iSCSI) that is accessible 12 + to the clients in addition to the MDS. As of now the file system needs to 13 + sit directly on the exported volume; striping or concatenation of 14 + volumes on the MDS and clients is not supported yet. 15 + 16 + On the server, pNFS block volume support is enabled automatically if the 17 + file system supports it. On the client, make sure the kernel has the 18 + CONFIG_PNFS_BLOCK option enabled, the blkmapd daemon from nfs-utils is 19 + running, and the file system is mounted using NFSv4.1 (mount -o vers=4.1). 20 + 21 + If the nfsd server needs to fence a non-responding client, it calls 22 + /sbin/nfsd-recall-failed with the first argument set to the IP address of 23 + the client, and the second argument set to the device node without the /dev 24 + prefix for the file system to be fenced. Below is an example file that shows 25 + how to translate the device into a serial number from SCSI EVPD 0x80: 26 + 27 + cat > /sbin/nfsd-recall-failed << "EOF" 28 + #!/bin/sh 29 + 30 + CLIENT="$1" 31 + DEV="/dev/$2" 32 + EVPD=`sg_inq --page=0x80 ${DEV} | \ 33 + grep "Unit serial number:" | \ 34 + awk -F ': ' '{print $2}'` 35 + 36 + echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log 37 + EOF
+2 -2
fs/lockd/svclock.c
··· 57 57 static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) 58 58 { 59 59 /* 60 - * We can get away with a static buffer because we're only 61 - * called with BKL held. 60 + * We can get away with a static buffer because this is only called 61 + * from lockd, which is single-threaded. 62 62 */ 63 63 static char buf[2*NLM_MAXCOOKIELEN+1]; 64 64 unsigned int i, len = sizeof(buf);
-8
fs/lockd/xdr.c
··· 95 95 return p + XDR_QUADLEN(NFS2_FHSIZE); 96 96 } 97 97 98 - static inline __be32 * 99 - nlm_encode_fh(__be32 *p, struct nfs_fh *f) 100 - { 101 - *p++ = htonl(NFS2_FHSIZE); 102 - memcpy(p, f->data, NFS2_FHSIZE); 103 - return p + XDR_QUADLEN(NFS2_FHSIZE); 104 - } 105 - 106 98 /* 107 99 * Encode and decode owner handle 108 100 */
+17 -9
fs/locks.c
··· 137 137 138 138 #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 139 139 #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 140 - #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 140 + #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) 141 141 #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) 142 142 143 143 static bool lease_breaking(struct file_lock *fl) ··· 1371 1371 1372 1372 static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) 1373 1373 { 1374 + if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) 1375 + return false; 1374 1376 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) 1375 1377 return false; 1376 1378 return locks_conflict(breaker, lease); ··· 1596 1594 * conflict with the lease we're trying to set. 1597 1595 */ 1598 1596 static int 1599 - check_conflicting_open(const struct dentry *dentry, const long arg) 1597 + check_conflicting_open(const struct dentry *dentry, const long arg, int flags) 1600 1598 { 1601 1599 int ret = 0; 1602 1600 struct inode *inode = dentry->d_inode; 1601 + 1602 + if (flags & FL_LAYOUT) 1603 + return 0; 1603 1604 1604 1605 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1605 1606 return -EAGAIN; ··· 1652 1647 1653 1648 spin_lock(&ctx->flc_lock); 1654 1649 time_out_leases(inode, &dispose); 1655 - error = check_conflicting_open(dentry, arg); 1650 + error = check_conflicting_open(dentry, arg, lease->fl_flags); 1656 1651 if (error) 1657 1652 goto out; 1658 1653 ··· 1666 1661 */ 1667 1662 error = -EAGAIN; 1668 1663 list_for_each_entry(fl, &ctx->flc_lease, fl_list) { 1669 - if (fl->fl_file == filp) { 1664 + if (fl->fl_file == filp && 1665 + fl->fl_owner == lease->fl_owner) { 1670 1666 my_fl = fl; 1671 1667 continue; 1672 1668 } ··· 1708 1702 * precedes these checks. 
1709 1703 */ 1710 1704 smp_mb(); 1711 - error = check_conflicting_open(dentry, arg); 1705 + error = check_conflicting_open(dentry, arg, lease->fl_flags); 1712 1706 if (error) { 1713 1707 locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt); 1714 1708 goto out; ··· 1727 1721 return error; 1728 1722 } 1729 1723 1730 - static int generic_delete_lease(struct file *filp) 1724 + static int generic_delete_lease(struct file *filp, void *owner) 1731 1725 { 1732 1726 int error = -EAGAIN; 1733 1727 struct file_lock *fl, *victim = NULL; ··· 1743 1737 1744 1738 spin_lock(&ctx->flc_lock); 1745 1739 list_for_each_entry(fl, &ctx->flc_lease, fl_list) { 1746 - if (fl->fl_file == filp) { 1740 + if (fl->fl_file == filp && 1741 + fl->fl_owner == owner) { 1747 1742 victim = fl; 1748 1743 break; 1749 1744 } ··· 1785 1778 1786 1779 switch (arg) { 1787 1780 case F_UNLCK: 1788 - return generic_delete_lease(filp); 1781 + return generic_delete_lease(filp, *priv); 1789 1782 case F_RDLCK: 1790 1783 case F_WRLCK: 1791 1784 if (!(*flp)->fl_lmops->lm_break) { 1792 1785 WARN_ON_ONCE(1); 1793 1786 return -ENOLCK; 1794 1787 } 1788 + 1795 1789 return generic_add_lease(filp, arg, flp, priv); 1796 1790 default: 1797 1791 return -EINVAL; ··· 1865 1857 int fcntl_setlease(unsigned int fd, struct file *filp, long arg) 1866 1858 { 1867 1859 if (arg == F_UNLCK) 1868 - return vfs_setlease(filp, F_UNLCK, NULL, NULL); 1860 + return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); 1869 1861 return do_fcntl_add_lease(fd, filp, arg); 1870 1862 } 1871 1863
+10
fs/nfsd/Kconfig
··· 82 82 83 83 If unsure, say N. 84 84 85 + config NFSD_PNFS 86 + bool "NFSv4.1 server support for Parallel NFS (pNFS)" 87 + depends on NFSD_V4 88 + help 89 + This option enables support for the parallel NFS features of the 90 + minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS 91 + server. 92 + 93 + If unsure, say N. 94 + 85 95 config NFSD_V4_SECURITY_LABEL 86 96 bool "Provide Security Label support for NFSv4 server" 87 97 depends on NFSD_V4 && SECURITY
+7 -1
fs/nfsd/Makefile
··· 2 2 # Makefile for the Linux nfs server 3 3 # 4 4 5 + ccflags-y += -I$(src) # needed for trace events 6 + 5 7 obj-$(CONFIG_NFSD) += nfsd.o 6 8 7 - nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ 9 + # this one should be compiled first, as the tracing macros can easily blow up 10 + nfsd-y += trace.o 11 + 12 + nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ 8 13 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o 9 14 nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o 10 15 nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o ··· 17 12 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 18 13 nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ 19 14 nfs4acl.o nfs4callback.o nfs4recover.o 15 + nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
+189
fs/nfsd/blocklayout.c
··· 1 + /* 2 + * Copyright (c) 2014 Christoph Hellwig. 3 + */ 4 + #include <linux/exportfs.h> 5 + #include <linux/genhd.h> 6 + #include <linux/slab.h> 7 + 8 + #include <linux/nfsd/debug.h> 9 + 10 + #include "blocklayoutxdr.h" 11 + #include "pnfs.h" 12 + 13 + #define NFSDDBG_FACILITY NFSDDBG_PNFS 14 + 15 + 16 + static int 17 + nfsd4_block_get_device_info_simple(struct super_block *sb, 18 + struct nfsd4_getdeviceinfo *gdp) 19 + { 20 + struct pnfs_block_deviceaddr *dev; 21 + struct pnfs_block_volume *b; 22 + 23 + dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + 24 + sizeof(struct pnfs_block_volume), GFP_KERNEL); 25 + if (!dev) 26 + return -ENOMEM; 27 + gdp->gd_device = dev; 28 + 29 + dev->nr_volumes = 1; 30 + b = &dev->volumes[0]; 31 + 32 + b->type = PNFS_BLOCK_VOLUME_SIMPLE; 33 + b->simple.sig_len = PNFS_BLOCK_UUID_LEN; 34 + return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len, 35 + &b->simple.offset); 36 + } 37 + 38 + static __be32 39 + nfsd4_block_proc_getdeviceinfo(struct super_block *sb, 40 + struct nfsd4_getdeviceinfo *gdp) 41 + { 42 + if (sb->s_bdev != sb->s_bdev->bd_contains) 43 + return nfserr_inval; 44 + return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); 45 + } 46 + 47 + static __be32 48 + nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, 49 + struct nfsd4_layoutget *args) 50 + { 51 + struct nfsd4_layout_seg *seg = &args->lg_seg; 52 + struct super_block *sb = inode->i_sb; 53 + u32 block_size = (1 << inode->i_blkbits); 54 + struct pnfs_block_extent *bex; 55 + struct iomap iomap; 56 + u32 device_generation = 0; 57 + int error; 58 + 59 + /* 60 + * We do not attempt to support I/O smaller than the fs block size, 61 + * or not aligned to it. 
62 + */ 63 + if (args->lg_minlength < block_size) { 64 + dprintk("pnfsd: I/O too small\n"); 65 + goto out_layoutunavailable; 66 + } 67 + if (seg->offset & (block_size - 1)) { 68 + dprintk("pnfsd: I/O misaligned\n"); 69 + goto out_layoutunavailable; 70 + } 71 + 72 + /* 73 + * Some clients barf on non-zero block numbers for NONE or INVALID 74 + * layouts, so make sure to zero the whole structure. 75 + */ 76 + error = -ENOMEM; 77 + bex = kzalloc(sizeof(*bex), GFP_KERNEL); 78 + if (!bex) 79 + goto out_error; 80 + args->lg_content = bex; 81 + 82 + error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length, 83 + &iomap, seg->iomode != IOMODE_READ, 84 + &device_generation); 85 + if (error) { 86 + if (error == -ENXIO) 87 + goto out_layoutunavailable; 88 + goto out_error; 89 + } 90 + 91 + if (iomap.length < args->lg_minlength) { 92 + dprintk("pnfsd: extent smaller than minlength\n"); 93 + goto out_layoutunavailable; 94 + } 95 + 96 + switch (iomap.type) { 97 + case IOMAP_MAPPED: 98 + if (seg->iomode == IOMODE_READ) 99 + bex->es = PNFS_BLOCK_READ_DATA; 100 + else 101 + bex->es = PNFS_BLOCK_READWRITE_DATA; 102 + bex->soff = (iomap.blkno << 9); 103 + break; 104 + case IOMAP_UNWRITTEN: 105 + if (seg->iomode & IOMODE_RW) { 106 + /* 107 + * Crack monkey special case from section 2.3.1. 
108 + */ 109 + if (args->lg_minlength == 0) { 110 + dprintk("pnfsd: no soup for you!\n"); 111 + goto out_layoutunavailable; 112 + } 113 + 114 + bex->es = PNFS_BLOCK_INVALID_DATA; 115 + bex->soff = (iomap.blkno << 9); 116 + break; 117 + } 118 + /*FALLTHRU*/ 119 + case IOMAP_HOLE: 120 + if (seg->iomode == IOMODE_READ) { 121 + bex->es = PNFS_BLOCK_NONE_DATA; 122 + break; 123 + } 124 + /*FALLTHRU*/ 125 + case IOMAP_DELALLOC: 126 + default: 127 + WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); 128 + goto out_layoutunavailable; 129 + } 130 + 131 + error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation); 132 + if (error) 133 + goto out_error; 134 + bex->foff = iomap.offset; 135 + bex->len = iomap.length; 136 + 137 + seg->offset = iomap.offset; 138 + seg->length = iomap.length; 139 + 140 + dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es); 141 + return 0; 142 + 143 + out_error: 144 + seg->length = 0; 145 + return nfserrno(error); 146 + out_layoutunavailable: 147 + seg->length = 0; 148 + return nfserr_layoutunavailable; 149 + } 150 + 151 + static __be32 152 + nfsd4_block_proc_layoutcommit(struct inode *inode, 153 + struct nfsd4_layoutcommit *lcp) 154 + { 155 + loff_t new_size = lcp->lc_last_wr + 1; 156 + struct iattr iattr = { .ia_valid = 0 }; 157 + struct iomap *iomaps; 158 + int nr_iomaps; 159 + int error; 160 + 161 + nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, 162 + lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); 163 + if (nr_iomaps < 0) 164 + return nfserrno(nr_iomaps); 165 + 166 + if (lcp->lc_mtime.tv_nsec == UTIME_NOW || 167 + timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) 168 + lcp->lc_mtime = current_fs_time(inode->i_sb); 169 + iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; 170 + iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; 171 + 172 + if (new_size > i_size_read(inode)) { 173 + iattr.ia_valid |= ATTR_SIZE; 174 + iattr.ia_size = new_size; 175 + } 176 + 177 + error = 
inode->i_sb->s_export_op->commit_blocks(inode, iomaps, 178 + nr_iomaps, &iattr); 179 + kfree(iomaps); 180 + return nfserrno(error); 181 + } 182 + 183 + const struct nfsd4_layout_ops bl_layout_ops = { 184 + .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo, 185 + .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, 186 + .proc_layoutget = nfsd4_block_proc_layoutget, 187 + .encode_layoutget = nfsd4_block_encode_layoutget, 188 + .proc_layoutcommit = nfsd4_block_proc_layoutcommit, 189 + };
+157
fs/nfsd/blocklayoutxdr.c
··· 1 + /* 2 + * Copyright (c) 2014 Christoph Hellwig. 3 + */ 4 + #include <linux/sunrpc/svc.h> 5 + #include <linux/exportfs.h> 6 + #include <linux/nfs4.h> 7 + 8 + #include "nfsd.h" 9 + #include "blocklayoutxdr.h" 10 + 11 + #define NFSDDBG_FACILITY NFSDDBG_PNFS 12 + 13 + 14 + __be32 15 + nfsd4_block_encode_layoutget(struct xdr_stream *xdr, 16 + struct nfsd4_layoutget *lgp) 17 + { 18 + struct pnfs_block_extent *b = lgp->lg_content; 19 + int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); 20 + __be32 *p; 21 + 22 + p = xdr_reserve_space(xdr, sizeof(__be32) + len); 23 + if (!p) 24 + return nfserr_toosmall; 25 + 26 + *p++ = cpu_to_be32(len); 27 + *p++ = cpu_to_be32(1); /* we always return a single extent */ 28 + 29 + p = xdr_encode_opaque_fixed(p, &b->vol_id, 30 + sizeof(struct nfsd4_deviceid)); 31 + p = xdr_encode_hyper(p, b->foff); 32 + p = xdr_encode_hyper(p, b->len); 33 + p = xdr_encode_hyper(p, b->soff); 34 + *p++ = cpu_to_be32(b->es); 35 + return 0; 36 + } 37 + 38 + static int 39 + nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) 40 + { 41 + __be32 *p; 42 + int len; 43 + 44 + switch (b->type) { 45 + case PNFS_BLOCK_VOLUME_SIMPLE: 46 + len = 4 + 4 + 8 + 4 + b->simple.sig_len; 47 + p = xdr_reserve_space(xdr, len); 48 + if (!p) 49 + return -ETOOSMALL; 50 + 51 + *p++ = cpu_to_be32(b->type); 52 + *p++ = cpu_to_be32(1); /* single signature */ 53 + p = xdr_encode_hyper(p, b->simple.offset); 54 + p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); 55 + break; 56 + default: 57 + return -ENOTSUPP; 58 + } 59 + 60 + return len; 61 + } 62 + 63 + __be32 64 + nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, 65 + struct nfsd4_getdeviceinfo *gdp) 66 + { 67 + struct pnfs_block_deviceaddr *dev = gdp->gd_device; 68 + int len = sizeof(__be32), ret, i; 69 + __be32 *p; 70 + 71 + p = xdr_reserve_space(xdr, len + sizeof(__be32)); 72 + if (!p) 73 + return nfserr_resource; 74 + 75 + for (i = 0; i < dev->nr_volumes; i++) { 76 + ret = 
nfsd4_block_encode_volume(xdr, &dev->volumes[i]); 77 + if (ret < 0) 78 + return nfserrno(ret); 79 + len += ret; 80 + } 81 + 82 + /* 83 + * Fill in the overall length and number of volumes at the beginning 84 + * of the layout. 85 + */ 86 + *p++ = cpu_to_be32(len); 87 + *p++ = cpu_to_be32(dev->nr_volumes); 88 + return 0; 89 + } 90 + 91 + int 92 + nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, 93 + u32 block_size) 94 + { 95 + struct iomap *iomaps; 96 + u32 nr_iomaps, expected, i; 97 + 98 + if (len < sizeof(u32)) { 99 + dprintk("%s: extent array too small: %u\n", __func__, len); 100 + return -EINVAL; 101 + } 102 + 103 + nr_iomaps = be32_to_cpup(p++); 104 + expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE; 105 + if (len != expected) { 106 + dprintk("%s: extent array size mismatch: %u/%u\n", 107 + __func__, len, expected); 108 + return -EINVAL; 109 + } 110 + 111 + iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); 112 + if (!iomaps) { 113 + dprintk("%s: failed to allocate extent array\n", __func__); 114 + return -ENOMEM; 115 + } 116 + 117 + for (i = 0; i < nr_iomaps; i++) { 118 + struct pnfs_block_extent bex; 119 + 120 + memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); 121 + p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); 122 + 123 + p = xdr_decode_hyper(p, &bex.foff); 124 + if (bex.foff & (block_size - 1)) { 125 + dprintk("%s: unaligned offset %lld\n", 126 + __func__, bex.foff); 127 + goto fail; 128 + } 129 + p = xdr_decode_hyper(p, &bex.len); 130 + if (bex.len & (block_size - 1)) { 131 + dprintk("%s: unaligned length %lld\n", 132 + __func__, bex.len); 133 + goto fail; 134 + } 135 + p = xdr_decode_hyper(p, &bex.soff); 136 + if (bex.soff & (block_size - 1)) { 137 + dprintk("%s: unaligned disk offset %lld\n", 138 + __func__, bex.soff); 139 + goto fail; 140 + } 141 + bex.es = be32_to_cpup(p++); 142 + if (bex.es != PNFS_BLOCK_READWRITE_DATA) { 143 + dprintk("%s: incorrect extent state %d\n", 144 + __func__, bex.es);
145 + goto fail; 146 + } 147 + 148 + iomaps[i].offset = bex.foff; 149 + iomaps[i].length = bex.len; 150 + } 151 + 152 + *iomapp = iomaps; 153 + return nr_iomaps; 154 + fail: 155 + kfree(iomaps); 156 + return -EINVAL; 157 + }
+62
fs/nfsd/blocklayoutxdr.h
··· 1 + #ifndef _NFSD_BLOCKLAYOUTXDR_H 2 + #define _NFSD_BLOCKLAYOUTXDR_H 1 3 + 4 + #include <linux/blkdev.h> 5 + #include "xdr4.h" 6 + 7 + struct iomap; 8 + struct xdr_stream; 9 + 10 + enum pnfs_block_extent_state { 11 + PNFS_BLOCK_READWRITE_DATA = 0, 12 + PNFS_BLOCK_READ_DATA = 1, 13 + PNFS_BLOCK_INVALID_DATA = 2, 14 + PNFS_BLOCK_NONE_DATA = 3, 15 + }; 16 + 17 + struct pnfs_block_extent { 18 + struct nfsd4_deviceid vol_id; 19 + u64 foff; 20 + u64 len; 21 + u64 soff; 22 + enum pnfs_block_extent_state es; 23 + }; 24 + #define NFS4_BLOCK_EXTENT_SIZE 44 25 + 26 + enum pnfs_block_volume_type { 27 + PNFS_BLOCK_VOLUME_SIMPLE = 0, 28 + PNFS_BLOCK_VOLUME_SLICE = 1, 29 + PNFS_BLOCK_VOLUME_CONCAT = 2, 30 + PNFS_BLOCK_VOLUME_STRIPE = 3, 31 + }; 32 + 33 + /* 34 + * Random upper cap for the uuid length to avoid unbounded allocation. 35 + * Not actually limited by the protocol. 36 + */ 37 + #define PNFS_BLOCK_UUID_LEN 128 38 + 39 + struct pnfs_block_volume { 40 + enum pnfs_block_volume_type type; 41 + union { 42 + struct { 43 + u64 offset; 44 + u32 sig_len; 45 + u8 sig[PNFS_BLOCK_UUID_LEN]; 46 + } simple; 47 + }; 48 + }; 49 + 50 + struct pnfs_block_deviceaddr { 51 + u32 nr_volumes; 52 + struct pnfs_block_volume volumes[]; 53 + }; 54 + 55 + __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, 56 + struct nfsd4_getdeviceinfo *gdp); 57 + __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, 58 + struct nfsd4_layoutget *lgp); 59 + int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, 60 + u32 block_size); 61 + 62 + #endif /* _NFSD_BLOCKLAYOUTXDR_H */
+8
fs/nfsd/export.c
··· 20 20 #include "nfsd.h" 21 21 #include "nfsfh.h" 22 22 #include "netns.h" 23 + #include "pnfs.h" 23 24 24 25 #define NFSDDBG_FACILITY NFSDDBG_EXPORT 25 26 ··· 546 545 547 546 exp.ex_client = dom; 548 547 exp.cd = cd; 548 + exp.ex_devid_map = NULL; 549 549 550 550 /* expiry */ 551 551 err = -EINVAL; ··· 623 621 if (!gid_valid(exp.ex_anon_gid)) 624 622 goto out4; 625 623 err = 0; 624 + 625 + nfsd4_setup_layout_type(&exp); 626 626 } 627 627 628 628 expp = svc_export_lookup(&exp); ··· 707 703 new->ex_fslocs.locations = NULL; 708 704 new->ex_fslocs.locations_count = 0; 709 705 new->ex_fslocs.migrated = 0; 706 + new->ex_layout_type = 0; 710 707 new->ex_uuid = NULL; 711 708 new->cd = item->cd; 712 709 } ··· 722 717 new->ex_anon_uid = item->ex_anon_uid; 723 718 new->ex_anon_gid = item->ex_anon_gid; 724 719 new->ex_fsid = item->ex_fsid; 720 + new->ex_devid_map = item->ex_devid_map; 721 + item->ex_devid_map = NULL; 725 722 new->ex_uuid = item->ex_uuid; 726 723 item->ex_uuid = NULL; 727 724 new->ex_fslocs.locations = item->ex_fslocs.locations; ··· 732 725 item->ex_fslocs.locations_count = 0; 733 726 new->ex_fslocs.migrated = item->ex_fslocs.migrated; 734 727 item->ex_fslocs.migrated = 0; 728 + new->ex_layout_type = item->ex_layout_type; 735 729 new->ex_nflavors = item->ex_nflavors; 736 730 for (i = 0; i < MAX_SECINFO_LIST; i++) { 737 731 new->ex_flavors[i] = item->ex_flavors[i];
+2
fs/nfsd/export.h
··· 56 56 struct nfsd4_fs_locations ex_fslocs; 57 57 uint32_t ex_nflavors; 58 58 struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; 59 + enum pnfs_layouttype ex_layout_type; 60 + struct nfsd4_deviceid_map *ex_devid_map; 59 61 struct cache_detail *cd; 60 62 }; 61 63
+99
fs/nfsd/nfs4callback.c
··· 546 546 return status; 547 547 } 548 548 549 + #ifdef CONFIG_NFSD_PNFS 550 + /* 551 + * CB_LAYOUTRECALL4args 552 + * 553 + * struct layoutrecall_file4 { 554 + * nfs_fh4 lor_fh; 555 + * offset4 lor_offset; 556 + * length4 lor_length; 557 + * stateid4 lor_stateid; 558 + * }; 559 + * 560 + * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) { 561 + * case LAYOUTRECALL4_FILE: 562 + * layoutrecall_file4 lor_layout; 563 + * case LAYOUTRECALL4_FSID: 564 + * fsid4 lor_fsid; 565 + * case LAYOUTRECALL4_ALL: 566 + * void; 567 + * }; 568 + * 569 + * struct CB_LAYOUTRECALL4args { 570 + * layouttype4 clora_type; 571 + * layoutiomode4 clora_iomode; 572 + * bool clora_changed; 573 + * layoutrecall4 clora_recall; 574 + * }; 575 + */ 576 + static void encode_cb_layout4args(struct xdr_stream *xdr, 577 + const struct nfs4_layout_stateid *ls, 578 + struct nfs4_cb_compound_hdr *hdr) 579 + { 580 + __be32 *p; 581 + 582 + BUG_ON(hdr->minorversion == 0); 583 + 584 + p = xdr_reserve_space(xdr, 5 * 4); 585 + *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL); 586 + *p++ = cpu_to_be32(ls->ls_layout_type); 587 + *p++ = cpu_to_be32(IOMODE_ANY); 588 + *p++ = cpu_to_be32(1); 589 + *p = cpu_to_be32(RETURN_FILE); 590 + 591 + encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle); 592 + 593 + p = xdr_reserve_space(xdr, 2 * 8); 594 + p = xdr_encode_hyper(p, 0); 595 + xdr_encode_hyper(p, NFS4_MAX_UINT64); 596 + 597 + encode_stateid4(xdr, &ls->ls_recall_sid); 598 + 599 + hdr->nops++; 600 + } 601 + 602 + static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, 603 + struct xdr_stream *xdr, 604 + const struct nfsd4_callback *cb) 605 + { 606 + const struct nfs4_layout_stateid *ls = 607 + container_of(cb, struct nfs4_layout_stateid, ls_recall); 608 + struct nfs4_cb_compound_hdr hdr = { 609 + .ident = 0, 610 + .minorversion = cb->cb_minorversion, 611 + }; 612 + 613 + encode_cb_compound4args(xdr, &hdr); 614 + encode_cb_sequence4args(xdr, cb, &hdr); 615 + encode_cb_layout4args(xdr, ls, &hdr); 616 + 
encode_cb_nops(&hdr); 617 + } 618 + 619 + static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, 620 + struct xdr_stream *xdr, 621 + struct nfsd4_callback *cb) 622 + { 623 + struct nfs4_cb_compound_hdr hdr; 624 + enum nfsstat4 nfserr; 625 + int status; 626 + 627 + status = decode_cb_compound4res(xdr, &hdr); 628 + if (unlikely(status)) 629 + goto out; 630 + if (cb) { 631 + status = decode_cb_sequence4res(xdr, cb); 632 + if (unlikely(status)) 633 + goto out; 634 + } 635 + status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr); 636 + if (unlikely(status)) 637 + goto out; 638 + if (unlikely(nfserr != NFS4_OK)) 639 + status = nfs_cb_stat_to_errno(nfserr); 640 + out: 641 + return status; 642 + } 643 + #endif /* CONFIG_NFSD_PNFS */ 644 + 549 645 /* 550 646 * RPC procedure tables 551 647 */ ··· 659 563 static struct rpc_procinfo nfs4_cb_procedures[] = { 660 564 PROC(CB_NULL, NULL, cb_null, cb_null), 661 565 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), 566 + #ifdef CONFIG_NFSD_PNFS 567 + PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), 568 + #endif 662 569 }; 663 570 664 571 static struct rpc_version nfs_cb_version4 = {
+721
fs/nfsd/nfs4layouts.c
··· 1 + /* 2 + * Copyright (c) 2014 Christoph Hellwig. 3 + */ 4 + #include <linux/kmod.h> 5 + #include <linux/file.h> 6 + #include <linux/jhash.h> 7 + #include <linux/sched.h> 8 + #include <linux/sunrpc/addr.h> 9 + 10 + #include "pnfs.h" 11 + #include "netns.h" 12 + #include "trace.h" 13 + 14 + #define NFSDDBG_FACILITY NFSDDBG_PNFS 15 + 16 + struct nfs4_layout { 17 + struct list_head lo_perstate; 18 + struct nfs4_layout_stateid *lo_state; 19 + struct nfsd4_layout_seg lo_seg; 20 + }; 21 + 22 + static struct kmem_cache *nfs4_layout_cache; 23 + static struct kmem_cache *nfs4_layout_stateid_cache; 24 + 25 + static struct nfsd4_callback_ops nfsd4_cb_layout_ops; 26 + static const struct lock_manager_operations nfsd4_layouts_lm_ops; 27 + 28 + const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { 29 + [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, 30 + }; 31 + 32 + /* pNFS device ID to export fsid mapping */ 33 + #define DEVID_HASH_BITS 8 34 + #define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS) 35 + #define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1) 36 + static u64 nfsd_devid_seq = 1; 37 + static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE]; 38 + static DEFINE_SPINLOCK(nfsd_devid_lock); 39 + 40 + static inline u32 devid_hashfn(u64 idx) 41 + { 42 + return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK; 43 + } 44 + 45 + static void 46 + nfsd4_alloc_devid_map(const struct svc_fh *fhp) 47 + { 48 + const struct knfsd_fh *fh = &fhp->fh_handle; 49 + size_t fsid_len = key_len(fh->fh_fsid_type); 50 + struct nfsd4_deviceid_map *map, *old; 51 + int i; 52 + 53 + map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL); 54 + if (!map) 55 + return; 56 + 57 + map->fsid_type = fh->fh_fsid_type; 58 + memcpy(&map->fsid, fh->fh_fsid, fsid_len); 59 + 60 + spin_lock(&nfsd_devid_lock); 61 + if (fhp->fh_export->ex_devid_map) 62 + goto out_unlock; 63 + 64 + for (i = 0; i < DEVID_HASH_SIZE; i++) { 65 + list_for_each_entry(old, &nfsd_devid_hash[i], hash) { 66 + if (old->fsid_type != 
fh->fh_fsid_type) 67 + continue; 68 + if (memcmp(old->fsid, fh->fh_fsid, 69 + key_len(old->fsid_type))) 70 + continue; 71 + 72 + fhp->fh_export->ex_devid_map = old; 73 + goto out_unlock; 74 + } 75 + } 76 + 77 + map->idx = nfsd_devid_seq++; 78 + list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]); 79 + fhp->fh_export->ex_devid_map = map; 80 + map = NULL; 81 + 82 + out_unlock: 83 + spin_unlock(&nfsd_devid_lock); 84 + kfree(map); 85 + } 86 + 87 + struct nfsd4_deviceid_map * 88 + nfsd4_find_devid_map(int idx) 89 + { 90 + struct nfsd4_deviceid_map *map, *ret = NULL; 91 + 92 + rcu_read_lock(); 93 + list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash) 94 + if (map->idx == idx) 95 + ret = map; 96 + rcu_read_unlock(); 97 + 98 + return ret; 99 + } 100 + 101 + int 102 + nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, 103 + u32 device_generation) 104 + { 105 + if (!fhp->fh_export->ex_devid_map) { 106 + nfsd4_alloc_devid_map(fhp); 107 + if (!fhp->fh_export->ex_devid_map) 108 + return -ENOMEM; 109 + } 110 + 111 + id->fsid_idx = fhp->fh_export->ex_devid_map->idx; 112 + id->generation = device_generation; 113 + id->pad = 0; 114 + return 0; 115 + } 116 + 117 + void nfsd4_setup_layout_type(struct svc_export *exp) 118 + { 119 + struct super_block *sb = exp->ex_path.mnt->mnt_sb; 120 + 121 + if (exp->ex_flags & NFSEXP_NOPNFS) 122 + return; 123 + 124 + if (sb->s_export_op->get_uuid && 125 + sb->s_export_op->map_blocks && 126 + sb->s_export_op->commit_blocks) 127 + exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; 128 + } 129 + 130 + static void 131 + nfsd4_free_layout_stateid(struct nfs4_stid *stid) 132 + { 133 + struct nfs4_layout_stateid *ls = layoutstateid(stid); 134 + struct nfs4_client *clp = ls->ls_stid.sc_client; 135 + struct nfs4_file *fp = ls->ls_stid.sc_file; 136 + 137 + trace_layoutstate_free(&ls->ls_stid.sc_stateid); 138 + 139 + spin_lock(&clp->cl_lock); 140 + list_del_init(&ls->ls_perclnt); 141 + 
spin_unlock(&clp->cl_lock); 142 + 143 + spin_lock(&fp->fi_lock); 144 + list_del_init(&ls->ls_perfile); 145 + spin_unlock(&fp->fi_lock); 146 + 147 + vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); 148 + fput(ls->ls_file); 149 + 150 + if (ls->ls_recalled) 151 + atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); 152 + 153 + kmem_cache_free(nfs4_layout_stateid_cache, ls); 154 + } 155 + 156 + static int 157 + nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) 158 + { 159 + struct file_lock *fl; 160 + int status; 161 + 162 + fl = locks_alloc_lock(); 163 + if (!fl) 164 + return -ENOMEM; 165 + locks_init_lock(fl); 166 + fl->fl_lmops = &nfsd4_layouts_lm_ops; 167 + fl->fl_flags = FL_LAYOUT; 168 + fl->fl_type = F_RDLCK; 169 + fl->fl_end = OFFSET_MAX; 170 + fl->fl_owner = ls; 171 + fl->fl_pid = current->tgid; 172 + fl->fl_file = ls->ls_file; 173 + 174 + status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); 175 + if (status) { 176 + locks_free_lock(fl); 177 + return status; 178 + } 179 + BUG_ON(fl != NULL); 180 + return 0; 181 + } 182 + 183 + static struct nfs4_layout_stateid * 184 + nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, 185 + struct nfs4_stid *parent, u32 layout_type) 186 + { 187 + struct nfs4_client *clp = cstate->clp; 188 + struct nfs4_file *fp = parent->sc_file; 189 + struct nfs4_layout_stateid *ls; 190 + struct nfs4_stid *stp; 191 + 192 + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); 193 + if (!stp) 194 + return NULL; 195 + stp->sc_free = nfsd4_free_layout_stateid; 196 + get_nfs4_file(fp); 197 + stp->sc_file = fp; 198 + 199 + ls = layoutstateid(stp); 200 + INIT_LIST_HEAD(&ls->ls_perclnt); 201 + INIT_LIST_HEAD(&ls->ls_perfile); 202 + spin_lock_init(&ls->ls_lock); 203 + INIT_LIST_HEAD(&ls->ls_layouts); 204 + ls->ls_layout_type = layout_type; 205 + nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, 206 + NFSPROC4_CLNT_CB_LAYOUT); 207 + 208 + if (parent->sc_type == NFS4_DELEG_STID) 209 + ls->ls_file = 
get_file(fp->fi_deleg_file); 210 + else 211 + ls->ls_file = find_any_file(fp); 212 + BUG_ON(!ls->ls_file); 213 + 214 + if (nfsd4_layout_setlease(ls)) { 215 + put_nfs4_file(fp); 216 + kmem_cache_free(nfs4_layout_stateid_cache, ls); 217 + return NULL; 218 + } 219 + 220 + spin_lock(&clp->cl_lock); 221 + stp->sc_type = NFS4_LAYOUT_STID; 222 + list_add(&ls->ls_perclnt, &clp->cl_lo_states); 223 + spin_unlock(&clp->cl_lock); 224 + 225 + spin_lock(&fp->fi_lock); 226 + list_add(&ls->ls_perfile, &fp->fi_lo_states); 227 + spin_unlock(&fp->fi_lock); 228 + 229 + trace_layoutstate_alloc(&ls->ls_stid.sc_stateid); 230 + return ls; 231 + } 232 + 233 + __be32 234 + nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, 235 + struct nfsd4_compound_state *cstate, stateid_t *stateid, 236 + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp) 237 + { 238 + struct nfs4_layout_stateid *ls; 239 + struct nfs4_stid *stid; 240 + unsigned char typemask = NFS4_LAYOUT_STID; 241 + __be32 status; 242 + 243 + if (create) 244 + typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); 245 + 246 + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, 247 + net_generic(SVC_NET(rqstp), nfsd_net_id)); 248 + if (status) 249 + goto out; 250 + 251 + if (!fh_match(&cstate->current_fh.fh_handle, 252 + &stid->sc_file->fi_fhandle)) { 253 + status = nfserr_bad_stateid; 254 + goto out_put_stid; 255 + } 256 + 257 + if (stid->sc_type != NFS4_LAYOUT_STID) { 258 + ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); 259 + nfs4_put_stid(stid); 260 + 261 + status = nfserr_jukebox; 262 + if (!ls) 263 + goto out; 264 + } else { 265 + ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); 266 + 267 + status = nfserr_bad_stateid; 268 + if (stateid->si_generation > stid->sc_stateid.si_generation) 269 + goto out_put_stid; 270 + if (layout_type != ls->ls_layout_type) 271 + goto out_put_stid; 272 + } 273 + 274 + *lsp = ls; 275 + return 0; 276 + 277 + out_put_stid: 278 + 
nfs4_put_stid(stid); 279 + out: 280 + return status; 281 + } 282 + 283 + static void 284 + nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) 285 + { 286 + spin_lock(&ls->ls_lock); 287 + if (ls->ls_recalled) 288 + goto out_unlock; 289 + 290 + ls->ls_recalled = true; 291 + atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); 292 + if (list_empty(&ls->ls_layouts)) 293 + goto out_unlock; 294 + 295 + trace_layout_recall(&ls->ls_stid.sc_stateid); 296 + 297 + atomic_inc(&ls->ls_stid.sc_count); 298 + update_stateid(&ls->ls_stid.sc_stateid); 299 + memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); 300 + nfsd4_run_cb(&ls->ls_recall); 301 + 302 + out_unlock: 303 + spin_unlock(&ls->ls_lock); 304 + } 305 + 306 + static inline u64 307 + layout_end(struct nfsd4_layout_seg *seg) 308 + { 309 + u64 end = seg->offset + seg->length; 310 + return end >= seg->offset ? end : NFS4_MAX_UINT64; 311 + } 312 + 313 + static void 314 + layout_update_len(struct nfsd4_layout_seg *lo, u64 end) 315 + { 316 + if (end == NFS4_MAX_UINT64) 317 + lo->length = NFS4_MAX_UINT64; 318 + else 319 + lo->length = end - lo->offset; 320 + } 321 + 322 + static bool 323 + layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s) 324 + { 325 + if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode) 326 + return false; 327 + if (layout_end(&lo->lo_seg) <= s->offset) 328 + return false; 329 + if (layout_end(s) <= lo->lo_seg.offset) 330 + return false; 331 + return true; 332 + } 333 + 334 + static bool 335 + layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new) 336 + { 337 + if (lo->iomode != new->iomode) 338 + return false; 339 + if (layout_end(new) < lo->offset) 340 + return false; 341 + if (layout_end(lo) < new->offset) 342 + return false; 343 + 344 + lo->offset = min(lo->offset, new->offset); 345 + layout_update_len(lo, max(layout_end(lo), layout_end(new))); 346 + return true; 347 + } 348 + 349 + static __be32 350 + nfsd4_recall_conflict(struct 
nfs4_layout_stateid *ls) 351 + { 352 + struct nfs4_file *fp = ls->ls_stid.sc_file; 353 + struct nfs4_layout_stateid *l, *n; 354 + __be32 nfserr = nfs_ok; 355 + 356 + assert_spin_locked(&fp->fi_lock); 357 + 358 + list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) { 359 + if (l != ls) { 360 + nfsd4_recall_file_layout(l); 361 + nfserr = nfserr_recallconflict; 362 + } 363 + } 364 + 365 + return nfserr; 366 + } 367 + 368 + __be32 369 + nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) 370 + { 371 + struct nfsd4_layout_seg *seg = &lgp->lg_seg; 372 + struct nfs4_file *fp = ls->ls_stid.sc_file; 373 + struct nfs4_layout *lp, *new = NULL; 374 + __be32 nfserr; 375 + 376 + spin_lock(&fp->fi_lock); 377 + nfserr = nfsd4_recall_conflict(ls); 378 + if (nfserr) 379 + goto out; 380 + spin_lock(&ls->ls_lock); 381 + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { 382 + if (layouts_try_merge(&lp->lo_seg, seg)) 383 + goto done; 384 + } 385 + spin_unlock(&ls->ls_lock); 386 + spin_unlock(&fp->fi_lock); 387 + 388 + new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); 389 + if (!new) 390 + return nfserr_jukebox; 391 + memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); 392 + new->lo_state = ls; 393 + 394 + spin_lock(&fp->fi_lock); 395 + nfserr = nfsd4_recall_conflict(ls); 396 + if (nfserr) 397 + goto out; 398 + spin_lock(&ls->ls_lock); 399 + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { 400 + if (layouts_try_merge(&lp->lo_seg, seg)) 401 + goto done; 402 + } 403 + 404 + atomic_inc(&ls->ls_stid.sc_count); 405 + list_add_tail(&new->lo_perstate, &ls->ls_layouts); 406 + new = NULL; 407 + done: 408 + update_stateid(&ls->ls_stid.sc_stateid); 409 + memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); 410 + spin_unlock(&ls->ls_lock); 411 + out: 412 + spin_unlock(&fp->fi_lock); 413 + if (new) 414 + kmem_cache_free(nfs4_layout_cache, new); 415 + return nfserr; 416 + } 417 + 418 + static void 419 + nfsd4_free_layouts(struct list_head 
*reaplist) 420 + { 421 + while (!list_empty(reaplist)) { 422 + struct nfs4_layout *lp = list_first_entry(reaplist, 423 + struct nfs4_layout, lo_perstate); 424 + 425 + list_del(&lp->lo_perstate); 426 + nfs4_put_stid(&lp->lo_state->ls_stid); 427 + kmem_cache_free(nfs4_layout_cache, lp); 428 + } 429 + } 430 + 431 + static void 432 + nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg, 433 + struct list_head *reaplist) 434 + { 435 + struct nfsd4_layout_seg *lo = &lp->lo_seg; 436 + u64 end = layout_end(lo); 437 + 438 + if (seg->offset <= lo->offset) { 439 + if (layout_end(seg) >= end) { 440 + list_move_tail(&lp->lo_perstate, reaplist); 441 + return; 442 + } 443 + end = seg->offset; 444 + } else { 445 + /* retain the whole layout segment on a split. */ 446 + if (layout_end(seg) < end) { 447 + dprintk("%s: split not supported\n", __func__); 448 + return; 449 + } 450 + 451 + lo->offset = layout_end(seg); 452 + } 453 + 454 + layout_update_len(lo, end); 455 + } 456 + 457 + __be32 458 + nfsd4_return_file_layouts(struct svc_rqst *rqstp, 459 + struct nfsd4_compound_state *cstate, 460 + struct nfsd4_layoutreturn *lrp) 461 + { 462 + struct nfs4_layout_stateid *ls; 463 + struct nfs4_layout *lp, *n; 464 + LIST_HEAD(reaplist); 465 + __be32 nfserr; 466 + int found = 0; 467 + 468 + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid, 469 + false, lrp->lr_layout_type, 470 + &ls); 471 + if (nfserr) { 472 + trace_layout_return_lookup_fail(&lrp->lr_sid); 473 + return nfserr; 474 + } 475 + 476 + spin_lock(&ls->ls_lock); 477 + list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) { 478 + if (layouts_overlapping(lp, &lrp->lr_seg)) { 479 + nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist); 480 + found++; 481 + } 482 + } 483 + if (!list_empty(&ls->ls_layouts)) { 484 + if (found) { 485 + update_stateid(&ls->ls_stid.sc_stateid); 486 + memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid, 487 + sizeof(stateid_t)); 488 + } 489 + lrp->lrs_present = 1; 
490 + } else { 491 + trace_layoutstate_unhash(&ls->ls_stid.sc_stateid); 492 + nfs4_unhash_stid(&ls->ls_stid); 493 + lrp->lrs_present = 0; 494 + } 495 + spin_unlock(&ls->ls_lock); 496 + 497 + nfs4_put_stid(&ls->ls_stid); 498 + nfsd4_free_layouts(&reaplist); 499 + return nfs_ok; 500 + } 501 + 502 + __be32 503 + nfsd4_return_client_layouts(struct svc_rqst *rqstp, 504 + struct nfsd4_compound_state *cstate, 505 + struct nfsd4_layoutreturn *lrp) 506 + { 507 + struct nfs4_layout_stateid *ls, *n; 508 + struct nfs4_client *clp = cstate->clp; 509 + struct nfs4_layout *lp, *t; 510 + LIST_HEAD(reaplist); 511 + 512 + lrp->lrs_present = 0; 513 + 514 + spin_lock(&clp->cl_lock); 515 + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { 516 + if (lrp->lr_return_type == RETURN_FSID && 517 + !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, 518 + &cstate->current_fh.fh_handle)) 519 + continue; 520 + 521 + spin_lock(&ls->ls_lock); 522 + list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) { 523 + if (lrp->lr_seg.iomode == IOMODE_ANY || 524 + lrp->lr_seg.iomode == lp->lo_seg.iomode) 525 + list_move_tail(&lp->lo_perstate, &reaplist); 526 + } 527 + spin_unlock(&ls->ls_lock); 528 + } 529 + spin_unlock(&clp->cl_lock); 530 + 531 + nfsd4_free_layouts(&reaplist); 532 + return 0; 533 + } 534 + 535 + static void 536 + nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls, 537 + struct list_head *reaplist) 538 + { 539 + spin_lock(&ls->ls_lock); 540 + list_splice_init(&ls->ls_layouts, reaplist); 541 + spin_unlock(&ls->ls_lock); 542 + } 543 + 544 + void 545 + nfsd4_return_all_client_layouts(struct nfs4_client *clp) 546 + { 547 + struct nfs4_layout_stateid *ls, *n; 548 + LIST_HEAD(reaplist); 549 + 550 + spin_lock(&clp->cl_lock); 551 + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) 552 + nfsd4_return_all_layouts(ls, &reaplist); 553 + spin_unlock(&clp->cl_lock); 554 + 555 + nfsd4_free_layouts(&reaplist); 556 + } 557 + 558 + void 559 + 
nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) 560 + { 561 + struct nfs4_layout_stateid *ls, *n; 562 + LIST_HEAD(reaplist); 563 + 564 + spin_lock(&fp->fi_lock); 565 + list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) { 566 + if (ls->ls_stid.sc_client == clp) 567 + nfsd4_return_all_layouts(ls, &reaplist); 568 + } 569 + spin_unlock(&fp->fi_lock); 570 + 571 + nfsd4_free_layouts(&reaplist); 572 + } 573 + 574 + static void 575 + nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) 576 + { 577 + struct nfs4_client *clp = ls->ls_stid.sc_client; 578 + char addr_str[INET6_ADDRSTRLEN]; 579 + static char *envp[] = { 580 + "HOME=/", 581 + "TERM=linux", 582 + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", 583 + NULL 584 + }; 585 + char *argv[8]; 586 + int error; 587 + 588 + rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); 589 + 590 + 591 + 592 + printk(KERN_WARNING 593 + "nfsd: client %s failed to respond to layout recall. " 594 + " Fencing..\n", addr_str); 595 + 596 + argv[0] = "/sbin/nfsd-recall-failed"; 597 + argv[1] = addr_str; 598 + argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; 599 + argv[3] = NULL; 600 + 601 + error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); 602 + if (error) { 603 + printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n", 604 + addr_str, error); 605 + } 606 + } 607 + 608 + static int 609 + nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) 610 + { 611 + struct nfs4_layout_stateid *ls = 612 + container_of(cb, struct nfs4_layout_stateid, ls_recall); 613 + LIST_HEAD(reaplist); 614 + 615 + switch (task->tk_status) { 616 + case 0: 617 + return 1; 618 + case -NFS4ERR_NOMATCHING_LAYOUT: 619 + trace_layout_recall_done(&ls->ls_stid.sc_stateid); 620 + task->tk_status = 0; 621 + return 1; 622 + case -NFS4ERR_DELAY: 623 + /* Poll the client until it's done with the layout */ 624 + /* FIXME: cap number of retries. 
625 + * The pnfs standard states that we should only expire 626 + * the client after at least "lease time", e.g. lease-time * 2, 627 + * when failing to communicate a recall. 628 + */ 629 + rpc_delay(task, HZ/100); /* 10 milliseconds */ 630 + return 0; 631 + default: 632 + /* 633 + * Unknown error or non-responding client, we'll need to fence. 634 + */ 635 + nfsd4_cb_layout_fail(ls); 636 + return -1; 637 + } 638 + } 639 + 640 + static void 641 + nfsd4_cb_layout_release(struct nfsd4_callback *cb) 642 + { 643 + struct nfs4_layout_stateid *ls = 644 + container_of(cb, struct nfs4_layout_stateid, ls_recall); 645 + LIST_HEAD(reaplist); 646 + 647 + trace_layout_recall_release(&ls->ls_stid.sc_stateid); 648 + 649 + nfsd4_return_all_layouts(ls, &reaplist); 650 + nfsd4_free_layouts(&reaplist); 651 + nfs4_put_stid(&ls->ls_stid); 652 + } 653 + 654 + static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { 655 + .done = nfsd4_cb_layout_done, 656 + .release = nfsd4_cb_layout_release, 657 + }; 658 + 659 + static bool 660 + nfsd4_layout_lm_break(struct file_lock *fl) 661 + { 662 + /* 663 + * We don't want the locks code to time out the lease for us; 664 + * we'll remove it ourselves if a layout isn't returned 665 + * in time: 666 + */ 667 + fl->fl_break_time = 0; 668 + nfsd4_recall_file_layout(fl->fl_owner); 669 + return false; 670 + } 671 + 672 + static int 673 + nfsd4_layout_lm_change(struct file_lock *onlist, int arg, 674 + struct list_head *dispose) 675 + { 676 + BUG_ON(!(arg & F_UNLCK)); 677 + return lease_modify(onlist, arg, dispose); 678 + } 679 + 680 + static const struct lock_manager_operations nfsd4_layouts_lm_ops = { 681 + .lm_break = nfsd4_layout_lm_break, 682 + .lm_change = nfsd4_layout_lm_change, 683 + }; 684 + 685 + int 686 + nfsd4_init_pnfs(void) 687 + { 688 + int i; 689 + 690 + for (i = 0; i < DEVID_HASH_SIZE; i++) 691 + INIT_LIST_HEAD(&nfsd_devid_hash[i]); 692 + 693 + nfs4_layout_cache = kmem_cache_create("nfs4_layout", 694 + sizeof(struct nfs4_layout), 0, 0, NULL); 695
+ if (!nfs4_layout_cache) 696 + return -ENOMEM; 697 + 698 + nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", 699 + sizeof(struct nfs4_layout_stateid), 0, 0, NULL); 700 + if (!nfs4_layout_stateid_cache) { 701 + kmem_cache_destroy(nfs4_layout_cache); 702 + return -ENOMEM; 703 + } 704 + return 0; 705 + } 706 + 707 + void 708 + nfsd4_exit_pnfs(void) 709 + { 710 + int i; 711 + 712 + kmem_cache_destroy(nfs4_layout_cache); 713 + kmem_cache_destroy(nfs4_layout_stateid_cache); 714 + 715 + for (i = 0; i < DEVID_HASH_SIZE; i++) { 716 + struct nfsd4_deviceid_map *map, *n; 717 + 718 + list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash) 719 + kfree(map); 720 + } 721 + }
+310
fs/nfsd/nfs4proc.c
··· 43 43 #include "current_stateid.h" 44 44 #include "netns.h" 45 45 #include "acl.h" 46 + #include "pnfs.h" 47 + #include "trace.h" 46 48 47 49 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL 48 50 #include <linux/security.h> ··· 1180 1178 return status == nfserr_same ? nfs_ok : status; 1181 1179 } 1182 1180 1181 + #ifdef CONFIG_NFSD_PNFS 1182 + static const struct nfsd4_layout_ops * 1183 + nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) 1184 + { 1185 + if (!exp->ex_layout_type) { 1186 + dprintk("%s: export does not support pNFS\n", __func__); 1187 + return NULL; 1188 + } 1189 + 1190 + if (exp->ex_layout_type != layout_type) { 1191 + dprintk("%s: layout type %d not supported\n", 1192 + __func__, layout_type); 1193 + return NULL; 1194 + } 1195 + 1196 + return nfsd4_layout_ops[layout_type]; 1197 + } 1198 + 1199 + static __be32 1200 + nfsd4_getdeviceinfo(struct svc_rqst *rqstp, 1201 + struct nfsd4_compound_state *cstate, 1202 + struct nfsd4_getdeviceinfo *gdp) 1203 + { 1204 + const struct nfsd4_layout_ops *ops; 1205 + struct nfsd4_deviceid_map *map; 1206 + struct svc_export *exp; 1207 + __be32 nfserr; 1208 + 1209 + dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n", 1210 + __func__, 1211 + gdp->gd_layout_type, 1212 + gdp->gd_devid.fsid_idx, gdp->gd_devid.generation, 1213 + gdp->gd_maxcount); 1214 + 1215 + map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx); 1216 + if (!map) { 1217 + dprintk("%s: couldn't find device ID to export mapping!\n", 1218 + __func__); 1219 + return nfserr_noent; 1220 + } 1221 + 1222 + exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid); 1223 + if (IS_ERR(exp)) { 1224 + dprintk("%s: could not find device id\n", __func__); 1225 + return nfserr_noent; 1226 + } 1227 + 1228 + nfserr = nfserr_layoutunavailable; 1229 + ops = nfsd4_layout_verify(exp, gdp->gd_layout_type); 1230 + if (!ops) 1231 + goto out; 1232 + 1233 + nfserr = nfs_ok; 1234 + if (gdp->gd_maxcount != 0) 1235 + nfserr = 
ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); 1236 + 1237 + gdp->gd_notify_types &= ops->notify_types; 1238 + exp_put(exp); 1239 + out: 1240 + return nfserr; 1241 + } 1242 + 1243 + static __be32 1244 + nfsd4_layoutget(struct svc_rqst *rqstp, 1245 + struct nfsd4_compound_state *cstate, 1246 + struct nfsd4_layoutget *lgp) 1247 + { 1248 + struct svc_fh *current_fh = &cstate->current_fh; 1249 + const struct nfsd4_layout_ops *ops; 1250 + struct nfs4_layout_stateid *ls; 1251 + __be32 nfserr; 1252 + int accmode; 1253 + 1254 + switch (lgp->lg_seg.iomode) { 1255 + case IOMODE_READ: 1256 + accmode = NFSD_MAY_READ; 1257 + break; 1258 + case IOMODE_RW: 1259 + accmode = NFSD_MAY_READ | NFSD_MAY_WRITE; 1260 + break; 1261 + default: 1262 + dprintk("%s: invalid iomode %d\n", 1263 + __func__, lgp->lg_seg.iomode); 1264 + nfserr = nfserr_badiomode; 1265 + goto out; 1266 + } 1267 + 1268 + nfserr = fh_verify(rqstp, current_fh, 0, accmode); 1269 + if (nfserr) 1270 + goto out; 1271 + 1272 + nfserr = nfserr_layoutunavailable; 1273 + ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type); 1274 + if (!ops) 1275 + goto out; 1276 + 1277 + /* 1278 + * Verify minlength and range as per RFC5661: 1279 + * o If loga_length is less than loga_minlength, 1280 + * the metadata server MUST return NFS4ERR_INVAL. 1281 + * o If the sum of loga_offset and loga_minlength exceeds 1282 + * NFS4_UINT64_MAX, and loga_minlength is not 1283 + * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result. 1284 + * o If the sum of loga_offset and loga_length exceeds 1285 + * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX, 1286 + * the error NFS4ERR_INVAL MUST result. 
1287 + */ 1288 + nfserr = nfserr_inval; 1289 + if (lgp->lg_seg.length < lgp->lg_minlength || 1290 + (lgp->lg_minlength != NFS4_MAX_UINT64 && 1291 + lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) || 1292 + (lgp->lg_seg.length != NFS4_MAX_UINT64 && 1293 + lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset)) 1294 + goto out; 1295 + if (lgp->lg_seg.length == 0) 1296 + goto out; 1297 + 1298 + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, 1299 + true, lgp->lg_layout_type, &ls); 1300 + if (nfserr) { 1301 + trace_layout_get_lookup_fail(&lgp->lg_sid); 1302 + goto out; 1303 + } 1304 + 1305 + nfserr = nfserr_recallconflict; 1306 + if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) 1307 + goto out_put_stid; 1308 + 1309 + nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode, 1310 + current_fh, lgp); 1311 + if (nfserr) 1312 + goto out_put_stid; 1313 + 1314 + nfserr = nfsd4_insert_layout(lgp, ls); 1315 + 1316 + out_put_stid: 1317 + nfs4_put_stid(&ls->ls_stid); 1318 + out: 1319 + return nfserr; 1320 + } 1321 + 1322 + static __be32 1323 + nfsd4_layoutcommit(struct svc_rqst *rqstp, 1324 + struct nfsd4_compound_state *cstate, 1325 + struct nfsd4_layoutcommit *lcp) 1326 + { 1327 + const struct nfsd4_layout_seg *seg = &lcp->lc_seg; 1328 + struct svc_fh *current_fh = &cstate->current_fh; 1329 + const struct nfsd4_layout_ops *ops; 1330 + loff_t new_size = lcp->lc_last_wr + 1; 1331 + struct inode *inode; 1332 + struct nfs4_layout_stateid *ls; 1333 + __be32 nfserr; 1334 + 1335 + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE); 1336 + if (nfserr) 1337 + goto out; 1338 + 1339 + nfserr = nfserr_layoutunavailable; 1340 + ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); 1341 + if (!ops) 1342 + goto out; 1343 + inode = current_fh->fh_dentry->d_inode; 1344 + 1345 + nfserr = nfserr_inval; 1346 + if (new_size <= seg->offset) { 1347 + dprintk("pnfsd: last write before layout segment\n"); 1348 + goto out; 1349 + } 1350 
+ if (new_size > seg->offset + seg->length) { 1351 + dprintk("pnfsd: last write beyond layout segment\n"); 1352 + goto out; 1353 + } 1354 + if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { 1355 + dprintk("pnfsd: layoutcommit beyond EOF\n"); 1356 + goto out; 1357 + } 1358 + 1359 + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, 1360 + false, lcp->lc_layout_type, 1361 + &ls); 1362 + if (nfserr) { 1363 + trace_layout_commit_lookup_fail(&lcp->lc_sid); 1364 + /* fixup error code as per RFC5661 */ 1365 + if (nfserr == nfserr_bad_stateid) 1366 + nfserr = nfserr_badlayout; 1367 + goto out; 1368 + } 1369 + 1370 + nfserr = ops->proc_layoutcommit(inode, lcp); 1371 + if (nfserr) 1372 + goto out_put_stid; 1373 + 1374 + if (new_size > i_size_read(inode)) { 1375 + lcp->lc_size_chg = 1; 1376 + lcp->lc_newsize = new_size; 1377 + } else { 1378 + lcp->lc_size_chg = 0; 1379 + } 1380 + 1381 + out_put_stid: 1382 + nfs4_put_stid(&ls->ls_stid); 1383 + out: 1384 + return nfserr; 1385 + } 1386 + 1387 + static __be32 1388 + nfsd4_layoutreturn(struct svc_rqst *rqstp, 1389 + struct nfsd4_compound_state *cstate, 1390 + struct nfsd4_layoutreturn *lrp) 1391 + { 1392 + struct svc_fh *current_fh = &cstate->current_fh; 1393 + __be32 nfserr; 1394 + 1395 + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); 1396 + if (nfserr) 1397 + goto out; 1398 + 1399 + nfserr = nfserr_layoutunavailable; 1400 + if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type)) 1401 + goto out; 1402 + 1403 + switch (lrp->lr_seg.iomode) { 1404 + case IOMODE_READ: 1405 + case IOMODE_RW: 1406 + case IOMODE_ANY: 1407 + break; 1408 + default: 1409 + dprintk("%s: invalid iomode %d\n", __func__, 1410 + lrp->lr_seg.iomode); 1411 + nfserr = nfserr_inval; 1412 + goto out; 1413 + } 1414 + 1415 + switch (lrp->lr_return_type) { 1416 + case RETURN_FILE: 1417 + nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp); 1418 + break; 1419 + case RETURN_FSID: 1420 + case RETURN_ALL: 1421 + nfserr 
= nfsd4_return_client_layouts(rqstp, cstate, lrp); 1422 + break; 1423 + default: 1424 + dprintk("%s: invalid return_type %d\n", __func__, 1425 + lrp->lr_return_type); 1426 + nfserr = nfserr_inval; 1427 + break; 1428 + } 1429 + out: 1430 + return nfserr; 1431 + } 1432 + #endif /* CONFIG_NFSD_PNFS */ 1433 + 1183 1434 /* 1184 1435 * NULL call. 1185 1436 */ ··· 1934 1679 op_encode_channel_attrs_maxsz) * sizeof(__be32); 1935 1680 } 1936 1681 1682 + #ifdef CONFIG_NFSD_PNFS 1683 + /* 1684 + * At this stage we don't really know what layout driver will handle the request, 1685 + * so we need to define an arbitrary upper bound here. 1686 + */ 1687 + #define MAX_LAYOUT_SIZE 128 1688 + static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1689 + { 1690 + return (op_encode_hdr_size + 1691 + 1 /* logr_return_on_close */ + 1692 + op_encode_stateid_maxsz + 1693 + 1 /* nr of layouts */ + 1694 + MAX_LAYOUT_SIZE) * sizeof(__be32); 1695 + } 1696 + 1697 + static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1698 + { 1699 + return (op_encode_hdr_size + 1700 + 1 /* locr_newsize */ + 1701 + 2 /* ns_size */) * sizeof(__be32); 1702 + } 1703 + 1704 + static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1705 + { 1706 + return (op_encode_hdr_size + 1707 + 1 /* lrs_stateid */ + 1708 + op_encode_stateid_maxsz) * sizeof(__be32); 1709 + } 1710 + #endif /* CONFIG_NFSD_PNFS */ 1711 + 1937 1712 static struct nfsd4_operation nfsd4_ops[] = { 1938 1713 [OP_ACCESS] = { 1939 1714 .op_func = (nfsd4op_func)nfsd4_access, ··· 2251 1966 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 2252 1967 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 2253 1968 }, 1969 + #ifdef CONFIG_NFSD_PNFS 1970 + [OP_GETDEVICEINFO] = { 1971 + .op_func = (nfsd4op_func)nfsd4_getdeviceinfo, 1972 + .op_flags = ALLOWED_WITHOUT_FH, 1973 + .op_name = "OP_GETDEVICEINFO", 1974 + }, 1975 + [OP_LAYOUTGET] = { 1976 + 
.op_func = (nfsd4op_func)nfsd4_layoutget, 1977 + .op_flags = OP_MODIFIES_SOMETHING, 1978 + .op_name = "OP_LAYOUTGET", 1979 + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize, 1980 + }, 1981 + [OP_LAYOUTCOMMIT] = { 1982 + .op_func = (nfsd4op_func)nfsd4_layoutcommit, 1983 + .op_flags = OP_MODIFIES_SOMETHING, 1984 + .op_name = "OP_LAYOUTCOMMIT", 1985 + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize, 1986 + }, 1987 + [OP_LAYOUTRETURN] = { 1988 + .op_func = (nfsd4op_func)nfsd4_layoutreturn, 1989 + .op_flags = OP_MODIFIES_SOMETHING, 1990 + .op_name = "OP_LAYOUTRETURN", 1991 + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize, 1992 + }, 1993 + #endif /* CONFIG_NFSD_PNFS */ 2254 1994 2255 1995 /* NFSv4.2 operations */ 2256 1996 [OP_ALLOCATE] = {
+36 -40
fs/nfsd/nfs4state.c
··· 48 48 #include "current_stateid.h" 49 49 50 50 #include "netns.h" 51 + #include "pnfs.h" 51 52 52 53 #define NFSDDBG_FACILITY NFSDDBG_PROC 53 54 ··· 149 148 clp->cl_clientid.cl_id); 150 149 list_move_tail(&clp->cl_lru, &nn->client_lru); 151 150 clp->cl_time = get_seconds(); 152 - } 153 - 154 - static inline void 155 - renew_client(struct nfs4_client *clp) 156 - { 157 - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 158 - 159 - spin_lock(&nn->client_lock); 160 - renew_client_locked(clp); 161 - spin_unlock(&nn->client_lock); 162 151 } 163 152 164 153 static void put_client_renew_locked(struct nfs4_client *clp) ··· 273 282 kmem_cache_free(file_slab, fp); 274 283 } 275 284 276 - static inline void 285 + void 277 286 put_nfs4_file(struct nfs4_file *fi) 278 287 { 279 288 might_lock(&state_lock); ··· 284 293 WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); 285 294 call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); 286 295 } 287 - } 288 - 289 - static inline void 290 - get_nfs4_file(struct nfs4_file *fi) 291 - { 292 - atomic_inc(&fi->fi_ref); 293 296 } 294 297 295 298 static struct file * ··· 343 358 return ret; 344 359 } 345 360 346 - static struct file * 361 + struct file * 347 362 find_any_file(struct nfs4_file *f) 348 363 { 349 364 struct file *ret; ··· 391 406 static unsigned int file_hashval(struct knfsd_fh *fh) 392 407 { 393 408 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); 394 - } 395 - 396 - static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) 397 - { 398 - return fh1->fh_size == fh2->fh_size && 399 - !memcmp(fh1->fh_base.fh_pad, 400 - fh2->fh_base.fh_pad, 401 - fh1->fh_size); 402 409 } 403 410 404 411 static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; ··· 471 494 __nfs4_file_put_access(fp, O_RDONLY); 472 495 } 473 496 474 - static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, 497 + struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, 475 498 struct kmem_cache *slab) 476 499 { 477 500 struct nfs4_stid *stid; ··· 
665 688 struct file *filp = NULL; 666 689 667 690 spin_lock(&fp->fi_lock); 668 - if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees)) 691 + if (fp->fi_deleg_file && --fp->fi_delegees == 0) 669 692 swap(filp, fp->fi_deleg_file); 670 693 spin_unlock(&fp->fi_lock); 671 694 672 695 if (filp) { 673 - vfs_setlease(filp, F_UNLCK, NULL, NULL); 696 + vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp); 674 697 fput(filp); 675 698 } 676 699 } 677 700 678 - static void unhash_stid(struct nfs4_stid *s) 701 + void nfs4_unhash_stid(struct nfs4_stid *s) 679 702 { 680 703 s->sc_type = 0; 681 704 } ··· 983 1006 984 1007 list_del_init(&stp->st_locks); 985 1008 unhash_ol_stateid(stp); 986 - unhash_stid(&stp->st_stid); 1009 + nfs4_unhash_stid(&stp->st_stid); 987 1010 } 988 1011 989 1012 static void release_lock_stateid(struct nfs4_ol_stateid *stp) ··· 1495 1518 static int 1496 1519 STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) 1497 1520 { 1498 - if (clid->cl_boot == nn->boot_time) 1521 + /* 1522 + * We're assuming the clid was not given out from a boot 1523 + * precisely 2^32 (about 136 years) before this one. 
That seems 1524 + * a safe assumption: 1525 + */ 1526 + if (clid->cl_boot == (u32)nn->boot_time) 1499 1527 return 0; 1500 1528 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", 1501 1529 clid->cl_boot, clid->cl_id, nn->boot_time); ··· 1540 1558 INIT_LIST_HEAD(&clp->cl_lru); 1541 1559 INIT_LIST_HEAD(&clp->cl_callbacks); 1542 1560 INIT_LIST_HEAD(&clp->cl_revoked); 1561 + #ifdef CONFIG_NFSD_PNFS 1562 + INIT_LIST_HEAD(&clp->cl_lo_states); 1563 + #endif 1543 1564 spin_lock_init(&clp->cl_lock); 1544 1565 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1545 1566 return clp; ··· 1647 1662 nfs4_get_stateowner(&oo->oo_owner); 1648 1663 release_openowner(oo); 1649 1664 } 1665 + nfsd4_return_all_client_layouts(clp); 1650 1666 nfsd4_shutdown_callback(clp); 1651 1667 if (clp->cl_cb_conn.cb_xprt) 1652 1668 svc_xprt_put(clp->cl_cb_conn.cb_xprt); ··· 2131 2145 static void 2132 2146 nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) 2133 2147 { 2134 - /* pNFS is not supported */ 2148 + #ifdef CONFIG_NFSD_PNFS 2149 + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS; 2150 + #else 2135 2151 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; 2152 + #endif 2136 2153 2137 2154 /* Referrals are supported, Migration is not. 
*/ 2138 2155 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; ··· 3063 3074 fp->fi_share_deny = 0; 3064 3075 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 3065 3076 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 3077 + #ifdef CONFIG_NFSD_PNFS 3078 + INIT_LIST_HEAD(&fp->fi_lo_states); 3079 + atomic_set(&fp->fi_lo_recalls, 0); 3080 + #endif 3066 3081 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); 3067 3082 } 3068 3083 ··· 3293 3300 struct nfs4_file *fp; 3294 3301 3295 3302 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { 3296 - if (nfsd_fh_match(&fp->fi_fhandle, fh)) { 3303 + if (fh_match(&fp->fi_fhandle, fh)) { 3297 3304 if (atomic_inc_not_zero(&fp->fi_ref)) 3298 3305 return fp; 3299 3306 } ··· 3301 3308 return NULL; 3302 3309 } 3303 3310 3304 - static struct nfs4_file * 3311 + struct nfs4_file * 3305 3312 find_file(struct knfsd_fh *fh) 3306 3313 { 3307 3314 struct nfs4_file *fp; ··· 3849 3856 /* Race breaker */ 3850 3857 if (fp->fi_deleg_file) { 3851 3858 status = 0; 3852 - atomic_inc(&fp->fi_delegees); 3859 + ++fp->fi_delegees; 3853 3860 hash_delegation_locked(dp, fp); 3854 3861 goto out_unlock; 3855 3862 } 3856 3863 fp->fi_deleg_file = filp; 3857 - atomic_set(&fp->fi_delegees, 1); 3864 + fp->fi_delegees = 1; 3858 3865 hash_delegation_locked(dp, fp); 3859 3866 spin_unlock(&fp->fi_lock); 3860 3867 spin_unlock(&state_lock); ··· 3895 3902 status = -EAGAIN; 3896 3903 goto out_unlock; 3897 3904 } 3898 - atomic_inc(&fp->fi_delegees); 3905 + ++fp->fi_delegees; 3899 3906 hash_delegation_locked(dp, fp); 3900 3907 status = 0; 3901 3908 out_unlock: ··· 4288 4295 4289 4296 static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 4290 4297 { 4291 - if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) 4298 + if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) 4292 4299 return nfserr_bad_stateid; 4293 4300 return nfs_ok; 4294 4301 } ··· 4439 4446 return status; 4440 4447 } 4441 4448 
4442 - static __be32 4449 + __be32 4443 4450 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, 4444 4451 stateid_t *stateid, unsigned char typemask, 4445 4452 struct nfs4_stid **s, struct nfsd_net *nn) ··· 4852 4859 goto out; 4853 4860 update_stateid(&stp->st_stid.sc_stateid); 4854 4861 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4862 + 4863 + nfsd4_return_all_file_layouts(stp->st_stateowner->so_client, 4864 + stp->st_stid.sc_file); 4855 4865 4856 4866 nfsd4_close_open_stateid(stp); 4857 4867
+342 -20
fs/nfsd/nfs4xdr.c
··· 47 47 #include "state.h" 48 48 #include "cache.h" 49 49 #include "netns.h" 50 + #include "pnfs.h" 50 51 51 52 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL 52 53 #include <linux/security.h> ··· 235 234 return ret; 236 235 } 237 236 237 + /* 238 + * We require the high 32 bits of 'seconds' to be 0, and 239 + * we ignore all 32 bits of 'nseconds'. 240 + */ 241 + static __be32 242 + nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv) 243 + { 244 + DECODE_HEAD; 245 + u64 sec; 246 + 247 + READ_BUF(12); 248 + p = xdr_decode_hyper(p, &sec); 249 + tv->tv_sec = sec; 250 + tv->tv_nsec = be32_to_cpup(p++); 251 + if (tv->tv_nsec >= (u32)1000000000) 252 + return nfserr_inval; 253 + 254 + DECODE_TAIL; 255 + } 256 + 238 257 static __be32 239 258 nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 240 259 { ··· 288 267 { 289 268 int expected_len, len = 0; 290 269 u32 dummy32; 291 - u64 sec; 292 270 char *buf; 293 271 294 272 DECODE_HEAD; ··· 378 358 dummy32 = be32_to_cpup(p++); 379 359 switch (dummy32) { 380 360 case NFS4_SET_TO_CLIENT_TIME: 381 - /* We require the high 32 bits of 'seconds' to be 0, and we ignore 382 - all 32 bits of 'nseconds'. */ 383 - READ_BUF(12); 384 361 len += 12; 385 - p = xdr_decode_hyper(p, &sec); 386 - iattr->ia_atime.tv_sec = (time_t)sec; 387 - iattr->ia_atime.tv_nsec = be32_to_cpup(p++); 388 - if (iattr->ia_atime.tv_nsec >= (u32)1000000000) 389 - return nfserr_inval; 362 + status = nfsd4_decode_time(argp, &iattr->ia_atime); 363 + if (status) 364 + return status; 390 365 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); 391 366 break; 392 367 case NFS4_SET_TO_SERVER_TIME: ··· 397 382 dummy32 = be32_to_cpup(p++); 398 383 switch (dummy32) { 399 384 case NFS4_SET_TO_CLIENT_TIME: 400 - /* We require the high 32 bits of 'seconds' to be 0, and we ignore 401 - all 32 bits of 'nseconds'. 
*/ 402 - READ_BUF(12); 403 385 len += 12; 404 - p = xdr_decode_hyper(p, &sec); 405 - iattr->ia_mtime.tv_sec = sec; 406 - iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); 407 - if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) 408 - return nfserr_inval; 386 + status = nfsd4_decode_time(argp, &iattr->ia_mtime); 387 + if (status) 388 + return status; 409 389 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); 410 390 break; 411 391 case NFS4_SET_TO_SERVER_TIME: ··· 1523 1513 DECODE_TAIL; 1524 1514 } 1525 1515 1516 + #ifdef CONFIG_NFSD_PNFS 1517 + static __be32 1518 + nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, 1519 + struct nfsd4_getdeviceinfo *gdev) 1520 + { 1521 + DECODE_HEAD; 1522 + u32 num, i; 1523 + 1524 + READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); 1525 + COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); 1526 + gdev->gd_layout_type = be32_to_cpup(p++); 1527 + gdev->gd_maxcount = be32_to_cpup(p++); 1528 + num = be32_to_cpup(p++); 1529 + if (num) { 1530 + READ_BUF(4 * num); 1531 + gdev->gd_notify_types = be32_to_cpup(p++); 1532 + for (i = 1; i < num; i++) { 1533 + if (be32_to_cpup(p++)) { 1534 + status = nfserr_inval; 1535 + goto out; 1536 + } 1537 + } 1538 + } 1539 + DECODE_TAIL; 1540 + } 1541 + 1542 + static __be32 1543 + nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, 1544 + struct nfsd4_layoutget *lgp) 1545 + { 1546 + DECODE_HEAD; 1547 + 1548 + READ_BUF(36); 1549 + lgp->lg_signal = be32_to_cpup(p++); 1550 + lgp->lg_layout_type = be32_to_cpup(p++); 1551 + lgp->lg_seg.iomode = be32_to_cpup(p++); 1552 + p = xdr_decode_hyper(p, &lgp->lg_seg.offset); 1553 + p = xdr_decode_hyper(p, &lgp->lg_seg.length); 1554 + p = xdr_decode_hyper(p, &lgp->lg_minlength); 1555 + nfsd4_decode_stateid(argp, &lgp->lg_sid); 1556 + READ_BUF(4); 1557 + lgp->lg_maxcount = be32_to_cpup(p++); 1558 + 1559 + DECODE_TAIL; 1560 + } 1561 + 1562 + static __be32 1563 + nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, 1564 + struct nfsd4_layoutcommit *lcp) 1565 + 
{ 1566 + DECODE_HEAD; 1567 + u32 timechange; 1568 + 1569 + READ_BUF(20); 1570 + p = xdr_decode_hyper(p, &lcp->lc_seg.offset); 1571 + p = xdr_decode_hyper(p, &lcp->lc_seg.length); 1572 + lcp->lc_reclaim = be32_to_cpup(p++); 1573 + nfsd4_decode_stateid(argp, &lcp->lc_sid); 1574 + READ_BUF(4); 1575 + lcp->lc_newoffset = be32_to_cpup(p++); 1576 + if (lcp->lc_newoffset) { 1577 + READ_BUF(8); 1578 + p = xdr_decode_hyper(p, &lcp->lc_last_wr); 1579 + } else 1580 + lcp->lc_last_wr = 0; 1581 + READ_BUF(4); 1582 + timechange = be32_to_cpup(p++); 1583 + if (timechange) { 1584 + status = nfsd4_decode_time(argp, &lcp->lc_mtime); 1585 + if (status) 1586 + return status; 1587 + } else { 1588 + lcp->lc_mtime.tv_nsec = UTIME_NOW; 1589 + } 1590 + READ_BUF(8); 1591 + lcp->lc_layout_type = be32_to_cpup(p++); 1592 + 1593 + /* 1594 + * Save the layout update in XDR format and let the layout driver deal 1595 + * with it later. 1596 + */ 1597 + lcp->lc_up_len = be32_to_cpup(p++); 1598 + if (lcp->lc_up_len > 0) { 1599 + READ_BUF(lcp->lc_up_len); 1600 + READMEM(lcp->lc_up_layout, lcp->lc_up_len); 1601 + } 1602 + 1603 + DECODE_TAIL; 1604 + } 1605 + 1606 + static __be32 1607 + nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, 1608 + struct nfsd4_layoutreturn *lrp) 1609 + { 1610 + DECODE_HEAD; 1611 + 1612 + READ_BUF(16); 1613 + lrp->lr_reclaim = be32_to_cpup(p++); 1614 + lrp->lr_layout_type = be32_to_cpup(p++); 1615 + lrp->lr_seg.iomode = be32_to_cpup(p++); 1616 + lrp->lr_return_type = be32_to_cpup(p++); 1617 + if (lrp->lr_return_type == RETURN_FILE) { 1618 + READ_BUF(16); 1619 + p = xdr_decode_hyper(p, &lrp->lr_seg.offset); 1620 + p = xdr_decode_hyper(p, &lrp->lr_seg.length); 1621 + nfsd4_decode_stateid(argp, &lrp->lr_sid); 1622 + READ_BUF(4); 1623 + lrp->lrf_body_len = be32_to_cpup(p++); 1624 + if (lrp->lrf_body_len > 0) { 1625 + READ_BUF(lrp->lrf_body_len); 1626 + READMEM(lrp->lrf_body, lrp->lrf_body_len); 1627 + } 1628 + } else { 1629 + lrp->lr_seg.offset = 0; 1630 + 
lrp->lr_seg.length = NFS4_MAX_UINT64; 1631 + } 1632 + 1633 + DECODE_TAIL; 1634 + } 1635 + #endif /* CONFIG_NFSD_PNFS */ 1636 + 1526 1637 static __be32 1527 1638 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, 1528 1639 struct nfsd4_fallocate *fallocate) ··· 1738 1607 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1739 1608 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, 1740 1609 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1610 + #ifdef CONFIG_NFSD_PNFS 1611 + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, 1612 + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, 1613 + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, 1614 + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, 1615 + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, 1616 + #else 1741 1617 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, 1742 1618 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, 1743 1619 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1744 1620 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1745 1621 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1622 + #endif 1746 1623 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, 1747 1624 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1748 1625 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, ··· 2678 2539 get_parent_attributes(exp, &stat); 2679 2540 p = xdr_encode_hyper(p, stat.ino); 2680 2541 } 2542 + #ifdef CONFIG_NFSD_PNFS 2543 + if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) || 2544 + (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) { 2545 + if (exp->ex_layout_type) { 2546 + p = xdr_reserve_space(xdr, 8); 2547 + if (!p) 2548 + goto out_resource; 2549 + *p++ = cpu_to_be32(1); 2550 + *p++ = cpu_to_be32(exp->ex_layout_type); 2551 + } else { 2552 + p = xdr_reserve_space(xdr, 4); 2553 + if (!p) 2554 + goto out_resource; 2555 + *p++ = cpu_to_be32(0); 2556 + } 2557 + } 2558 + 2559 + if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { 2560 + p = 
xdr_reserve_space(xdr, 4); 2561 + if (!p) 2562 + goto out_resource; 2563 + *p++ = cpu_to_be32(stat.blksize); 2564 + } 2565 + #endif /* CONFIG_NFSD_PNFS */ 2681 2566 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { 2682 2567 status = nfsd4_encode_security_label(xdr, rqstp, context, 2683 2568 contextlen); ··· 2931 2768 if (entry_bytes > cd->rd_maxcount) 2932 2769 goto fail; 2933 2770 cd->rd_maxcount -= entry_bytes; 2934 - if (!cd->rd_dircount) 2935 - goto fail; 2936 2771 /* 2937 2772 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so 2938 2773 * let's always let through the first entry, at least: 2939 2774 */ 2940 - name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; 2775 + if (!cd->rd_dircount) 2776 + goto fail; 2777 + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; 2941 2778 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) 2942 2779 goto fail; 2943 2780 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); 2781 + 2944 2782 cd->cookie_offset = cookie_offset; 2945 2783 skip_entry: 2946 2784 cd->common.err = nfs_ok; ··· 3978 3814 return nfserr; 3979 3815 } 3980 3816 3817 + #ifdef CONFIG_NFSD_PNFS 3818 + static __be32 3819 + nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, 3820 + struct nfsd4_getdeviceinfo *gdev) 3821 + { 3822 + struct xdr_stream *xdr = &resp->xdr; 3823 + const struct nfsd4_layout_ops *ops = 3824 + nfsd4_layout_ops[gdev->gd_layout_type]; 3825 + u32 starting_len = xdr->buf->len, needed_len; 3826 + __be32 *p; 3827 + 3828 + dprintk("%s: err %d\n", __func__, nfserr); 3829 + if (nfserr) 3830 + goto out; 3831 + 3832 + nfserr = nfserr_resource; 3833 + p = xdr_reserve_space(xdr, 4); 3834 + if (!p) 3835 + goto out; 3836 + 3837 + *p++ = cpu_to_be32(gdev->gd_layout_type); 3838 + 3839 + /* If maxcount is 0 then just update notifications */ 3840 + if (gdev->gd_maxcount != 0) { 3841 + nfserr = ops->encode_getdeviceinfo(xdr, gdev); 3842 + if (nfserr) { 3843 + /* 3844 + * We don't bother to burden the layout drivers with 
3845 + * enforcing gd_maxcount, just tell the client to 3846 + * come back with a bigger buffer if it's not enough. 3847 + */ 3848 + if (xdr->buf->len + 4 > gdev->gd_maxcount) 3849 + goto toosmall; 3850 + goto out; 3851 + } 3852 + } 3853 + 3854 + nfserr = nfserr_resource; 3855 + if (gdev->gd_notify_types) { 3856 + p = xdr_reserve_space(xdr, 4 + 4); 3857 + if (!p) 3858 + goto out; 3859 + *p++ = cpu_to_be32(1); /* bitmap length */ 3860 + *p++ = cpu_to_be32(gdev->gd_notify_types); 3861 + } else { 3862 + p = xdr_reserve_space(xdr, 4); 3863 + if (!p) 3864 + goto out; 3865 + *p++ = 0; 3866 + } 3867 + 3868 + nfserr = 0; 3869 + out: 3870 + kfree(gdev->gd_device); 3871 + dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr)); 3872 + return nfserr; 3873 + 3874 + toosmall: 3875 + dprintk("%s: maxcount too small\n", __func__); 3876 + needed_len = xdr->buf->len + 4 /* notifications */; 3877 + xdr_truncate_encode(xdr, starting_len); 3878 + p = xdr_reserve_space(xdr, 4); 3879 + if (!p) { 3880 + nfserr = nfserr_resource; 3881 + } else { 3882 + *p++ = cpu_to_be32(needed_len); 3883 + nfserr = nfserr_toosmall; 3884 + } 3885 + goto out; 3886 + } 3887 + 3888 + static __be32 3889 + nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, 3890 + struct nfsd4_layoutget *lgp) 3891 + { 3892 + struct xdr_stream *xdr = &resp->xdr; 3893 + const struct nfsd4_layout_ops *ops = 3894 + nfsd4_layout_ops[lgp->lg_layout_type]; 3895 + __be32 *p; 3896 + 3897 + dprintk("%s: err %d\n", __func__, nfserr); 3898 + if (nfserr) 3899 + goto out; 3900 + 3901 + nfserr = nfserr_resource; 3902 + p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); 3903 + if (!p) 3904 + goto out; 3905 + 3906 + *p++ = cpu_to_be32(1); /* we always set return-on-close */ 3907 + *p++ = cpu_to_be32(lgp->lg_sid.si_generation); 3908 + p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, 3909 + sizeof(stateid_opaque_t)); 3910 + 3911 + *p++ = cpu_to_be32(1); /* we always return a single layout */ 3912 + p = 
xdr_encode_hyper(p, lgp->lg_seg.offset); 3913 + p = xdr_encode_hyper(p, lgp->lg_seg.length); 3914 + *p++ = cpu_to_be32(lgp->lg_seg.iomode); 3915 + *p++ = cpu_to_be32(lgp->lg_layout_type); 3916 + 3917 + nfserr = ops->encode_layoutget(xdr, lgp); 3918 + out: 3919 + kfree(lgp->lg_content); 3920 + return nfserr; 3921 + } 3922 + 3923 + static __be32 3924 + nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, 3925 + struct nfsd4_layoutcommit *lcp) 3926 + { 3927 + struct xdr_stream *xdr = &resp->xdr; 3928 + __be32 *p; 3929 + 3930 + if (nfserr) 3931 + return nfserr; 3932 + 3933 + p = xdr_reserve_space(xdr, 4); 3934 + if (!p) 3935 + return nfserr_resource; 3936 + *p++ = cpu_to_be32(lcp->lc_size_chg); 3937 + if (lcp->lc_size_chg) { 3938 + p = xdr_reserve_space(xdr, 8); 3939 + if (!p) 3940 + return nfserr_resource; 3941 + p = xdr_encode_hyper(p, lcp->lc_newsize); 3942 + } 3943 + 3944 + return nfs_ok; 3945 + } 3946 + 3947 + static __be32 3948 + nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, 3949 + struct nfsd4_layoutreturn *lrp) 3950 + { 3951 + struct xdr_stream *xdr = &resp->xdr; 3952 + __be32 *p; 3953 + 3954 + if (nfserr) 3955 + return nfserr; 3956 + 3957 + p = xdr_reserve_space(xdr, 4); 3958 + if (!p) 3959 + return nfserr_resource; 3960 + *p++ = cpu_to_be32(lrp->lrs_present); 3961 + if (lrp->lrs_present) 3962 + nfsd4_encode_stateid(xdr, &lrp->lr_sid); 3963 + return nfs_ok; 3964 + } 3965 + #endif /* CONFIG_NFSD_PNFS */ 3966 + 3981 3967 static __be32 3982 3968 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, 3983 3969 struct nfsd4_seek *seek) ··· 4204 3890 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 4205 3891 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 4206 3892 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 3893 + #ifdef CONFIG_NFSD_PNFS 3894 + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, 3895 + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, 3896 + [OP_LAYOUTCOMMIT] = 
(nfsd4_enc)nfsd4_encode_layoutcommit, 3897 + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, 3898 + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, 3899 + #else 4207 3900 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, 4208 3901 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, 4209 3902 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 4210 3903 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 4211 3904 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 3905 + #endif 4212 3906 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, 4213 3907 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 4214 3908 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
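The nfstime4 decoding that this series factors into nfsd4_decode_time() reads a fixed 12-byte quantity: a big-endian 64-bit seconds field followed by a big-endian 32-bit nseconds field that must stay below 10^9. A rough userspace sketch of that wire format follows; the function name and raw-buffer handling are illustrative, not the kernel's XDR stream API:

```c
#include <assert.h>
#include <stdint.h>

/* Illustrative decoder for the 12-byte nfstime4 wire format handled by
 * nfsd4_decode_time(): 8 bytes of big-endian seconds, then 4 bytes of
 * big-endian nseconds, which must be < 1000000000. Returns 0 on
 * success, -1 if the time is malformed. */
static int sketch_decode_nfstime4(const uint8_t *p, int64_t *sec,
				  uint32_t *nsec)
{
	uint64_t s = 0;
	uint32_t ns = 0;
	int i;

	for (i = 0; i < 8; i++)		/* seconds, network byte order */
		s = (s << 8) | p[i];
	for (i = 8; i < 12; i++)	/* nseconds, network byte order */
		ns = (ns << 8) | p[i];
	if (ns >= 1000000000u)		/* same bound the kernel enforces */
		return -1;
	*sec = (int64_t)s;
	*nsec = ns;
	return 0;
}
```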
+8 -1
fs/nfsd/nfsctl.c
··· 21 21 #include "cache.h" 22 22 #include "state.h" 23 23 #include "netns.h" 24 + #include "pnfs.h" 24 25 25 26 /* 26 27 * We have a single directory with several nodes in it. ··· 1259 1258 retval = nfsd4_init_slabs(); 1260 1259 if (retval) 1261 1260 goto out_unregister_pernet; 1262 - retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ 1261 + retval = nfsd4_init_pnfs(); 1263 1262 if (retval) 1264 1263 goto out_free_slabs; 1264 + retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ 1265 + if (retval) 1266 + goto out_exit_pnfs; 1265 1267 nfsd_stat_init(); /* Statistics */ 1266 1268 retval = nfsd_reply_cache_init(); 1267 1269 if (retval) ··· 1286 1282 out_free_stat: 1287 1283 nfsd_stat_shutdown(); 1288 1284 nfsd_fault_inject_cleanup(); 1285 + out_exit_pnfs: 1286 + nfsd4_exit_pnfs(); 1289 1287 out_free_slabs: 1290 1288 nfsd4_free_slabs(); 1291 1289 out_unregister_pernet: ··· 1305 1299 nfsd_stat_shutdown(); 1306 1300 nfsd_lockd_shutdown(); 1307 1301 nfsd4_free_slabs(); 1302 + nfsd4_exit_pnfs(); 1308 1303 nfsd_fault_inject_cleanup(); 1309 1304 unregister_filesystem(&nfsd_fs_type); 1310 1305 unregister_pernet_subsys(&nfsd_net_ops);
+14 -2
fs/nfsd/nfsd.h
··· 325 325 326 326 #define NFSD4_SUPPORTED_ATTRS_WORD2 0 327 327 328 + /* 4.1 */ 329 + #ifdef CONFIG_NFSD_PNFS 330 + #define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES 331 + #define PNFSD_SUPPORTED_ATTRS_WORD2 \ 332 + (FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES) 333 + #else 334 + #define PNFSD_SUPPORTED_ATTRS_WORD1 0 335 + #define PNFSD_SUPPORTED_ATTRS_WORD2 0 336 + #endif /* CONFIG_NFSD_PNFS */ 337 + 328 338 #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ 329 339 NFSD4_SUPPORTED_ATTRS_WORD0 330 340 331 341 #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ 332 - NFSD4_SUPPORTED_ATTRS_WORD1 342 + (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1) 333 343 334 344 #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ 335 - (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) 345 + (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \ 346 + FATTR4_WORD2_SUPPATTR_EXCLCREAT) 336 347 348 + /* 4.2 */ 337 349 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL 338 350 #define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL 339 351 #else
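The new PNFSD_SUPPORTED_ATTRS_WORD2 mask is plain bit arithmetic over the two pNFS word-2 attribute bits; a quick self-contained check (constants copied from the nfs4.h part of this series, names prefixed to mark them as local sketches):

```c
#include <assert.h>

/* Local copies of the two pNFS word-2 attribute bits (values from the
 * nfs4.h hunk in this series); the PNFSD mask is simply their union. */
#define FATTR4_WORD2_LAYOUT_TYPES	(1UL << 0)
#define FATTR4_WORD2_LAYOUT_BLKSIZE	(1UL << 1)

#define SKETCH_PNFSD_SUPPORTED_ATTRS_WORD2 \
	(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES)
```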
+18
fs/nfsd/nfsfh.h
··· 187 187 return fhp; 188 188 } 189 189 190 + static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) 191 + { 192 + if (fh1->fh_size != fh2->fh_size) 193 + return false; 194 + if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0) 195 + return false; 196 + return true; 197 + } 198 + 199 + static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) 200 + { 201 + if (fh1->fh_fsid_type != fh2->fh_fsid_type) 202 + return false; 203 + if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0) 204 + return false; 205 + return true; 206 + } 207 + 
190 208 #ifdef CONFIG_NFSD_V3 
191 209 /* 
192 210 * The wcc data stored in current_fh should be cleared
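The compare-size-then-memcmp pattern in the new fh_match()/fh_fsid_match() helpers is easy to get subtly wrong: the third memcmp() argument must be the full key length, and a misplaced parenthesis such as `memcmp(a, b, len != 0)` compares a single byte. A minimal userspace sketch of the intended logic, using a simplified, hypothetical handle layout rather than the real knfsd_fh:

```c
#include <assert.h>
#include <stdbool.h>
#include <string.h>

/* Simplified, hypothetical handle layout; the real knfsd_fh differs. */
struct sketch_fh {
	unsigned int fh_size;
	unsigned char fh_pad[64];
};

/* Same shape as fh_match(): sizes must agree and every byte of the
 * handle must compare equal. Note the length argument: writing
 * memcmp(a, b, len != 0) instead of memcmp(a, b, len) != 0 would
 * compare just one byte and treat distinct handles as equal. */
static bool sketch_fh_match(const struct sketch_fh *fh1,
			    const struct sketch_fh *fh2)
{
	if (fh1->fh_size != fh2->fh_size)
		return false;
	return memcmp(fh1->fh_pad, fh2->fh_pad, fh1->fh_size) == 0;
}
```

With the one-byte compare, the two handles below that share only their first byte would wrongly match.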
+1
fs/nfsd/nfssvc.c
··· 119 119 static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { 120 120 [0] = 1, 121 121 [1] = 1, 122 + [2] = 1, 122 123 }; 123 124 124 125 int nfsd_vers(int vers, enum vers_op change)
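With `[2] = 1` added, the minor-version table simply answers "is NFSv4.x enabled by default". Sketched in userspace below; the array and helper names are illustrative, and NFSD_SUPPORTED_MINOR_VERSION is assumed to be 2 once v4.2 is on:

```c
#include <assert.h>
#include <stdbool.h>

/* Assumed value of NFSD_SUPPORTED_MINOR_VERSION once v4.2 is enabled. */
#define SKETCH_SUPPORTED_MINOR_VERSION 2

/* Mirror of the nfsd_supported_minorversions table after this change:
 * minor versions 0, 1 and now 2 all default to on. */
static bool sketch_minorversions[SKETCH_SUPPORTED_MINOR_VERSION + 1] = {
	[0] = true,
	[1] = true,
	[2] = true,
};

static bool sketch_minorversion_enabled(unsigned int minor)
{
	return minor <= SKETCH_SUPPORTED_MINOR_VERSION &&
	       sketch_minorversions[minor];
}
```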
+81
fs/nfsd/pnfs.h
··· 1 + #ifndef _FS_NFSD_PNFS_H 2 + #define _FS_NFSD_PNFS_H 1 3 + 4 + #include <linux/exportfs.h> 5 + #include <linux/nfsd/export.h> 6 + 7 + #include "state.h" 8 + #include "xdr4.h" 9 + 10 + struct xdr_stream; 11 + 12 + struct nfsd4_deviceid_map { 13 + struct list_head hash; 14 + u64 idx; 15 + int fsid_type; 16 + u32 fsid[]; 17 + }; 18 + 19 + struct nfsd4_layout_ops { 20 + u32 notify_types; 21 + 22 + __be32 (*proc_getdeviceinfo)(struct super_block *sb, 23 + struct nfsd4_getdeviceinfo *gdevp); 24 + __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, 25 + struct nfsd4_getdeviceinfo *gdevp); 26 + 27 + __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, 28 + struct nfsd4_layoutget *lgp); 29 + __be32 (*encode_layoutget)(struct xdr_stream *, 30 + struct nfsd4_layoutget *lgp); 31 + 32 + __be32 (*proc_layoutcommit)(struct inode *inode, 33 + struct nfsd4_layoutcommit *lcp); 34 + }; 35 + 36 + extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; 37 + extern const struct nfsd4_layout_ops bl_layout_ops; 38 + 39 + __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, 40 + struct nfsd4_compound_state *cstate, stateid_t *stateid, 41 + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp); 42 + __be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp, 43 + struct nfs4_layout_stateid *ls); 44 + __be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp, 45 + struct nfsd4_compound_state *cstate, 46 + struct nfsd4_layoutreturn *lrp); 47 + __be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp, 48 + struct nfsd4_compound_state *cstate, 49 + struct nfsd4_layoutreturn *lrp); 50 + int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, 51 + u32 device_generation); 52 + struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx); 53 + 54 + #ifdef CONFIG_NFSD_PNFS 55 + void nfsd4_setup_layout_type(struct svc_export *exp); 56 + void nfsd4_return_all_client_layouts(struct nfs4_client *); 57 + void nfsd4_return_all_file_layouts(struct 
nfs4_client *clp, 58 + struct nfs4_file *fp); 59 + int nfsd4_init_pnfs(void); 60 + void nfsd4_exit_pnfs(void); 61 + #else 62 + static inline void nfsd4_setup_layout_type(struct svc_export *exp) 63 + { 64 + } 65 + 66 + static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp) 67 + { 68 + } 69 + static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, 70 + struct nfs4_file *fp) 71 + { 72 + } 73 + static inline void nfsd4_exit_pnfs(void) 74 + { 75 + } 76 + static inline int nfsd4_init_pnfs(void) 77 + { 78 + return 0; 79 + } 80 + #endif /* CONFIG_NFSD_PNFS */ 81 + #endif /* _FS_NFSD_PNFS_H */
+42 -1
fs/nfsd/state.h
··· 92 92 /* For a deleg stateid kept around only to process free_stateid's: */ 93 93 #define NFS4_REVOKED_DELEG_STID 16 94 94 #define NFS4_CLOSED_DELEG_STID 32 95 + #define NFS4_LAYOUT_STID 64 95 96 unsigned char sc_type; 96 97 stateid_t sc_stateid; 97 98 struct nfs4_client *sc_client; ··· 298 297 struct list_head cl_delegations; 299 298 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ 300 299 struct list_head cl_lru; /* tail queue */ 300 + #ifdef CONFIG_NFSD_PNFS 301 + struct list_head cl_lo_states; /* outstanding layout states */ 302 + #endif 301 303 struct xdr_netobj cl_name; /* id generated by client */ 302 304 nfs4_verifier cl_verifier; /* generated by client */ 303 305 time_t cl_time; /* time of last lease renewal */ ··· 497 493 atomic_t fi_access[2]; 498 494 u32 fi_share_deny; 499 495 struct file *fi_deleg_file; 500 - atomic_t fi_delegees; 496 + int fi_delegees; 501 497 struct knfsd_fh fi_fhandle; 502 498 bool fi_had_conflict; 499 + #ifdef CONFIG_NFSD_PNFS 500 + struct list_head fi_lo_states; 501 + atomic_t fi_lo_recalls; 502 + #endif 503 503 }; 504 504 505 505 /* ··· 536 528 return container_of(s, struct nfs4_ol_stateid, st_stid); 537 529 } 538 530 531 + struct nfs4_layout_stateid { 532 + struct nfs4_stid ls_stid; 533 + struct list_head ls_perclnt; 534 + struct list_head ls_perfile; 535 + spinlock_t ls_lock; 536 + struct list_head ls_layouts; 537 + u32 ls_layout_type; 538 + struct file *ls_file; 539 + struct nfsd4_callback ls_recall; 540 + stateid_t ls_recall_sid; 541 + bool ls_recalled; 542 + }; 543 + 544 + static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) 545 + { 546 + return container_of(s, struct nfs4_layout_stateid, ls_stid); 547 + } 548 + 539 549 /* flags for preprocess_seqid_op() */ 540 550 #define RD_STATE 0x00000010 541 551 #define WR_STATE 0x00000020 ··· 561 535 enum nfsd4_cb_op { 562 536 NFSPROC4_CLNT_CB_NULL = 0, 563 537 NFSPROC4_CLNT_CB_RECALL, 538 + NFSPROC4_CLNT_CB_LAYOUT, 564 539 
NFSPROC4_CLNT_CB_SEQUENCE, 565 540 }; 566 541 ··· 572 545 extern __be32 nfs4_preprocess_stateid_op(struct net *net, 573 546 struct nfsd4_compound_state *cstate, 574 547 stateid_t *stateid, int flags, struct file **filp); 548 + __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, 549 + stateid_t *stateid, unsigned char typemask, 550 + struct nfs4_stid **s, struct nfsd_net *nn); 551 + struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, 552 + struct kmem_cache *slab); 553 + void nfs4_unhash_stid(struct nfs4_stid *s); 575 554 void nfs4_put_stid(struct nfs4_stid *s); 576 555 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); 577 556 extern void nfs4_release_reclaim(struct nfsd_net *); ··· 599 566 extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, 600 567 struct nfsd_net *nn); 601 568 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); 569 + 570 + struct nfs4_file *find_file(struct knfsd_fh *fh); 571 + void put_nfs4_file(struct nfs4_file *fi); 572 + static inline void get_nfs4_file(struct nfs4_file *fi) 573 + { 574 + atomic_inc(&fi->fi_ref); 575 + } 576 + struct file *find_any_file(struct nfs4_file *f); 602 577 603 578 /* grace period management */ 604 579 void nfsd4_end_grace(struct nfsd_net *nn);
+5
fs/nfsd/trace.c
··· 1 + 2 + #include "state.h" 3 + 4 + #define CREATE_TRACE_POINTS 5 + #include "trace.h"
+54
fs/nfsd/trace.h
··· 1 + /* 2 + * Copyright (c) 2014 Christoph Hellwig. 3 + */ 4 + #undef TRACE_SYSTEM 5 + #define TRACE_SYSTEM nfsd 6 + 7 + #if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) 8 + #define _NFSD_TRACE_H 9 + 10 + #include <linux/tracepoint.h> 11 + 12 + DECLARE_EVENT_CLASS(nfsd_stateid_class, 13 + TP_PROTO(stateid_t *stp), 14 + TP_ARGS(stp), 15 + TP_STRUCT__entry( 16 + __field(u32, cl_boot) 17 + __field(u32, cl_id) 18 + __field(u32, si_id) 19 + __field(u32, si_generation) 20 + ), 21 + TP_fast_assign( 22 + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; 23 + __entry->cl_id = stp->si_opaque.so_clid.cl_id; 24 + __entry->si_id = stp->si_opaque.so_id; 25 + __entry->si_generation = stp->si_generation; 26 + ), 27 + TP_printk("client %08x:%08x stateid %08x:%08x", 28 + __entry->cl_boot, 29 + __entry->cl_id, 30 + __entry->si_id, 31 + __entry->si_generation) 32 + ) 33 + 34 + #define DEFINE_STATEID_EVENT(name) \ 35 + DEFINE_EVENT(nfsd_stateid_class, name, \ 36 + TP_PROTO(stateid_t *stp), \ 37 + TP_ARGS(stp)) 38 + DEFINE_STATEID_EVENT(layoutstate_alloc); 39 + DEFINE_STATEID_EVENT(layoutstate_unhash); 40 + DEFINE_STATEID_EVENT(layoutstate_free); 41 + DEFINE_STATEID_EVENT(layout_get_lookup_fail); 42 + DEFINE_STATEID_EVENT(layout_commit_lookup_fail); 43 + DEFINE_STATEID_EVENT(layout_return_lookup_fail); 44 + DEFINE_STATEID_EVENT(layout_recall); 45 + DEFINE_STATEID_EVENT(layout_recall_done); 46 + DEFINE_STATEID_EVENT(layout_recall_fail); 47 + DEFINE_STATEID_EVENT(layout_recall_release); 48 + 49 + #endif /* _NFSD_TRACE_H */ 50 + 51 + #undef TRACE_INCLUDE_PATH 52 + #define TRACE_INCLUDE_PATH . 53 + #define TRACE_INCLUDE_FILE trace 54 + #include <trace/define_trace.h>
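Every DEFINE_STATEID_EVENT instance above emits the same four fields through the class's TP_printk format. The formatting itself can be mimicked in userspace (simplified flat stateid struct and hypothetical names; the kernel's stateid_t nests these fields differently):

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Simplified stateid fields, flattened for the sketch. */
struct sketch_stateid {
	unsigned int cl_boot;
	unsigned int cl_id;
	unsigned int si_id;
	unsigned int si_generation;
};

/* Reproduce the TP_printk format used by nfsd_stateid_class. */
static void sketch_format_stateid(char *buf, size_t len,
				  const struct sketch_stateid *st)
{
	snprintf(buf, len, "client %08x:%08x stateid %08x:%08x",
		 st->cl_boot, st->cl_id, st->si_id, st->si_generation);
}
```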
+59
fs/nfsd/xdr4.h
··· 428 428 u32 rca_one_fs; 429 429 }; 430 430 431 + struct nfsd4_deviceid { 432 + u64 fsid_idx; 433 + u32 generation; 434 + u32 pad; 435 + }; 436 + 437 + struct nfsd4_layout_seg { 438 + u32 iomode; 439 + u64 offset; 440 + u64 length; 441 + }; 442 + 443 + struct nfsd4_getdeviceinfo { 444 + struct nfsd4_deviceid gd_devid; /* request */ 445 + u32 gd_layout_type; /* request */ 446 + u32 gd_maxcount; /* request */ 447 + u32 gd_notify_types;/* request - response */ 448 + void *gd_device; /* response */ 449 + }; 450 + 451 + struct nfsd4_layoutget { 452 + u64 lg_minlength; /* request */ 453 + u32 lg_signal; /* request */ 454 + u32 lg_layout_type; /* request */ 455 + u32 lg_maxcount; /* request */ 456 + stateid_t lg_sid; /* request/response */ 457 + struct nfsd4_layout_seg lg_seg; /* request/response */ 458 + void *lg_content; /* response */ 459 + }; 460 + 461 + struct nfsd4_layoutcommit { 462 + stateid_t lc_sid; /* request */ 463 + struct nfsd4_layout_seg lc_seg; /* request */ 464 + u32 lc_reclaim; /* request */ 465 + u32 lc_newoffset; /* request */ 466 + u64 lc_last_wr; /* request */ 467 + struct timespec lc_mtime; /* request */ 468 + u32 lc_layout_type; /* request */ 469 + u32 lc_up_len; /* layout length */ 470 + void *lc_up_layout; /* decoded by callback */ 471 + u32 lc_size_chg; /* boolean for response */ 472 + u64 lc_newsize; /* response */ 473 + }; 474 + 475 + struct nfsd4_layoutreturn { 476 + u32 lr_return_type; /* request */ 477 + u32 lr_layout_type; /* request */ 478 + struct nfsd4_layout_seg lr_seg; /* request */ 479 + u32 lr_reclaim; /* request */ 480 + u32 lrf_body_len; /* request */ 481 + void *lrf_body; /* request */ 482 + stateid_t lr_sid; /* request/response */ 483 + u32 lrs_present; /* response */ 484 + }; 485 + 431 486 struct nfsd4_fallocate { 432 487 /* request */ 433 488 stateid_t falloc_stateid; ··· 546 491 struct nfsd4_reclaim_complete reclaim_complete; 547 492 struct nfsd4_test_stateid test_stateid; 548 493 struct nfsd4_free_stateid free_stateid; 
494 + struct nfsd4_getdeviceinfo getdeviceinfo; 495 + struct nfsd4_layoutget layoutget; 496 + struct nfsd4_layoutcommit layoutcommit; 497 + struct nfsd4_layoutreturn layoutreturn; 549 498 550 499 /* NFSv4.2 */ 551 500 struct nfsd4_fallocate allocate;
+7
fs/nfsd/xdr4cb.h
··· 21 21 #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ 22 22 cb_sequence_dec_sz + \ 23 23 op_dec_sz) 24 + #define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ 25 + cb_sequence_enc_sz + \ 26 + 1 + 3 + \ 27 + enc_nfs4_fh_sz + 4) 28 + #define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ 29 + cb_sequence_dec_sz + \ 30 + op_dec_sz)
+23
include/linux/exportfs.h
··· 4 4 #include <linux/types.h> 5 5 6 6 struct dentry; 7 + struct iattr; 7 8 struct inode; 8 9 struct super_block; 9 10 struct vfsmount; ··· 181 180 * get_name is not (which is possibly inconsistent) 182 181 */ 183 182 183 + /* types of block ranges for multipage write mappings. */ 184 + #define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */ 185 + #define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */ 186 + #define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */ 187 + #define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ 188 + 189 + #define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */ 190 + 191 + struct iomap { 192 + sector_t blkno; /* first sector of mapping */ 193 + loff_t offset; /* file offset of mapping, bytes */ 194 + u64 length; /* length of mapping, bytes */ 195 + int type; /* type of mapping */ 196 + }; 197 + 184 198 struct export_operations { 185 199 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, 186 200 struct inode *parent); ··· 207 191 struct dentry *child); 208 192 struct dentry * (*get_parent)(struct dentry *child); 209 193 int (*commit_metadata)(struct inode *inode); 194 + 195 + int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); 196 + int (*map_blocks)(struct inode *inode, loff_t offset, 197 + u64 len, struct iomap *iomap, 198 + bool write, u32 *device_generation); 199 + int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, 200 + int nr_iomaps, struct iattr *iattr); 210 201 }; 211 202 212 203 extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
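A ->map_blocks() implementation describes each extent with the new struct iomap: either a hole or a mapped range starting at blkno. A self-contained userspace sketch, with local copies of the constants above; the fill helper is hypothetical, not part of the export API:

```c
#include <assert.h>

/* Local copies of the iomap definitions added to exportfs.h. */
#define IOMAP_HOLE	0x01
#define IOMAP_MAPPED	0x03
#define IOMAP_NULL_BLOCK -1LL

struct sketch_iomap {
	long long blkno;		/* first sector of mapping */
	long long offset;		/* file offset of mapping, bytes */
	unsigned long long length;	/* length of mapping, bytes */
	int type;			/* IOMAP_* type of mapping */
};

/* Hypothetical helper in the spirit of ->map_blocks(): describe the
 * extent backing [offset, offset + len) as a hole or a mapped range. */
static void sketch_fill_iomap(struct sketch_iomap *m, long long offset,
			      unsigned long long len, long long blkno)
{
	m->offset = offset;
	m->length = len;
	m->blkno = blkno;
	m->type = (blkno == IOMAP_NULL_BLOCK) ? IOMAP_HOLE : IOMAP_MAPPED;
}
```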
+16
include/linux/fs.h
··· 873 873 #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ 874 874 #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ 875 875 #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ 876 + #define FL_LAYOUT 2048 /* outstanding pNFS layout */ 876 877 877 878 /* 878 879 * Special return value from posix_lock_file() and vfs_lock_file() for ··· 2036 2035 return ret; 2037 2036 } 2038 2037 2038 + static inline int break_layout(struct inode *inode, bool wait) 2039 + { 2040 + smp_mb(); 2041 + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) 2042 + return __break_lease(inode, 2043 + wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, 2044 + FL_LAYOUT); 2045 + return 0; 2046 + } 2047 + 2039 2048 #else /* !CONFIG_FILE_LOCKING */ 2040 2049 static inline int locks_mandatory_locked(struct file *file) 2041 2050 { ··· 2098 2087 static inline int break_deleg_wait(struct inode **delegated_inode) 2099 2088 { 2100 2089 BUG(); 2090 + return 0; 2091 + } 2092 + 2093 + static inline int break_layout(struct inode *inode, bool wait) 2094 + { 2101 2095 return 0; 2102 2096 } 2103 2097
+2
include/linux/nfs4.h
··· 411 411 #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) 412 412 #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) 413 413 #define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) 414 + #define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0) 414 415 #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) 415 416 #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) 416 417 #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) ··· 518 517 LAYOUT_OSD2_OBJECTS = 2, 519 518 LAYOUT_BLOCK_VOLUME = 3, 520 519 LAYOUT_FLEX_FILES = 4, 520 + LAYOUT_TYPE_MAX 521 521 }; 522 522 523 523 /* used for both layout return and recall */
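The new LAYOUT_TYPE_MAX sentinel exists so the server can size a per-layout-type ops table and bounds-check client-supplied type numbers before indexing it. Roughly, with enum values copied from the hunk above and a hypothetical ops table and helper:

```c
#include <assert.h>
#include <stddef.h>

/* Enum values copied from the nfs4.h hunk; the _MAX sentinel is the
 * new addition used to size per-type tables. */
enum sketch_layouttype {
	SKETCH_LAYOUT_NFSV4_1_FILES = 1,
	SKETCH_LAYOUT_OSD2_OBJECTS = 2,
	SKETCH_LAYOUT_BLOCK_VOLUME = 3,
	SKETCH_LAYOUT_FLEX_FILES = 4,
	SKETCH_LAYOUT_TYPE_MAX
};

/* Hypothetical ops table, analogous in shape to nfsd4_layout_ops[];
 * a slot stays NULL until a layout driver registers. */
static const void *sketch_layout_ops[SKETCH_LAYOUT_TYPE_MAX];

/* Bounds-check a client-supplied layout type before indexing. */
static int sketch_layout_type_valid(unsigned int type)
{
	return type > 0 && type < SKETCH_LAYOUT_TYPE_MAX &&
	       sketch_layout_ops[type] != NULL;
}
```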
+1 -1
include/linux/sunrpc/svc.h
··· 110 110 * We use sv_nrthreads as a reference count. svc_destroy() drops 111 111 * this refcount, so we need to bump it up around operations that 112 112 * change the number of threads. Horrible, but there it is. 113 - * Should be called with the BKL held. 113 + * Should be called with the "service mutex" held. 114 114 */ 115 115 static inline void svc_get(struct svc_serv *serv) 116 116 {
+11 -2
include/linux/sunrpc/svc_rdma.h
··· 77 77 enum ib_wr_opcode wr_op; 78 78 enum ib_wc_status wc_status; 79 79 u32 byte_len; 80 + u32 position; 80 81 struct svcxprt_rdma *xprt; 81 82 unsigned long flags; 82 83 enum dma_data_direction direction; ··· 149 148 struct ib_cq *sc_rq_cq; 150 149 struct ib_cq *sc_sq_cq; 151 150 struct ib_mr *sc_phys_mr; /* MR for server memory */ 151 + int (*sc_reader)(struct svcxprt_rdma *, 152 + struct svc_rqst *, 153 + struct svc_rdma_op_ctxt *, 154 + int *, u32 *, u32, u32, u64, bool); 152 155 u32 sc_dev_caps; /* distilled device caps */ 153 156 u32 sc_dma_lkey; /* local dma key */ 154 157 unsigned int sc_frmr_pg_list_len; ··· 181 176 #define RPCRDMA_MAX_REQ_SIZE 4096 182 177 183 178 /* svc_rdma_marshal.c */ 184 - extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *, 185 - int *, int *); 186 179 extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); 187 180 extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); 188 181 extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, ··· 198 195 199 196 /* svc_rdma_recvfrom.c */ 200 197 extern int svc_rdma_recvfrom(struct svc_rqst *); 198 + extern int rdma_read_chunk_lcl(struct svcxprt_rdma *, struct svc_rqst *, 199 + struct svc_rdma_op_ctxt *, int *, u32 *, 200 + u32, u32, u64, bool); 201 + extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, 202 + struct svc_rdma_op_ctxt *, int *, u32 *, 203 + u32, u32, u64, bool); 201 204 202 205 /* svc_rdma_sendto.c */ 203 206 extern int svc_rdma_sendto(struct svc_rqst *);
+1
include/uapi/linux/nfsd/debug.h
··· 32 32 #define NFSDDBG_REPCACHE 0x0080 33 33 #define NFSDDBG_XDR 0x0100 34 34 #define NFSDDBG_LOCKD 0x0200 35 + #define NFSDDBG_PNFS 0x0400 35 36 #define NFSDDBG_ALL 0x7FFF 36 37 #define NFSDDBG_NOCHANGE 0xFFFF 37 38
+3 -1
include/uapi/linux/nfsd/export.h
··· 47 47 * exported filesystem. 48 48 */ 49 49 #define NFSEXP_V4ROOT 0x10000 50 + #define NFSEXP_NOPNFS 0x20000 51 + 50 52 /* All flags that we claim to support. (Note we don't support NOACL.) */ 51 - #define NFSEXP_ALLFLAGS 0x1FE7F 53 + #define NFSEXP_ALLFLAGS 0x3FE7F 52 54 53 55 /* The flags that may vary depending on security flavor: */ 54 56 #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \
+2 -2
net/sunrpc/svc.c
··· 768 768 EXPORT_SYMBOL_GPL(svc_set_num_threads); 769 769 770 770 /* 771 - * Called from a server thread as it's exiting. Caller must hold the BKL or 772 - * the "service mutex", whichever is appropriate for the service. 771 + * Called from a server thread as it's exiting. Caller must hold the "service 772 + * mutex" for the service. 773 773 */ 774 774 void 775 775 svc_exit_thread(struct svc_rqst *rqstp)
+1 -2
net/sunrpc/svc_xprt.c
··· 42 42 * svc_pool->sp_lock protects most of the fields of that pool. 43 43 * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. 44 44 * when both need to be taken (rare), svc_serv->sv_lock is first. 45 - * BKL protects svc_serv->sv_nrthread. 45 + * The "service mutex" protects svc_serv->sv_nrthread. 46 46 * svc_sock->sk_lock protects the svc_sock->sk_deferred list 47 47 * and the ->sk_info_authunix cache. 48 48 * ··· 67 67 * that no other thread will be using the transport or will 68 68 * try to set XPT_DEAD. 69 69 */ 70 - 71 70 int svc_reg_xprt_class(struct svc_xprt_class *xcl) 72 71 { 73 72 struct svc_xprt_class *cl;
-16
net/sunrpc/xprtrdma/svc_rdma_marshal.c
··· 71 71 } 72 72 73 73 /* 74 - * Determine number of chunks and total bytes in chunk list. The chunk 75 - * list has already been verified to fit within the RPCRDMA header. 76 - */ 77 - void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, 78 - int *ch_count, int *byte_count) 79 - { 80 - /* compute the number of bytes represented by read chunks */ 81 - *byte_count = 0; 82 - *ch_count = 0; 83 - for (; ch->rc_discrim != 0; ch++) { 84 - *byte_count = *byte_count + ntohl(ch->rc_target.rs_length); 85 - *ch_count = *ch_count + 1; 86 - } 87 - } 88 - 89 - /* 90 74 * Decodes a write chunk list. The expected format is as follows: 91 75 * descrim : xdr_one 92 76 * nchunks : <count>
+151 -93
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
··· 43 43 #include <linux/sunrpc/debug.h> 44 44 #include <linux/sunrpc/rpc_rdma.h> 45 45 #include <linux/spinlock.h> 46 - #include <linux/highmem.h> 47 46 #include <asm/unaligned.h> 48 47 #include <rdma/ib_verbs.h> 49 48 #include <rdma/rdma_cm.h> ··· 59 60 struct svc_rdma_op_ctxt *ctxt, 60 61 u32 byte_count) 61 62 { 63 + struct rpcrdma_msg *rmsgp; 62 64 struct page *page; 63 65 u32 bc; 64 66 int sge_no; ··· 82 82 /* If data remains, store it in the pagelist */ 83 83 rqstp->rq_arg.page_len = bc; 84 84 rqstp->rq_arg.page_base = 0; 85 - rqstp->rq_arg.pages = &rqstp->rq_pages[1]; 85 + 86 + /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ 87 + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; 88 + if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG) 89 + rqstp->rq_arg.pages = &rqstp->rq_pages[0]; 90 + else 91 + rqstp->rq_arg.pages = &rqstp->rq_pages[1]; 92 + 86 93 sge_no = 1; 87 94 while (bc && sge_no < ctxt->count) { 88 95 page = ctxt->pages[sge_no]; ··· 101 94 } 102 95 rqstp->rq_respages = &rqstp->rq_pages[sge_no]; 103 96 rqstp->rq_next_page = rqstp->rq_respages + 1; 104 - 105 - /* We should never run out of SGE because the limit is defined to 106 - * support the max allowed RPC data length 107 - */ 108 - BUG_ON(bc && (sge_no == ctxt->count)); 109 - BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) 110 - != byte_count); 111 - BUG_ON(rqstp->rq_arg.len != byte_count); 112 97 113 98 /* If not all pages were used from the SGL, free the remaining ones */ 114 99 bc = sge_no; ··· 124 125 return min_t(int, sge_count, xprt->sc_max_sge); 125 126 } 126 127 127 - typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, 128 - struct svc_rqst *rqstp, 129 - struct svc_rdma_op_ctxt *head, 130 - int *page_no, 131 - u32 *page_offset, 132 - u32 rs_handle, 133 - u32 rs_length, 134 - u64 rs_offset, 135 - int last); 136 - 137 128 /* Issue an RDMA_READ using the local lkey to map the data sink */ 138 - static int rdma_read_chunk_lcl(struct 
svcxprt_rdma *xprt, 139 - struct svc_rqst *rqstp, 140 - struct svc_rdma_op_ctxt *head, 141 - int *page_no, 142 - u32 *page_offset, 143 - u32 rs_handle, 144 - u32 rs_length, 145 - u64 rs_offset, 146 - int last) 129 + int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, 130 + struct svc_rqst *rqstp, 131 + struct svc_rdma_op_ctxt *head, 132 + int *page_no, 133 + u32 *page_offset, 134 + u32 rs_handle, 135 + u32 rs_length, 136 + u64 rs_offset, 137 + bool last) 147 138 { 148 139 struct ib_send_wr read_wr; 149 140 int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; ··· 218 229 } 219 230 220 231 /* Issue an RDMA_READ using an FRMR to map the data sink */ 221 - static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, 222 - struct svc_rqst *rqstp, 223 - struct svc_rdma_op_ctxt *head, 224 - int *page_no, 225 - u32 *page_offset, 226 - u32 rs_handle, 227 - u32 rs_length, 228 - u64 rs_offset, 229 - int last) 232 + int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, 233 + struct svc_rqst *rqstp, 234 + struct svc_rdma_op_ctxt *head, 235 + int *page_no, 236 + u32 *page_offset, 237 + u32 rs_handle, 238 + u32 rs_length, 239 + u64 rs_offset, 240 + bool last) 230 241 { 231 242 struct ib_send_wr read_wr; 232 243 struct ib_send_wr inv_wr; ··· 354 365 return ret; 355 366 } 356 367 368 + static unsigned int 369 + rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch) 370 + { 371 + unsigned int count; 372 + 373 + for (count = 0; ch->rc_discrim != xdr_zero; ch++) 374 + count++; 375 + return count; 376 + } 377 + 378 + /* If there was additional inline content, append it to the end of arg.pages. 379 + * Tail copy has to be done after the reader function has determined how many 380 + * pages are needed for RDMA READ. 
381 + */ 382 + static int 383 + rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, 384 + u32 position, u32 byte_count, u32 page_offset, int page_no) 385 + { 386 + char *srcp, *destp; 387 + int ret; 388 + 389 + ret = 0; 390 + srcp = head->arg.head[0].iov_base + position; 391 + byte_count = head->arg.head[0].iov_len - position; 392 + if (byte_count > PAGE_SIZE) { 393 + dprintk("svcrdma: large tail unsupported\n"); 394 + return 0; 395 + } 396 + 397 + /* Fit as much of the tail on the current page as possible */ 398 + if (page_offset != PAGE_SIZE) { 399 + destp = page_address(rqstp->rq_arg.pages[page_no]); 400 + destp += page_offset; 401 + while (byte_count--) { 402 + *destp++ = *srcp++; 403 + page_offset++; 404 + if (page_offset == PAGE_SIZE && byte_count) 405 + goto more; 406 + } 407 + goto done; 408 + } 409 + 410 + more: 411 + /* Fit the rest on the next page */ 412 + page_no++; 413 + destp = page_address(rqstp->rq_arg.pages[page_no]); 414 + while (byte_count--) 415 + *destp++ = *srcp++; 416 + 417 + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; 418 + rqstp->rq_next_page = rqstp->rq_respages + 1; 419 + 420 + done: 421 + byte_count = head->arg.head[0].iov_len - position; 422 + head->arg.page_len += byte_count; 423 + head->arg.len += byte_count; 424 + head->arg.buflen += byte_count; 425 + return 1; 426 + } 427 + 357 428 static int rdma_read_chunks(struct svcxprt_rdma *xprt, 358 429 struct rpcrdma_msg *rmsgp, 359 430 struct svc_rqst *rqstp, 360 431 struct svc_rdma_op_ctxt *head) 361 432 { 362 - int page_no, ch_count, ret; 433 + int page_no, ret; 363 434 struct rpcrdma_read_chunk *ch; 364 - u32 page_offset, byte_count; 435 + u32 handle, page_offset, byte_count; 436 + u32 position; 365 437 u64 rs_offset; 366 - rdma_reader_fn reader; 438 + bool last; 367 439 368 440 /* If no read list is present, return 0 */ 369 441 ch = svc_rdma_get_read_chunk(rmsgp); 370 442 if (!ch) 371 443 return 0; 372 444 373 - svc_rdma_rcl_chunk_counts(ch, &ch_count, 
&byte_count); 374 - if (ch_count > RPCSVC_MAXPAGES) 445 + if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES) 375 446 return -EINVAL; 376 447 377 448 /* The request is completed when the RDMA_READs complete. The ··· 440 391 */ 441 392 head->arg.head[0] = rqstp->rq_arg.head[0]; 442 393 head->arg.tail[0] = rqstp->rq_arg.tail[0]; 443 - head->arg.pages = &head->pages[head->count]; 444 394 head->hdr_count = head->count; 445 395 head->arg.page_base = 0; 446 396 head->arg.page_len = 0; 447 397 head->arg.len = rqstp->rq_arg.len; 448 398 head->arg.buflen = rqstp->rq_arg.buflen; 449 399 450 - /* Use FRMR if supported */ 451 - if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) 452 - reader = rdma_read_chunk_frmr; 453 - else 454 - reader = rdma_read_chunk_lcl; 400 + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; 401 + position = be32_to_cpu(ch->rc_position); 455 402 456 - page_no = 0; page_offset = 0; 457 - for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; 458 - ch->rc_discrim != 0; ch++) { 403 + /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ 404 + if (position == 0) { 405 + head->arg.pages = &head->pages[0]; 406 + page_offset = head->byte_len; 407 + } else { 408 + head->arg.pages = &head->pages[head->count]; 409 + page_offset = 0; 410 + } 459 411 412 + ret = 0; 413 + page_no = 0; 414 + for (; ch->rc_discrim != xdr_zero; ch++) { 415 + if (be32_to_cpu(ch->rc_position) != position) 416 + goto err; 417 + 418 + handle = be32_to_cpu(ch->rc_target.rs_handle), 419 + byte_count = be32_to_cpu(ch->rc_target.rs_length); 460 420 xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, 461 421 &rs_offset); 462 - byte_count = ntohl(ch->rc_target.rs_length); 463 422 464 423 while (byte_count > 0) { 465 - ret = reader(xprt, rqstp, head, 466 - &page_no, &page_offset, 467 - ntohl(ch->rc_target.rs_handle), 468 - byte_count, rs_offset, 469 - ((ch+1)->rc_discrim == 0) /* last */ 470 - ); 424 + last = (ch + 1)->rc_discrim == xdr_zero; 425 + ret = 
xprt->sc_reader(xprt, rqstp, head, 426 + &page_no, &page_offset, 427 + handle, byte_count, 428 + rs_offset, last); 471 429 if (ret < 0) 472 430 goto err; 473 431 byte_count -= ret; ··· 482 426 head->arg.buflen += ret; 483 427 } 484 428 } 429 + 430 + /* Read list may need XDR round-up (see RFC 5666, s. 3.7) */ 431 + if (page_offset & 3) { 432 + u32 pad = 4 - (page_offset & 3); 433 + 434 + head->arg.page_len += pad; 435 + head->arg.len += pad; 436 + head->arg.buflen += pad; 437 + page_offset += pad; 438 + } 439 + 485 440 ret = 1; 441 + if (position && position < head->arg.head[0].iov_len) 442 + ret = rdma_copy_tail(rqstp, head, position, 443 + byte_count, page_offset, page_no); 444 + head->arg.head[0].iov_len = position; 445 + head->position = position; 446 + 486 447 err: 487 448 /* Detach arg pages. svc_recv will replenish them */ 488 449 for (page_no = 0; ··· 509 436 return ret; 510 437 } 511 438 512 - /* 513 - * To avoid a separate RDMA READ just for a handful of zero bytes, 514 - * RFC 5666 section 3.7 allows the client to omit the XDR zero pad 515 - * in chunk lists. 
516 - */ 517 - static void 518 - rdma_fix_xdr_pad(struct xdr_buf *buf) 519 - { 520 - unsigned int page_len = buf->page_len; 521 - unsigned int size = (XDR_QUADLEN(page_len) << 2) - page_len; 522 - unsigned int offset, pg_no; 523 - char *p; 524 - 525 - if (size == 0) 526 - return; 527 - 528 - pg_no = page_len >> PAGE_SHIFT; 529 - offset = page_len & ~PAGE_MASK; 530 - p = page_address(buf->pages[pg_no]); 531 - memset(p + offset, 0, size); 532 - 533 - buf->page_len += size; 534 - buf->buflen += size; 535 - buf->len += size; 536 - } 537 - 538 439 static int rdma_read_complete(struct svc_rqst *rqstp, 539 440 struct svc_rdma_op_ctxt *head) 540 441 { 541 442 int page_no; 542 443 int ret; 543 444 544 - BUG_ON(!head); 545 - 546 445 /* Copy RPC pages */ 547 446 for (page_no = 0; page_no < head->count; page_no++) { 548 447 put_page(rqstp->rq_pages[page_no]); 549 448 rqstp->rq_pages[page_no] = head->pages[page_no]; 550 449 } 450 + 451 + /* Adjustments made for RDMA_NOMSG type requests */ 452 + if (head->position == 0) { 453 + if (head->arg.len <= head->sge[0].length) { 454 + head->arg.head[0].iov_len = head->arg.len - 455 + head->byte_len; 456 + head->arg.page_len = 0; 457 + } else { 458 + head->arg.head[0].iov_len = head->sge[0].length - 459 + head->byte_len; 460 + head->arg.page_len = head->arg.len - 461 + head->sge[0].length; 462 + } 463 + } 464 + 551 465 /* Point rq_arg.pages past header */ 552 - rdma_fix_xdr_pad(&head->arg); 553 466 rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; 554 467 rqstp->rq_arg.page_len = head->arg.page_len; 555 468 rqstp->rq_arg.page_base = head->arg.page_base; ··· 560 501 ret = rqstp->rq_arg.head[0].iov_len 561 502 + rqstp->rq_arg.page_len 562 503 + rqstp->rq_arg.tail[0].iov_len; 563 - dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, " 564 - "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", 504 + dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, " 505 + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n", 
565 506 ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, 566 507 rqstp->rq_arg.head[0].iov_len); 567 508 ··· 617 558 } 618 559 dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", 619 560 ctxt, rdma_xprt, rqstp, ctxt->wc_status); 620 - BUG_ON(ctxt->wc_status != IB_WC_SUCCESS); 621 561 atomic_inc(&rdma_stat_recv); 622 562 623 563 /* Build up the XDR from the receive buffers. */ ··· 649 591 + rqstp->rq_arg.tail[0].iov_len; 650 592 svc_rdma_put_context(ctxt, 0); 651 593 out: 652 - dprintk("svcrdma: ret = %d, rq_arg.len =%d, " 653 - "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", 594 + dprintk("svcrdma: ret=%d, rq_arg.len=%u, " 595 + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zd\n", 654 596 ret, rqstp->rq_arg.len, 655 597 rqstp->rq_arg.head[0].iov_base, 656 598 rqstp->rq_arg.head[0].iov_len);
+26 -20
net/sunrpc/xprtrdma/svc_rdma_sendto.c
··· 60 60 u32 page_off; 61 61 int page_no; 62 62 63 - BUG_ON(xdr->len != 64 - (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)); 63 + if (xdr->len != 64 + (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { 65 + pr_err("svcrdma: map_xdr: XDR buffer length error\n"); 66 + return -EIO; 67 + } 65 68 66 69 /* Skip the first sge, this is for the RPCRDMA header */ 67 70 sge_no = 1; ··· 153 150 int bc; 154 151 struct svc_rdma_op_ctxt *ctxt; 155 152 156 - BUG_ON(vec->count > RPCSVC_MAXPAGES); 153 + if (vec->count > RPCSVC_MAXPAGES) { 154 + pr_err("svcrdma: Too many pages (%lu)\n", vec->count); 155 + return -EIO; 156 + } 157 + 157 158 dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " 158 159 "write_len=%d, vec->sge=%p, vec->count=%lu\n", 159 160 rmr, (unsigned long long)to, xdr_off, ··· 197 190 sge_off = 0; 198 191 sge_no++; 199 192 xdr_sge_no++; 200 - BUG_ON(xdr_sge_no > vec->count); 193 + if (xdr_sge_no > vec->count) { 194 + pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no); 195 + goto err; 196 + } 201 197 bc -= sge_bytes; 202 198 if (sge_no == xprt->sc_max_sge) 203 199 break; ··· 431 421 ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; 432 422 ctxt->sge[sge_no].length = sge_bytes; 433 423 } 434 - BUG_ON(byte_count != 0); 424 + if (byte_count != 0) { 425 + pr_err("svcrdma: Could not map %d bytes\n", byte_count); 426 + goto err; 427 + } 435 428 436 429 /* Save all respages in the ctxt and remove them from the 437 430 * respages array. They are our pages until the I/O ··· 455 442 } 456 443 rqstp->rq_next_page = rqstp->rq_respages + 1; 457 444 458 - BUG_ON(sge_no > rdma->sc_max_sge); 445 + if (sge_no > rdma->sc_max_sge) { 446 + pr_err("svcrdma: Too many sges (%d)\n", sge_no); 447 + goto err; 448 + } 459 449 memset(&send_wr, 0, sizeof send_wr); 460 450 ctxt->wr_op = IB_WR_SEND; 461 451 send_wr.wr_id = (unsigned long)ctxt; ··· 483 467 { 484 468 } 485 469 486 - /* 487 - * Return the start of an xdr buffer. 
488 - */ 489 - static void *xdr_start(struct xdr_buf *xdr) 490 - { 491 - return xdr->head[0].iov_base - 492 - (xdr->len - 493 - xdr->page_len - 494 - xdr->tail[0].iov_len - 495 - xdr->head[0].iov_len); 496 - } 497 - 498 470 int svc_rdma_sendto(struct svc_rqst *rqstp) 499 471 { 500 472 struct svc_xprt *xprt = rqstp->rq_xprt; ··· 500 496 501 497 dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); 502 498 503 - /* Get the RDMA request header. */ 504 - rdma_argp = xdr_start(&rqstp->rq_arg); 499 + /* Get the RDMA request header. The receive logic always 500 + * places this at the start of page 0. 501 + */ 502 + rdma_argp = page_address(rqstp->rq_pages[0]); 505 503 506 504 /* Build an req vec for the XDR */ 507 505 ctxt = svc_rdma_get_context(rdma);
+29 -18
net/sunrpc/xprtrdma/svc_rdma_transport.c
··· 139 139 struct svcxprt_rdma *xprt; 140 140 int i; 141 141 142 - BUG_ON(!ctxt); 143 142 xprt = ctxt->xprt; 144 143 if (free_pages) 145 144 for (i = 0; i < ctxt->count; i++) ··· 338 339 339 340 switch (ctxt->wr_op) { 340 341 case IB_WR_SEND: 341 - BUG_ON(ctxt->frmr); 342 + if (ctxt->frmr) 343 + pr_err("svcrdma: SEND: ctxt->frmr != NULL\n"); 342 344 svc_rdma_put_context(ctxt, 1); 343 345 break; 344 346 345 347 case IB_WR_RDMA_WRITE: 346 - BUG_ON(ctxt->frmr); 348 + if (ctxt->frmr) 349 + pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n"); 347 350 svc_rdma_put_context(ctxt, 0); 348 351 break; 349 352 ··· 354 353 svc_rdma_put_frmr(xprt, ctxt->frmr); 355 354 if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { 356 355 struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; 357 - BUG_ON(!read_hdr); 358 - spin_lock_bh(&xprt->sc_rq_dto_lock); 359 - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 360 - list_add_tail(&read_hdr->dto_q, 361 - &xprt->sc_read_complete_q); 362 - spin_unlock_bh(&xprt->sc_rq_dto_lock); 356 + if (read_hdr) { 357 + spin_lock_bh(&xprt->sc_rq_dto_lock); 358 + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); 359 + list_add_tail(&read_hdr->dto_q, 360 + &xprt->sc_read_complete_q); 361 + spin_unlock_bh(&xprt->sc_rq_dto_lock); 362 + } else { 363 + pr_err("svcrdma: ctxt->read_hdr == NULL\n"); 364 + } 363 365 svc_xprt_enqueue(&xprt->sc_xprt); 364 366 } 365 367 svc_rdma_put_context(ctxt, 0); 366 368 break; 367 369 368 370 default: 369 - BUG_ON(1); 370 371 printk(KERN_ERR "svcrdma: unexpected completion type, " 371 372 "opcode=%d\n", 372 373 ctxt->wr_op); ··· 516 513 buflen = 0; 517 514 ctxt->direction = DMA_FROM_DEVICE; 518 515 for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { 519 - BUG_ON(sge_no >= xprt->sc_max_sge); 516 + if (sge_no >= xprt->sc_max_sge) { 517 + pr_err("svcrdma: Too many sges (%d)\n", sge_no); 518 + goto err_put_ctxt; 519 + } 520 520 page = svc_rdma_get_page(); 521 521 ctxt->pages[sge_no] = page; 522 522 pa = 
ib_dma_map_page(xprt->sc_cm_id->device, ··· 693 687 { 694 688 struct rdma_cm_id *listen_id; 695 689 struct svcxprt_rdma *cma_xprt; 696 - struct svc_xprt *xprt; 697 690 int ret; 698 691 699 692 dprintk("svcrdma: Creating RDMA socket\n"); ··· 703 698 cma_xprt = rdma_create_xprt(serv, 1); 704 699 if (!cma_xprt) 705 700 return ERR_PTR(-ENOMEM); 706 - xprt = &cma_xprt->sc_xprt; 707 701 708 702 listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP, 709 703 IB_QPT_RC); ··· 826 822 if (frmr) { 827 823 frmr_unmap_dma(rdma, frmr); 828 824 spin_lock_bh(&rdma->sc_frmr_q_lock); 829 - BUG_ON(!list_empty(&frmr->frmr_list)); 825 + WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); 830 826 list_add(&frmr->frmr_list, &rdma->sc_frmr_q); 831 827 spin_unlock_bh(&rdma->sc_frmr_q_lock); 832 828 } ··· 974 970 * NB: iWARP requires remote write access for the data sink 975 971 * of an RDMA_READ. IB does not. 976 972 */ 973 + newxprt->sc_reader = rdma_read_chunk_lcl; 977 974 if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { 978 975 newxprt->sc_frmr_pg_list_len = 979 976 devattr.max_fast_reg_page_list_len; 980 977 newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; 978 + newxprt->sc_reader = rdma_read_chunk_frmr; 981 979 } 982 980 983 981 /* ··· 1131 1125 dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); 1132 1126 1133 1127 /* We should only be called from kref_put */ 1134 - BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); 1128 + if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0) 1129 + pr_err("svcrdma: sc_xprt still in use? (%d)\n", 1130 + atomic_read(&rdma->sc_xprt.xpt_ref.refcount)); 1135 1131 1136 1132 /* 1137 1133 * Destroy queued, but not processed read completions. 
Note ··· 1161 1153 } 1162 1154 1163 1155 /* Warn if we leaked a resource or under-referenced */ 1164 - WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); 1165 - WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); 1156 + if (atomic_read(&rdma->sc_ctxt_used) != 0) 1157 + pr_err("svcrdma: ctxt still in use? (%d)\n", 1158 + atomic_read(&rdma->sc_ctxt_used)); 1159 + if (atomic_read(&rdma->sc_dma_used) != 0) 1160 + pr_err("svcrdma: dma still in use? (%d)\n", 1161 + atomic_read(&rdma->sc_dma_used)); 1166 1162 1167 1163 /* De-allocate fastreg mr */ 1168 1164 rdma_dealloc_frmr_q(rdma); ··· 1266 1254 if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) 1267 1255 return -ENOTCONN; 1268 1256 1269 - BUG_ON(wr->send_flags != IB_SEND_SIGNALED); 1270 1257 wr_count = 1; 1271 1258 for (n_wr = wr->next; n_wr; n_wr = n_wr->next) 1272 1259 wr_count++;