Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs

Pull NFS client updates from Trond Myklebust:
"Highlights include:

Stable bugfixes:
- nfs: don't create zero-length requests

- several LAYOUTGET bugfixes

Features:
- several performance related features

- more aggressive caching when we can rely on close-to-open
cache consistency

- remove serialisation of O_DIRECT reads and writes

- optimise several code paths to not flush to disk unnecessarily.

However allow for the idiosyncrasies of pNFS for those layout
types that need to issue a LAYOUTCOMMIT before the metadata can
be updated on the server.

- SUNRPC updates to the client data receive path

- pNFS/SCSI support RH/Fedora dm-mpath device nodes

- pNFS files/flexfiles can now use unprivileged ports when
the generic NFS mount options allow it.

Bugfixes:
- Don't use RDMA direct data placement together with data
integrity or privacy security flavours

- Remove the RDMA ALLPHYSICAL memory registration mode as
it has potential security holes.

- Several layout recall fixes to improve NFSv4.1 protocol
compliance.

- Fix an Oops in the pNFS files and flexfiles connection
setup to the DS

- Allow retry of operations that used a returned delegation
stateid

- Don't mark the inode as revalidated if a LAYOUTCOMMIT is
outstanding

- Fix writeback races in nfs4_copy_range() and
nfs42_proc_deallocate()"

* tag 'nfs-for-4.8-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (104 commits)
pNFS: Actively set attributes as invalid if LAYOUTCOMMIT is outstanding
NFSv4: Clean up lookup of SECINFO_NO_NAME
NFSv4.2: Fix warning "variable ‘stateids’ set but not used"
NFSv4: Fix warning "no previous prototype for ‘nfs4_listxattr’"
SUNRPC: Fix a compiler warning in fs/nfs/clnt.c
pNFS: Remove redundant smp_mb() from pnfs_init_lseg()
pNFS: Cleanup - do layout segment initialisation in one place
pNFS: Remove redundant stateid invalidation
pNFS: Remove redundant pnfs_mark_layout_returned_if_empty()
pNFS: Clear the layout metadata if the server changed the layout stateid
pNFS: Cleanup - don't open code pnfs_mark_layout_stateid_invalid()
NFS: pnfs_mark_matching_lsegs_return() should match the layout sequence id
pNFS: Do not set plh_return_seq for non-callback related layoutreturns
pNFS: Ensure layoutreturn acts as a completion for layout callbacks
pNFS: Fix CB_LAYOUTRECALL stateid verification
pNFS: Always update the layout barrier seqid on LAYOUTGET
pNFS: Always update the layout stateid if NFS_LAYOUT_INVALID_STID is set
pNFS: Clear the layout return tracking on layout reinitialisation
pNFS: LAYOUTRETURN should only update the stateid if the layout is valid
nfs: don't create zero-length requests
...

+1784 -1536
+1 -1
fs/nfs/Makefile
··· 6 6 7 7 CFLAGS_nfstrace.o += -I$(src) 8 8 nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ 9 - direct.o pagelist.o read.o symlink.o unlink.o \ 9 + io.o direct.o pagelist.o read.o symlink.o unlink.o \ 10 10 write.o namespace.o mount_clnt.o nfstrace.o 11 11 nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 12 12 nfs-$(CONFIG_SYSCTL) += sysctl.o
+75 -35
fs/nfs/blocklayout/dev.c
··· 65 65 if (!p) 66 66 return -EIO; 67 67 b->simple.nr_sigs = be32_to_cpup(p++); 68 - if (!b->simple.nr_sigs) { 69 - dprintk("no signature\n"); 68 + if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) { 69 + dprintk("Bad signature count: %d\n", b->simple.nr_sigs); 70 70 return -EIO; 71 71 } 72 72 ··· 89 89 memcpy(&b->simple.sigs[i].sig, p, 90 90 b->simple.sigs[i].sig_len); 91 91 92 - b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; 92 + b->simple.len += 8 + 4 + \ 93 + (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2); 93 94 } 94 95 break; 95 96 case PNFS_BLOCK_VOLUME_SLICE: ··· 105 104 p = xdr_inline_decode(xdr, 4); 106 105 if (!p) 107 106 return -EIO; 107 + 108 108 b->concat.volumes_count = be32_to_cpup(p++); 109 + if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) { 110 + dprintk("Too many volumes: %d\n", b->concat.volumes_count); 111 + return -EIO; 112 + } 109 113 110 114 p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); 111 115 if (!p) ··· 122 116 p = xdr_inline_decode(xdr, 8 + 4); 123 117 if (!p) 124 118 return -EIO; 119 + 125 120 p = xdr_decode_hyper(p, &b->stripe.chunk_size); 126 121 b->stripe.volumes_count = be32_to_cpup(p++); 122 + if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) { 123 + dprintk("Too many volumes: %d\n", b->stripe.volumes_count); 124 + return -EIO; 125 + } 127 126 128 127 p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); 129 128 if (!p) ··· 235 224 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) 236 225 { 237 226 struct pnfs_block_volume *v = &volumes[idx]; 227 + struct block_device *bdev; 238 228 dev_t dev; 239 229 240 230 dev = bl_resolve_deviceid(server, v, gfp_mask); 241 231 if (!dev) 242 232 return -EIO; 243 233 244 - d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); 245 - if (IS_ERR(d->bdev)) { 234 + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); 235 + if (IS_ERR(bdev)) { 246 236 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", 247 - 
MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); 248 - return PTR_ERR(d->bdev); 237 + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); 238 + return PTR_ERR(bdev); 249 239 } 240 + d->bdev = bdev; 250 241 251 242 252 243 d->len = i_size_read(d->bdev->bd_inode); ··· 300 287 } 301 288 } 302 289 290 + /* 291 + * Try to open the udev path for the WWN. At least on Debian the udev 292 + * by-id path will always point to the dm-multipath device if one exists. 293 + */ 294 + static struct block_device * 295 + bl_open_udev_path(struct pnfs_block_volume *v) 296 + { 297 + struct block_device *bdev; 298 + const char *devname; 299 + 300 + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", 301 + v->scsi.designator_len, v->scsi.designator); 302 + if (!devname) 303 + return ERR_PTR(-ENOMEM); 304 + 305 + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); 306 + if (IS_ERR(bdev)) { 307 + pr_warn("pNFS: failed to open device %s (%ld)\n", 308 + devname, PTR_ERR(bdev)); 309 + } 310 + 311 + kfree(devname); 312 + return bdev; 313 + } 314 + 315 + /* 316 + * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the 317 + * wwn- links will only point to the first discovered SCSI device there. 
318 + */ 319 + static struct block_device * 320 + bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) 321 + { 322 + struct block_device *bdev; 323 + const char *devname; 324 + 325 + devname = kasprintf(GFP_KERNEL, 326 + "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", 327 + v->scsi.designator_type, 328 + v->scsi.designator_len, v->scsi.designator); 329 + if (!devname) 330 + return ERR_PTR(-ENOMEM); 331 + 332 + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); 333 + kfree(devname); 334 + return bdev; 335 + } 336 + 303 337 static int 304 338 bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, 305 339 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) 306 340 { 307 341 struct pnfs_block_volume *v = &volumes[idx]; 342 + struct block_device *bdev; 308 343 const struct pr_ops *ops; 309 - const char *devname; 310 344 int error; 311 345 312 346 if (!bl_validate_designator(v)) 313 347 return -EINVAL; 314 348 315 - switch (v->scsi.designator_len) { 316 - case 8: 317 - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN", 318 - v->scsi.designator); 319 - break; 320 - case 12: 321 - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN", 322 - v->scsi.designator); 323 - break; 324 - case 16: 325 - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN", 326 - v->scsi.designator); 327 - break; 328 - default: 329 - return -EINVAL; 330 - } 331 - 332 - d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL); 333 - if (IS_ERR(d->bdev)) { 334 - pr_warn("pNFS: failed to open device %s (%ld)\n", 335 - devname, PTR_ERR(d->bdev)); 336 - kfree(devname); 337 - return PTR_ERR(d->bdev); 338 - } 339 - 340 - kfree(devname); 349 + bdev = bl_open_dm_mpath_udev_path(v); 350 + if (IS_ERR(bdev)) 351 + bdev = bl_open_udev_path(v); 352 + if (IS_ERR(bdev)) 353 + return PTR_ERR(bdev); 354 + d->bdev = bdev; 341 355 342 356 d->len = i_size_read(d->bdev->bd_inode); 343 357 d->map = bl_map_simple; ··· 392 352 return 0; 393 353 394 354 
out_blkdev_put: 395 - blkdev_put(d->bdev, FMODE_READ); 355 + blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); 396 356 return error; 397 357 } 398 358
+21 -6
fs/nfs/blocklayout/extent_tree.c
··· 121 121 return be; 122 122 } 123 123 124 + static void __ext_put_deviceids(struct list_head *head) 125 + { 126 + struct pnfs_block_extent *be, *tmp; 127 + 128 + list_for_each_entry_safe(be, tmp, head, be_list) { 129 + nfs4_put_deviceid_node(be->be_device); 130 + kfree(be); 131 + } 132 + } 133 + 124 134 static void 125 135 __ext_tree_insert(struct rb_root *root, 126 136 struct pnfs_block_extent *new, bool merge_ok) ··· 173 163 } 174 164 175 165 static int 176 - __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) 166 + __ext_tree_remove(struct rb_root *root, 167 + sector_t start, sector_t end, struct list_head *tmp) 177 168 { 178 169 struct pnfs_block_extent *be; 179 170 sector_t len1 = 0, len2 = 0; ··· 234 223 struct pnfs_block_extent *next = ext_tree_next(be); 235 224 236 225 rb_erase(&be->be_node, root); 237 - nfs4_put_deviceid_node(be->be_device); 238 - kfree(be); 226 + list_add_tail(&be->be_list, tmp); 239 227 be = next; 240 228 } 241 229 ··· 360 350 sector_t start, sector_t end) 361 351 { 362 352 int err, err2; 353 + LIST_HEAD(tmp); 363 354 364 355 spin_lock(&bl->bl_ext_lock); 365 - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); 356 + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); 366 357 if (rw) { 367 - err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); 358 + err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp); 368 359 if (!err) 369 360 err = err2; 370 361 } 371 362 spin_unlock(&bl->bl_ext_lock); 372 363 364 + __ext_put_deviceids(&tmp); 373 365 return err; 374 366 } 375 367 ··· 408 396 sector_t end = start + len; 409 397 struct pnfs_block_extent *be; 410 398 int err = 0; 399 + LIST_HEAD(tmp); 411 400 412 401 spin_lock(&bl->bl_ext_lock); 413 402 /* 414 403 * First remove all COW extents or holes from written to range. 
415 404 */ 416 - err = __ext_tree_remove(&bl->bl_ext_ro, start, end); 405 + err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); 417 406 if (err) 418 407 goto out; 419 408 ··· 472 459 } 473 460 out: 474 461 spin_unlock(&bl->bl_ext_lock); 462 + 463 + __ext_put_deviceids(&tmp); 475 464 return err; 476 465 } 477 466
+47 -23
fs/nfs/callback_proc.c
··· 119 119 * hashed by filehandle. 120 120 */ 121 121 static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, 122 - struct nfs_fh *fh, nfs4_stateid *stateid) 122 + struct nfs_fh *fh) 123 123 { 124 124 struct nfs_server *server; 125 + struct nfs_inode *nfsi; 125 126 struct inode *ino; 126 127 struct pnfs_layout_hdr *lo; 127 128 129 + restart: 128 130 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 129 131 list_for_each_entry(lo, &server->layouts, plh_layouts) { 130 - if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid)) 132 + nfsi = NFS_I(lo->plh_inode); 133 + if (nfs_compare_fh(fh, &nfsi->fh)) 131 134 continue; 132 - if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh)) 135 + if (nfsi->layout != lo) 133 136 continue; 134 137 ino = igrab(lo->plh_inode); 135 138 if (!ino) 136 139 break; 137 140 spin_lock(&ino->i_lock); 138 141 /* Is this layout in the process of being freed? */ 139 - if (NFS_I(ino)->layout != lo) { 142 + if (nfsi->layout != lo) { 140 143 spin_unlock(&ino->i_lock); 141 144 iput(ino); 142 - break; 145 + goto restart; 143 146 } 144 147 pnfs_get_layout_hdr(lo); 145 148 spin_unlock(&ino->i_lock); ··· 154 151 } 155 152 156 153 static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, 157 - struct nfs_fh *fh, nfs4_stateid *stateid) 154 + struct nfs_fh *fh) 158 155 { 159 156 struct pnfs_layout_hdr *lo; 160 157 161 158 spin_lock(&clp->cl_lock); 162 159 rcu_read_lock(); 163 - lo = get_layout_by_fh_locked(clp, fh, stateid); 160 + lo = get_layout_by_fh_locked(clp, fh); 164 161 rcu_read_unlock(); 165 162 spin_unlock(&clp->cl_lock); 166 163 ··· 170 167 /* 171 168 * Enforce RFC5661 section 12.5.5.2.1. 
(Layout Recall and Return Sequencing) 172 169 */ 173 - static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, 170 + static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo, 174 171 const nfs4_stateid *new) 175 172 { 176 173 u32 oldseq, newseq; 177 174 178 - oldseq = be32_to_cpu(lo->plh_stateid.seqid); 179 - newseq = be32_to_cpu(new->seqid); 175 + /* Is the stateid still not initialised? */ 176 + if (!pnfs_layout_is_valid(lo)) 177 + return NFS4ERR_DELAY; 180 178 179 + /* Mismatched stateid? */ 180 + if (!nfs4_stateid_match_other(&lo->plh_stateid, new)) 181 + return NFS4ERR_BAD_STATEID; 182 + 183 + newseq = be32_to_cpu(new->seqid); 184 + /* Are we already in a layout recall situation? */ 185 + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && 186 + lo->plh_return_seq != 0) { 187 + if (newseq < lo->plh_return_seq) 188 + return NFS4ERR_OLD_STATEID; 189 + if (newseq > lo->plh_return_seq) 190 + return NFS4ERR_DELAY; 191 + goto out; 192 + } 193 + 194 + /* Check that the stateid matches what we think it should be. */ 195 + oldseq = be32_to_cpu(lo->plh_stateid.seqid); 181 196 if (newseq > oldseq + 1) 182 - return false; 183 - return true; 197 + return NFS4ERR_DELAY; 198 + /* Crazy server! 
*/ 199 + if (newseq <= oldseq) 200 + return NFS4ERR_OLD_STATEID; 201 + out: 202 + return NFS_OK; 184 203 } 185 204 186 205 static u32 initiate_file_draining(struct nfs_client *clp, ··· 213 188 u32 rv = NFS4ERR_NOMATCHING_LAYOUT; 214 189 LIST_HEAD(free_me_list); 215 190 216 - lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); 191 + lo = get_layout_by_fh(clp, &args->cbl_fh); 217 192 if (!lo) { 218 193 trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, 219 194 &args->cbl_stateid, -rv); ··· 221 196 } 222 197 223 198 ino = lo->plh_inode; 224 - 225 - spin_lock(&ino->i_lock); 226 - if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { 227 - rv = NFS4ERR_DELAY; 228 - goto unlock; 229 - } 230 - pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); 231 - spin_unlock(&ino->i_lock); 232 - 233 199 pnfs_layoutcommit_inode(ino, false); 234 200 201 + 235 202 spin_lock(&ino->i_lock); 203 + rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); 204 + if (rv != NFS_OK) 205 + goto unlock; 206 + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); 207 + 236 208 /* 237 209 * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) 238 210 */ ··· 245 223 goto unlock; 246 224 } 247 225 226 + /* Embrace your forgetfulness! */ 227 + rv = NFS4ERR_NOMATCHING_LAYOUT; 228 + 248 229 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { 249 230 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, 250 231 &args->cbl_range); 251 232 } 252 - pnfs_mark_layout_returned_if_empty(lo); 253 233 unlock: 254 234 spin_unlock(&ino->i_lock); 255 235 pnfs_free_lseg_list(&free_me_list);
+5 -1
fs/nfs/callback_xdr.c
··· 925 925 if (hdr_arg.minorversion == 0) { 926 926 cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident); 927 927 if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp)) 928 - return rpc_drop_reply; 928 + goto out_invalidcred; 929 929 } 930 930 931 931 cps.minorversion = hdr_arg.minorversion; ··· 953 953 nfs_put_client(cps.clp); 954 954 dprintk("%s: done, status = %u\n", __func__, ntohl(status)); 955 955 return rpc_success; 956 + 957 + out_invalidcred: 958 + pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); 959 + return rpc_autherr_badcred; 956 960 } 957 961 958 962 /*
+10 -12
fs/nfs/client.c
··· 367 367 */ 368 368 struct nfs_client * 369 369 nfs_get_client(const struct nfs_client_initdata *cl_init, 370 - const struct rpc_timeout *timeparms, 371 - const char *ip_addr, 372 370 rpc_authflavor_t authflavour) 373 371 { 374 372 struct nfs_client *clp, *new = NULL; ··· 397 399 &nn->nfs_client_list); 398 400 spin_unlock(&nn->nfs_client_lock); 399 401 new->cl_flags = cl_init->init_flags; 400 - return rpc_ops->init_client(new, timeparms, ip_addr); 402 + return rpc_ops->init_client(new, cl_init); 401 403 } 402 404 403 405 spin_unlock(&nn->nfs_client_lock); ··· 468 470 * Create an RPC client handle 469 471 */ 470 472 int nfs_create_rpc_client(struct nfs_client *clp, 471 - const struct rpc_timeout *timeparms, 473 + const struct nfs_client_initdata *cl_init, 472 474 rpc_authflavor_t flavor) 473 475 { 474 476 struct rpc_clnt *clnt = NULL; ··· 477 479 .protocol = clp->cl_proto, 478 480 .address = (struct sockaddr *)&clp->cl_addr, 479 481 .addrsize = clp->cl_addrlen, 480 - .timeout = timeparms, 482 + .timeout = cl_init->timeparms, 481 483 .servername = clp->cl_hostname, 484 + .nodename = cl_init->nodename, 482 485 .program = &nfs_program, 483 486 .version = clp->rpc_ops->version, 484 487 .authflavor = flavor, ··· 590 591 * nfs_init_client - Initialise an NFS2 or NFS3 client 591 592 * 592 593 * @clp: nfs_client to initialise 593 - * @timeparms: timeout parameters for underlying RPC transport 594 - * @ip_addr: IP presentation address (not used) 594 + * @cl_init: Initialisation parameters 595 595 * 596 596 * Returns pointer to an NFS client, or an ERR_PTR value. 
597 597 */ 598 598 struct nfs_client *nfs_init_client(struct nfs_client *clp, 599 - const struct rpc_timeout *timeparms, 600 - const char *ip_addr) 599 + const struct nfs_client_initdata *cl_init) 601 600 { 602 601 int error; 603 602 ··· 609 612 * Create a client RPC handle for doing FSSTAT with UNIX auth only 610 613 * - RFC 2623, sec 2.3.2 611 614 */ 612 - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); 615 + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); 613 616 if (error < 0) 614 617 goto error; 615 618 nfs_mark_client_ready(clp, NFS_CS_READY); ··· 630 633 const struct nfs_parsed_mount_data *data, 631 634 struct nfs_subversion *nfs_mod) 632 635 { 636 + struct rpc_timeout timeparms; 633 637 struct nfs_client_initdata cl_init = { 634 638 .hostname = data->nfs_server.hostname, 635 639 .addr = (const struct sockaddr *)&data->nfs_server.address, ··· 638 640 .nfs_mod = nfs_mod, 639 641 .proto = data->nfs_server.protocol, 640 642 .net = data->net, 643 + .timeparms = &timeparms, 641 644 }; 642 - struct rpc_timeout timeparms; 643 645 struct nfs_client *clp; 644 646 int error; 645 647 ··· 651 653 set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 652 654 653 655 /* Allocate or find a client reference we can use */ 654 - clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX); 656 + clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); 655 657 if (IS_ERR(clp)) { 656 658 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 657 659 return PTR_ERR(clp);
+31 -21
fs/nfs/dir.c
··· 2252 2252 return NULL; 2253 2253 } 2254 2254 2255 - static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) 2255 + static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block) 2256 2256 { 2257 2257 struct nfs_inode *nfsi = NFS_I(inode); 2258 2258 struct nfs_access_entry *cache; 2259 - int err = -ENOENT; 2259 + bool retry = true; 2260 + int err; 2260 2261 2261 2262 spin_lock(&inode->i_lock); 2262 - if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) 2263 - goto out_zap; 2264 - cache = nfs_access_search_rbtree(inode, cred); 2265 - if (cache == NULL) 2266 - goto out; 2267 - if (!nfs_have_delegated_attributes(inode) && 2268 - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 2269 - goto out_stale; 2263 + for(;;) { 2264 + if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS) 2265 + goto out_zap; 2266 + cache = nfs_access_search_rbtree(inode, cred); 2267 + err = -ENOENT; 2268 + if (cache == NULL) 2269 + goto out; 2270 + /* Found an entry, is our attribute cache valid? 
*/ 2271 + if (!nfs_attribute_cache_expired(inode) && 2272 + !(nfsi->cache_validity & NFS_INO_INVALID_ATTR)) 2273 + break; 2274 + err = -ECHILD; 2275 + if (!may_block) 2276 + goto out; 2277 + if (!retry) 2278 + goto out_zap; 2279 + spin_unlock(&inode->i_lock); 2280 + err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); 2281 + if (err) 2282 + return err; 2283 + spin_lock(&inode->i_lock); 2284 + retry = false; 2285 + } 2270 2286 res->jiffies = cache->jiffies; 2271 2287 res->cred = cache->cred; 2272 2288 res->mask = cache->mask; ··· 2291 2275 out: 2292 2276 spin_unlock(&inode->i_lock); 2293 2277 return err; 2294 - out_stale: 2295 - rb_erase(&cache->rb_node, &nfsi->access_cache); 2296 - list_del(&cache->lru); 2297 - spin_unlock(&inode->i_lock); 2298 - nfs_access_free_entry(cache); 2299 - return -ENOENT; 2300 2278 out_zap: 2301 2279 spin_unlock(&inode->i_lock); 2302 2280 nfs_access_zap_cache(inode); ··· 2317 2307 cache = NULL; 2318 2308 if (cache == NULL) 2319 2309 goto out; 2320 - if (!nfs_have_delegated_attributes(inode) && 2321 - !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 2310 + err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode); 2311 + if (err) 2322 2312 goto out; 2323 2313 res->jiffies = cache->jiffies; 2324 2314 res->cred = cache->cred; 2325 2315 res->mask = cache->mask; 2326 - err = 0; 2327 2316 out: 2328 2317 rcu_read_unlock(); 2329 2318 return err; ··· 2411 2402 static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) 2412 2403 { 2413 2404 struct nfs_access_entry cache; 2405 + bool may_block = (mask & MAY_NOT_BLOCK) == 0; 2414 2406 int status; 2415 2407 2416 2408 trace_nfs_access_enter(inode); 2417 2409 2418 2410 status = nfs_access_get_cached_rcu(inode, cred, &cache); 2419 2411 if (status != 0) 2420 - status = nfs_access_get_cached(inode, cred, &cache); 2412 + status = nfs_access_get_cached(inode, cred, &cache, may_block); 2421 2413 if (status == 0) 2422 2414 goto out_cached; 2423 2415 2424 
2416 status = -ECHILD; 2425 - if (mask & MAY_NOT_BLOCK) 2417 + if (!may_block) 2426 2418 goto out; 2427 2419 2428 2420 /* Be clever: ask server to check for all possible rights */
+32 -61
fs/nfs/direct.c
··· 196 196 WARN_ON_ONCE(verfp->committed < 0); 197 197 } 198 198 199 + static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, 200 + const struct nfs_writeverf *v2) 201 + { 202 + return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); 203 + } 204 + 199 205 /* 200 206 * nfs_direct_cmp_hdr_verf - compare verifier for pgio header 201 207 * @dreq - direct request possibly spanning multiple servers ··· 221 215 nfs_direct_set_hdr_verf(dreq, hdr); 222 216 return 0; 223 217 } 224 - return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 218 + return nfs_direct_cmp_verf(verfp, &hdr->verf); 225 219 } 226 220 227 221 /* ··· 244 238 if (verfp->committed < 0) 245 239 return 1; 246 240 247 - return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 241 + return nfs_direct_cmp_verf(verfp, &data->verf); 248 242 } 249 243 250 244 /** ··· 372 366 * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust 373 367 * the iocb is still valid here if this is a synchronous request. 
374 368 */ 375 - static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) 369 + static void nfs_direct_complete(struct nfs_direct_req *dreq) 376 370 { 377 371 struct inode *inode = dreq->inode; 378 - 379 - if (dreq->iocb && write) { 380 - loff_t pos = dreq->iocb->ki_pos + dreq->count; 381 - 382 - spin_lock(&inode->i_lock); 383 - if (i_size_read(inode) < pos) 384 - i_size_write(inode, pos); 385 - spin_unlock(&inode->i_lock); 386 - } 387 - 388 - if (write) 389 - nfs_zap_mapping(inode, inode->i_mapping); 390 372 391 373 inode_dio_end(inode); 392 374 ··· 430 436 } 431 437 out_put: 432 438 if (put_dreq(dreq)) 433 - nfs_direct_complete(dreq, false); 439 + nfs_direct_complete(dreq); 434 440 hdr->release(hdr); 435 441 } 436 442 ··· 536 542 } 537 543 538 544 if (put_dreq(dreq)) 539 - nfs_direct_complete(dreq, false); 545 + nfs_direct_complete(dreq); 540 546 return 0; 541 547 } 542 548 ··· 577 583 if (!count) 578 584 goto out; 579 585 580 - inode_lock(inode); 581 - result = nfs_sync_mapping(mapping); 582 - if (result) 583 - goto out_unlock; 584 - 585 586 task_io_account_read(count); 586 587 587 588 result = -ENOMEM; 588 589 dreq = nfs_direct_req_alloc(); 589 590 if (dreq == NULL) 590 - goto out_unlock; 591 + goto out; 591 592 592 593 dreq->inode = inode; 593 594 dreq->bytes_left = dreq->max_count = count; ··· 597 608 if (!is_sync_kiocb(iocb)) 598 609 dreq->iocb = iocb; 599 610 611 + nfs_start_io_direct(inode); 612 + 600 613 NFS_I(inode)->read_io += count; 601 614 result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); 602 615 603 - inode_unlock(inode); 616 + nfs_end_io_direct(inode); 604 617 605 618 if (!result) { 606 619 result = nfs_direct_wait(dreq); ··· 610 619 iocb->ki_pos += result; 611 620 } 612 621 613 - nfs_direct_req_release(dreq); 614 - return result; 615 - 616 622 out_release: 617 623 nfs_direct_req_release(dreq); 618 - out_unlock: 619 - inode_unlock(inode); 620 624 out: 621 625 return result; 622 626 } ··· 643 657 
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); 644 658 645 659 dreq->count = 0; 660 + dreq->verf.committed = NFS_INVALID_STABLE_HOW; 661 + nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); 646 662 for (i = 0; i < dreq->mirror_count; i++) 647 663 dreq->mirrors[i].count = 0; 648 664 get_dreq(dreq); ··· 763 775 nfs_direct_write_reschedule(dreq); 764 776 break; 765 777 default: 766 - nfs_direct_complete(dreq, true); 778 + nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); 779 + nfs_direct_complete(dreq); 767 780 } 768 781 } 769 782 ··· 980 991 ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) 981 992 { 982 993 ssize_t result = -EINVAL; 994 + size_t count; 983 995 struct file *file = iocb->ki_filp; 984 996 struct address_space *mapping = file->f_mapping; 985 997 struct inode *inode = mapping->host; ··· 991 1001 dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", 992 1002 file, iov_iter_count(iter), (long long) iocb->ki_pos); 993 1003 994 - nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, 995 - iov_iter_count(iter)); 1004 + result = generic_write_checks(iocb, iter); 1005 + if (result <= 0) 1006 + return result; 1007 + count = result; 1008 + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); 996 1009 997 1010 pos = iocb->ki_pos; 998 1011 end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; 999 1012 1000 - inode_lock(inode); 1001 - 1002 - result = nfs_sync_mapping(mapping); 1003 - if (result) 1004 - goto out_unlock; 1005 - 1006 - if (mapping->nrpages) { 1007 - result = invalidate_inode_pages2_range(mapping, 1008 - pos >> PAGE_SHIFT, end); 1009 - if (result) 1010 - goto out_unlock; 1011 - } 1012 - 1013 - task_io_account_write(iov_iter_count(iter)); 1013 + task_io_account_write(count); 1014 1014 1015 1015 result = -ENOMEM; 1016 1016 dreq = nfs_direct_req_alloc(); 1017 1017 if (!dreq) 1018 - goto out_unlock; 1018 + goto out; 1019 1019 1020 1020 dreq->inode = inode; 1021 - dreq->bytes_left = dreq->max_count = 
iov_iter_count(iter); 1021 + dreq->bytes_left = dreq->max_count = count; 1022 1022 dreq->io_start = pos; 1023 1023 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 1024 1024 l_ctx = nfs_get_lock_context(dreq->ctx); ··· 1020 1040 if (!is_sync_kiocb(iocb)) 1021 1041 dreq->iocb = iocb; 1022 1042 1043 + nfs_start_io_direct(inode); 1044 + 1023 1045 result = nfs_direct_write_schedule_iovec(dreq, iter, pos); 1024 1046 1025 1047 if (mapping->nrpages) { ··· 1029 1047 pos >> PAGE_SHIFT, end); 1030 1048 } 1031 1049 1032 - inode_unlock(inode); 1050 + nfs_end_io_direct(inode); 1033 1051 1034 1052 if (!result) { 1035 1053 result = nfs_direct_wait(dreq); 1036 1054 if (result > 0) { 1037 - struct inode *inode = mapping->host; 1038 - 1039 1055 iocb->ki_pos = pos + result; 1040 - spin_lock(&inode->i_lock); 1041 - if (i_size_read(inode) < iocb->ki_pos) 1042 - i_size_write(inode, iocb->ki_pos); 1043 - spin_unlock(&inode->i_lock); 1044 - 1045 1056 /* XXX: should check the generic_write_sync retval */ 1046 1057 generic_write_sync(iocb, result); 1047 1058 } 1048 1059 } 1049 - nfs_direct_req_release(dreq); 1050 - return result; 1051 - 1052 1060 out_release: 1053 1061 nfs_direct_req_release(dreq); 1054 - out_unlock: 1055 - inode_unlock(inode); 1062 + out: 1056 1063 return result; 1057 1064 } 1058 1065
+29 -71
fs/nfs/file.c
··· 170 170 iocb->ki_filp, 171 171 iov_iter_count(to), (unsigned long) iocb->ki_pos); 172 172 173 - result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); 173 + nfs_start_io_read(inode); 174 + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); 174 175 if (!result) { 175 176 result = generic_file_read_iter(iocb, to); 176 177 if (result > 0) 177 178 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); 178 179 } 180 + nfs_end_io_read(inode); 179 181 return result; 180 182 } 181 183 EXPORT_SYMBOL_GPL(nfs_file_read); ··· 193 191 dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", 194 192 filp, (unsigned long) count, (unsigned long long) *ppos); 195 193 196 - res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); 194 + nfs_start_io_read(inode); 195 + res = nfs_revalidate_mapping(inode, filp->f_mapping); 197 196 if (!res) { 198 197 res = generic_file_splice_read(filp, ppos, pipe, count, flags); 199 198 if (res > 0) 200 199 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); 201 200 } 201 + nfs_end_io_read(inode); 202 202 return res; 203 203 } 204 204 EXPORT_SYMBOL_GPL(nfs_file_splice_read); ··· 276 272 277 273 trace_nfs_fsync_enter(inode); 278 274 279 - inode_dio_wait(inode); 280 275 do { 281 276 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 282 277 if (ret != 0) 283 278 break; 284 - inode_lock(inode); 285 279 ret = nfs_file_fsync_commit(file, start, end, datasync); 286 280 if (!ret) 287 281 ret = pnfs_sync_inode(inode, !!datasync); 288 - inode_unlock(inode); 289 282 /* 290 283 * If nfs_file_fsync_commit detected a server reboot, then 291 284 * resend all dirty pages that might have been covered by ··· 360 359 file, mapping->host->i_ino, len, (long long) pos); 361 360 362 361 start: 363 - /* 364 - * Prevent starvation issues if someone is doing a consistency 365 - * sync-to-disk 366 - */ 367 - ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, 368 - nfs_wait_bit_killable, TASK_KILLABLE); 369 - 
if (ret) 370 - return ret; 371 - /* 372 - * Wait for O_DIRECT to complete 373 - */ 374 - inode_dio_wait(mapping->host); 375 - 376 362 page = grab_cache_page_write_begin(mapping, index, flags); 377 363 if (!page) 378 364 return -ENOMEM; ··· 420 432 return status; 421 433 NFS_I(mapping->host)->write_io += copied; 422 434 423 - if (nfs_ctx_key_to_expire(ctx)) { 435 + if (nfs_ctx_key_to_expire(ctx, mapping->host)) { 424 436 status = nfs_wb_all(mapping->host); 425 437 if (status < 0) 426 438 return status; ··· 458 470 */ 459 471 static int nfs_release_page(struct page *page, gfp_t gfp) 460 472 { 461 - struct address_space *mapping = page->mapping; 462 - 463 473 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 464 474 465 - /* Always try to initiate a 'commit' if relevant, but only 466 - * wait for it if the caller allows blocking. Even then, 467 - * only wait 1 second and only if the 'bdi' is not congested. 468 - * Waiting indefinitely can cause deadlocks when the NFS 469 - * server is on this machine, when a new TCP connection is 470 - * needed and in other rare cases. There is no particular 471 - * need to wait extensively here. A short wait has the 472 - * benefit that someone else can worry about the freezer. 
473 - */ 474 - if (mapping) { 475 - struct nfs_server *nfss = NFS_SERVER(mapping->host); 476 - nfs_commit_inode(mapping->host, 0); 477 - if (gfpflags_allow_blocking(gfp) && 478 - !bdi_write_congested(&nfss->backing_dev_info)) { 479 - wait_on_page_bit_killable_timeout(page, PG_private, 480 - HZ); 481 - if (PagePrivate(page)) 482 - set_bdi_congested(&nfss->backing_dev_info, 483 - BLK_RW_ASYNC); 484 - } 485 - } 486 475 /* If PagePrivate() is set, then the page is not freeable */ 487 476 if (PagePrivate(page)) 488 477 return 0; ··· 569 604 filp, filp->f_mapping->host->i_ino, 570 605 (long long)page_offset(page)); 571 606 607 + sb_start_pagefault(inode->i_sb); 608 + 572 609 /* make sure the cache has finished storing the page */ 573 610 nfs_fscache_wait_on_page_write(NFS_I(inode), page); 574 611 ··· 597 630 out_unlock: 598 631 unlock_page(page); 599 632 out: 633 + sb_end_pagefault(inode->i_sb); 600 634 return ret; 601 635 } 602 636 ··· 613 645 614 646 ctx = nfs_file_open_context(filp); 615 647 if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) || 616 - nfs_ctx_key_to_expire(ctx)) 648 + nfs_ctx_key_to_expire(ctx, inode)) 617 649 return 1; 618 650 return 0; 619 651 } ··· 624 656 struct inode *inode = file_inode(file); 625 657 unsigned long written = 0; 626 658 ssize_t result; 627 - size_t count = iov_iter_count(from); 628 659 629 660 result = nfs_key_timeout_notify(file, inode); 630 661 if (result) 631 662 return result; 632 663 633 - if (iocb->ki_flags & IOCB_DIRECT) { 634 - result = generic_write_checks(iocb, from); 635 - if (result <= 0) 636 - return result; 664 + if (iocb->ki_flags & IOCB_DIRECT) 637 665 return nfs_file_direct_write(iocb, from); 638 - } 639 666 640 667 dprintk("NFS: write(%pD2, %zu@%Ld)\n", 641 - file, count, (long long) iocb->ki_pos); 668 + file, iov_iter_count(from), (long long) iocb->ki_pos); 642 669 643 - result = -EBUSY; 644 670 if (IS_SWAPFILE(inode)) 645 671 goto out_swapfile; 646 672 /* ··· 646 684 goto out; 647 685 } 648 686 649 - result = 
count; 650 - if (!count) 687 + nfs_start_io_write(inode); 688 + result = generic_write_checks(iocb, from); 689 + if (result > 0) { 690 + current->backing_dev_info = inode_to_bdi(inode); 691 + result = generic_perform_write(file, from, iocb->ki_pos); 692 + current->backing_dev_info = NULL; 693 + } 694 + nfs_end_io_write(inode); 695 + if (result <= 0) 651 696 goto out; 652 697 653 - result = generic_file_write_iter(iocb, from); 654 - if (result > 0) 655 - written = result; 698 + written = generic_write_sync(iocb, result); 699 + iocb->ki_pos += written; 656 700 657 701 /* Return error values */ 658 - if (result >= 0 && nfs_need_check_write(file, inode)) { 702 + if (nfs_need_check_write(file, inode)) { 659 703 int err = vfs_fsync(file, 0); 660 704 if (err < 0) 661 705 result = err; 662 706 } 663 - if (result > 0) 664 - nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); 707 + nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written); 665 708 out: 666 709 return result; 667 710 668 711 out_swapfile: 669 712 printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); 670 - goto out; 713 + return -EBUSY; 671 714 } 672 715 EXPORT_SYMBOL_GPL(nfs_file_write); 673 716 ··· 747 780 } 748 781 749 782 static int 750 - is_time_granular(struct timespec *ts) { 751 - return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000)); 752 - } 753 - 754 - static int 755 783 do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) 756 784 { 757 785 struct inode *inode = filp->f_mapping->host; ··· 779 817 * This makes locking act as a cache coherency point. 
780 818 */ 781 819 nfs_sync_mapping(filp->f_mapping); 782 - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { 783 - if (is_time_granular(&NFS_SERVER(inode)->time_delta)) 784 - __nfs_revalidate_inode(NFS_SERVER(inode), inode); 785 - else 786 - nfs_zap_caches(inode); 787 - } 820 + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) 821 + nfs_zap_mapping(inode, filp->f_mapping); 788 822 out: 789 823 return status; 790 824 }
+13 -5
fs/nfs/filelayout/filelayout.c
··· 255 255 static void 256 256 filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) 257 257 { 258 + loff_t end_offs = 0; 258 259 259 260 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || 260 - hdr->res.verf->committed != NFS_DATA_SYNC) 261 + hdr->res.verf->committed == NFS_FILE_SYNC) 261 262 return; 263 + if (hdr->res.verf->committed == NFS_DATA_SYNC) 264 + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; 262 265 263 - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, 264 - hdr->mds_offset + hdr->res.count); 266 + /* Note: if the write is unstable, don't set end_offs until commit */ 267 + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); 265 268 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, 266 269 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); 267 270 } ··· 357 354 } 358 355 359 356 filelayout_set_layoutcommit(hdr); 357 + 358 + /* zero out the fattr */ 359 + hdr->fattr.valid = 0; 360 + if (task->tk_status >= 0) 361 + nfs_writeback_update_inode(hdr); 362 + 360 363 return 0; 361 364 } 362 365 ··· 384 375 return -EAGAIN; 385 376 } 386 377 387 - if (data->verf.committed == NFS_UNSTABLE) 388 - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); 378 + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); 389 379 390 380 return 0; 391 381 }
+13 -10
fs/nfs/flexfilelayout/flexfilelayout.c
··· 1325 1325 * we always send layoutcommit after DS writes. 1326 1326 */ 1327 1327 static void 1328 - ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) 1328 + ff_layout_set_layoutcommit(struct inode *inode, 1329 + struct pnfs_layout_segment *lseg, 1330 + loff_t end_offset) 1329 1331 { 1330 - if (!ff_layout_need_layoutcommit(hdr->lseg)) 1332 + if (!ff_layout_need_layoutcommit(lseg)) 1331 1333 return; 1332 1334 1333 - pnfs_set_layoutcommit(hdr->inode, hdr->lseg, 1334 - hdr->mds_offset + hdr->res.count); 1335 - dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, 1336 - (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); 1335 + pnfs_set_layoutcommit(inode, lseg, end_offset); 1336 + dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino, 1337 + (unsigned long long) NFS_I(inode)->layout->plh_lwb); 1337 1338 } 1338 1339 1339 1340 static bool ··· 1470 1469 static int ff_layout_write_done_cb(struct rpc_task *task, 1471 1470 struct nfs_pgio_header *hdr) 1472 1471 { 1472 + loff_t end_offs = 0; 1473 1473 int err; 1474 1474 1475 1475 trace_nfs4_pnfs_write(hdr, task->tk_status); ··· 1496 1494 1497 1495 if (hdr->res.verf->committed == NFS_FILE_SYNC || 1498 1496 hdr->res.verf->committed == NFS_DATA_SYNC) 1499 - ff_layout_set_layoutcommit(hdr); 1497 + end_offs = hdr->mds_offset + (loff_t)hdr->res.count; 1498 + 1499 + /* Note: if the write is unstable, don't set end_offs until commit */ 1500 + ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); 1500 1501 1501 1502 /* zero out fattr since we don't care DS attr at all */ 1502 1503 hdr->fattr.valid = 0; ··· 1535 1530 return -EAGAIN; 1536 1531 } 1537 1532 1538 - if (data->verf.committed == NFS_UNSTABLE 1539 - && ff_layout_need_layoutcommit(data->lseg)) 1540 - pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); 1533 + ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); 1541 1534 1542 1535 return 0; 1543 1536 }
+68 -70
fs/nfs/inode.c
··· 662 662 trace_nfs_getattr_enter(inode); 663 663 /* Flush out writes to the server in order to update c/mtime. */ 664 664 if (S_ISREG(inode->i_mode)) { 665 - inode_lock(inode); 666 - err = nfs_sync_inode(inode); 667 - inode_unlock(inode); 665 + err = filemap_write_and_wait(inode->i_mapping); 668 666 if (err) 669 667 goto out; 670 668 } ··· 877 879 struct nfs_inode *nfsi = NFS_I(inode); 878 880 879 881 spin_lock(&inode->i_lock); 880 - list_add(&ctx->list, &nfsi->open_files); 882 + if (ctx->mode & FMODE_WRITE) 883 + list_add(&ctx->list, &nfsi->open_files); 884 + else 885 + list_add_tail(&ctx->list, &nfsi->open_files); 881 886 spin_unlock(&inode->i_lock); 882 887 } 883 888 EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); ··· 972 971 goto out; 973 972 if (NFS_STALE(inode)) 974 973 goto out; 974 + 975 + /* pNFS: Attributes aren't updated until we layoutcommit */ 976 + if (S_ISREG(inode->i_mode)) { 977 + status = pnfs_sync_inode(inode, false); 978 + if (status) 979 + goto out; 980 + } 975 981 976 982 status = -ENOMEM; 977 983 fattr = nfs_alloc_fattr(); ··· 1130 1122 } 1131 1123 1132 1124 /** 1133 - * __nfs_revalidate_mapping - Revalidate the pagecache 1125 + * nfs_revalidate_mapping - Revalidate the pagecache 1134 1126 * @inode - pointer to host inode 1135 1127 * @mapping - pointer to mapping 1136 - * @may_lock - take inode->i_mutex? 
1137 1128 */ 1138 - static int __nfs_revalidate_mapping(struct inode *inode, 1139 - struct address_space *mapping, 1140 - bool may_lock) 1129 + int nfs_revalidate_mapping(struct inode *inode, 1130 + struct address_space *mapping) 1141 1131 { 1142 1132 struct nfs_inode *nfsi = NFS_I(inode); 1143 1133 unsigned long *bitlock = &nfsi->flags; ··· 1184 1178 nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; 1185 1179 spin_unlock(&inode->i_lock); 1186 1180 trace_nfs_invalidate_mapping_enter(inode); 1187 - if (may_lock) { 1188 - inode_lock(inode); 1189 - ret = nfs_invalidate_mapping(inode, mapping); 1190 - inode_unlock(inode); 1191 - } else 1192 - ret = nfs_invalidate_mapping(inode, mapping); 1181 + ret = nfs_invalidate_mapping(inode, mapping); 1193 1182 trace_nfs_invalidate_mapping_exit(inode, ret); 1194 1183 1195 1184 clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); ··· 1194 1193 return ret; 1195 1194 } 1196 1195 1197 - /** 1198 - * nfs_revalidate_mapping - Revalidate the pagecache 1199 - * @inode - pointer to host inode 1200 - * @mapping - pointer to mapping 1201 - */ 1202 - int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) 1196 + static bool nfs_file_has_writers(struct nfs_inode *nfsi) 1203 1197 { 1204 - return __nfs_revalidate_mapping(inode, mapping, false); 1198 + struct inode *inode = &nfsi->vfs_inode; 1199 + 1200 + assert_spin_locked(&inode->i_lock); 1201 + 1202 + if (!S_ISREG(inode->i_mode)) 1203 + return false; 1204 + if (list_empty(&nfsi->open_files)) 1205 + return false; 1206 + /* Note: This relies on nfsi->open_files being ordered with writers 1207 + * being placed at the head of the list. 
1208 + * See nfs_inode_attach_open_context() 1209 + */ 1210 + return (list_first_entry(&nfsi->open_files, 1211 + struct nfs_open_context, 1212 + list)->mode & FMODE_WRITE) == FMODE_WRITE; 1205 1213 } 1206 1214 1207 - /** 1208 - * nfs_revalidate_mapping_protected - Revalidate the pagecache 1209 - * @inode - pointer to host inode 1210 - * @mapping - pointer to mapping 1211 - * 1212 - * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex 1213 - * while invalidating the mapping. 1214 - */ 1215 - int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) 1215 + static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) 1216 1216 { 1217 - return __nfs_revalidate_mapping(inode, mapping, true); 1217 + return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); 1218 1218 } 1219 1219 1220 1220 static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) ··· 1282 1280 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) 1283 1281 return -EIO; 1284 1282 1285 - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && 1286 - inode->i_version != fattr->change_attr) 1287 - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1283 + if (!nfs_file_has_buffered_writers(nfsi)) { 1284 + /* Verify a few of the more important attributes */ 1285 + if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr) 1286 + invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE; 1288 1287 1289 - /* Verify a few of the more important attributes */ 1290 - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) 1291 - invalid |= NFS_INO_INVALID_ATTR; 1288 + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime)) 1289 + invalid |= NFS_INO_INVALID_ATTR; 1292 1290 1293 - if (fattr->valid & NFS_ATTR_FATTR_SIZE) { 1294 - cur_size = i_size_read(inode); 1295 - 
new_isize = nfs_size_to_loff_t(fattr->size); 1296 - if (cur_size != new_isize) 1297 - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1291 + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime)) 1292 + invalid |= NFS_INO_INVALID_ATTR; 1293 + 1294 + if (fattr->valid & NFS_ATTR_FATTR_SIZE) { 1295 + cur_size = i_size_read(inode); 1296 + new_isize = nfs_size_to_loff_t(fattr->size); 1297 + if (cur_size != new_isize) 1298 + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1299 + } 1298 1300 } 1299 - if (nfsi->nrequests != 0) 1300 - invalid &= ~NFS_INO_REVAL_PAGECACHE; 1301 1301 1302 1302 /* Have any file permissions changed? */ 1303 1303 if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) ··· 1474 1470 ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); 1475 1471 } 1476 1472 1477 - /* 1478 - * Don't trust the change_attribute, mtime, ctime or size if 1479 - * a pnfs LAYOUTCOMMIT is outstanding 1480 - */ 1481 - static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, 1482 - struct nfs_fattr *fattr) 1483 - { 1484 - if (pnfs_layoutcommit_outstanding(inode)) 1485 - fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | 1486 - NFS_ATTR_FATTR_MTIME | 1487 - NFS_ATTR_FATTR_CTIME | 1488 - NFS_ATTR_FATTR_SIZE); 1489 - } 1490 - 1491 1473 static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) 1492 1474 { 1493 1475 int ret; 1494 1476 1495 1477 trace_nfs_refresh_inode_enter(inode); 1496 - 1497 - nfs_inode_attrs_handle_layoutcommit(inode, fattr); 1498 1478 1499 1479 if (nfs_inode_attrs_need_update(inode, fattr)) 1500 1480 ret = nfs_update_inode(inode, fattr); ··· 1515 1527 1516 1528 static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) 1517 1529 { 1518 - unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1530 + unsigned long invalid = NFS_INO_INVALID_ATTR; 1519 1531 1520 1532 
/* 1521 1533 * Don't revalidate the pagecache if we hold a delegation, but do ··· 1664 1676 unsigned long invalid = 0; 1665 1677 unsigned long now = jiffies; 1666 1678 unsigned long save_cache_validity; 1679 + bool have_writers = nfs_file_has_buffered_writers(nfsi); 1667 1680 bool cache_revalidated = true; 1668 1681 1669 1682 dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", ··· 1714 1725 /* Do atomic weak cache consistency updates */ 1715 1726 invalid |= nfs_wcc_update_inode(inode, fattr); 1716 1727 1728 + if (pnfs_layoutcommit_outstanding(inode)) { 1729 + nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; 1730 + cache_revalidated = false; 1731 + } 1732 + 1717 1733 /* More cache consistency checks */ 1718 1734 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { 1719 1735 if (inode->i_version != fattr->change_attr) { 1720 1736 dprintk("NFS: change_attr change on server for file %s/%ld\n", 1721 1737 inode->i_sb->s_id, inode->i_ino); 1722 - invalid |= NFS_INO_INVALID_ATTR 1723 - | NFS_INO_INVALID_DATA 1724 - | NFS_INO_INVALID_ACCESS 1725 - | NFS_INO_INVALID_ACL; 1726 - if (S_ISDIR(inode->i_mode)) 1727 - nfs_force_lookup_revalidate(inode); 1738 + /* Could it be a race with writeback? */ 1739 + if (!have_writers) { 1740 + invalid |= NFS_INO_INVALID_ATTR 1741 + | NFS_INO_INVALID_DATA 1742 + | NFS_INO_INVALID_ACCESS 1743 + | NFS_INO_INVALID_ACL; 1744 + if (S_ISDIR(inode->i_mode)) 1745 + nfs_force_lookup_revalidate(inode); 1746 + } 1728 1747 inode->i_version = fattr->change_attr; 1729 1748 } 1730 1749 } else { ··· 1765 1768 if (new_isize != cur_isize) { 1766 1769 /* Do we perhaps have any outstanding writes, or has 1767 1770 * the file grown beyond our last write? 
*/ 1768 - if ((nfsi->nrequests == 0) || new_isize > cur_isize) { 1771 + if (nfsi->nrequests == 0 || new_isize > cur_isize) { 1769 1772 i_size_write(inode, new_isize); 1770 - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1773 + if (!have_writers) 1774 + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1771 1775 } 1772 1776 dprintk("NFS: isize change on server for file %s/%ld " 1773 1777 "(%Ld to %Ld)\n",
+51 -11
fs/nfs/internal.h
··· 66 66 67 67 struct nfs_client_initdata { 68 68 unsigned long init_flags; 69 - const char *hostname; 70 - const struct sockaddr *addr; 69 + const char *hostname; /* Hostname of the server */ 70 + const struct sockaddr *addr; /* Address of the server */ 71 + const char *nodename; /* Hostname of the client */ 72 + const char *ip_addr; /* IP address of the client */ 71 73 size_t addrlen; 72 74 struct nfs_subversion *nfs_mod; 73 75 int proto; 74 76 u32 minorversion; 75 77 struct net *net; 78 + const struct rpc_timeout *timeparms; 76 79 }; 77 80 78 81 /* ··· 150 147 extern const struct rpc_program nfs_program; 151 148 extern void nfs_clients_init(struct net *net); 152 149 extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); 153 - int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t); 150 + int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); 154 151 struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, 155 - const struct rpc_timeout *, const char *, 156 152 rpc_authflavor_t); 157 153 int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); 158 154 void nfs_server_insert_lists(struct nfs_server *); ··· 186 184 rpc_authflavor_t); 187 185 extern int nfs_wait_client_init_complete(const struct nfs_client *clp); 188 186 extern void nfs_mark_client_ready(struct nfs_client *clp, int state); 189 - extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, 187 + extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, 190 188 const struct sockaddr *ds_addr, 191 189 int ds_addrlen, int ds_proto, 192 190 unsigned int ds_timeo, ··· 195 193 rpc_authflavor_t au_flavor); 196 194 extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, 197 195 struct inode *); 198 - extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, 196 + extern struct nfs_client 
*nfs3_set_ds_client(struct nfs_server *mds_srv, 199 197 const struct sockaddr *ds_addr, int ds_addrlen, 200 198 int ds_proto, unsigned int ds_timeo, 201 199 unsigned int ds_retrans, rpc_authflavor_t au_flavor); ··· 340 338 /* proc.c */ 341 339 void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 342 340 extern struct nfs_client *nfs_init_client(struct nfs_client *clp, 343 - const struct rpc_timeout *timeparms, 344 - const char *ip_addr); 341 + const struct nfs_client_initdata *); 345 342 346 343 /* dir.c */ 347 344 extern void nfs_force_use_readdirplus(struct inode *dir); ··· 411 410 extern void __exit unregister_nfs_fs(void); 412 411 extern bool nfs_sb_active(struct super_block *sb); 413 412 extern void nfs_sb_deactive(struct super_block *sb); 413 + 414 + /* io.c */ 415 + extern void nfs_start_io_read(struct inode *inode); 416 + extern void nfs_end_io_read(struct inode *inode); 417 + extern void nfs_start_io_write(struct inode *inode); 418 + extern void nfs_end_io_write(struct inode *inode); 419 + extern void nfs_start_io_direct(struct inode *inode); 420 + extern void nfs_end_io_direct(struct inode *inode); 421 + 422 + static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) 423 + { 424 + return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; 425 + } 414 426 415 427 /* namespace.c */ 416 428 #define NFS_PATH_CANONICAL 1 ··· 510 496 struct inode *inode, 511 497 struct nfs_direct_req *dreq); 512 498 int nfs_key_timeout_notify(struct file *filp, struct inode *inode); 513 - bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); 499 + bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); 514 500 void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); 501 + 502 + int nfs_filemap_write_and_wait_range(struct address_space *mapping, 503 + loff_t lstart, loff_t lend); 504 + 505 + #ifdef CONFIG_NFS_V4_1 506 + static inline 507 + void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) 508 + { 509 
+ int i; 510 + 511 + for (i = 0; i < cinfo->nbuckets; i++) 512 + cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; 513 + } 514 + #else 515 + static inline 516 + void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) 517 + { 518 + } 519 + #endif 520 + 515 521 516 522 #ifdef CONFIG_MIGRATION 517 523 extern int nfs_migrate_page(struct address_space *, ··· 539 505 #else 540 506 #define nfs_migrate_page NULL 541 507 #endif 508 + 509 + static inline int 510 + nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, 511 + const struct nfs_write_verifier *v2) 512 + { 513 + return memcmp(v1->data, v2->data, sizeof(v1->data)); 514 + } 542 515 543 516 /* unlink.c */ 544 517 extern struct rpc_task * ··· 562 521 /* nfs4proc.c */ 563 522 extern void __nfs4_read_done_cb(struct nfs_pgio_header *); 564 523 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, 565 - const struct rpc_timeout *timeparms, 566 - const char *ip_addr); 524 + const struct nfs_client_initdata *); 567 525 extern int nfs40_walk_client_list(struct nfs_client *clp, 568 526 struct nfs_client **result, 569 527 struct rpc_cred *cred);
+147
fs/nfs/io.c
··· 1 + /* 2 + * Copyright (c) 2016 Trond Myklebust 3 + * 4 + * I/O and data path helper functionality. 5 + */ 6 + 7 + #include <linux/types.h> 8 + #include <linux/kernel.h> 9 + #include <linux/bitops.h> 10 + #include <linux/rwsem.h> 11 + #include <linux/fs.h> 12 + #include <linux/nfs_fs.h> 13 + 14 + #include "internal.h" 15 + 16 + /* Call with exclusively locked inode->i_rwsem */ 17 + static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) 18 + { 19 + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { 20 + clear_bit(NFS_INO_ODIRECT, &nfsi->flags); 21 + inode_dio_wait(inode); 22 + } 23 + } 24 + 25 + /** 26 + * nfs_start_io_read - declare the file is being used for buffered reads 27 + * @inode - file inode 28 + * 29 + * Declare that a buffered read operation is about to start, and ensure 30 + * that we block all direct I/O. 31 + * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset, 32 + * and holds a shared lock on inode->i_rwsem to ensure that the flag 33 + * cannot be changed. 34 + * In practice, this means that buffered read operations are allowed to 35 + * execute in parallel, thanks to the shared lock, whereas direct I/O 36 + * operations need to wait to grab an exclusive lock in order to set 37 + * NFS_INO_ODIRECT. 38 + * Note that buffered writes and truncates both take a write lock on 39 + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. 40 + */ 41 + void 42 + nfs_start_io_read(struct inode *inode) 43 + { 44 + struct nfs_inode *nfsi = NFS_I(inode); 45 + /* Be an optimist! */ 46 + down_read(&inode->i_rwsem); 47 + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) 48 + return; 49 + up_read(&inode->i_rwsem); 50 + /* Slow path.... 
*/ 51 + down_write(&inode->i_rwsem); 52 + nfs_block_o_direct(nfsi, inode); 53 + downgrade_write(&inode->i_rwsem); 54 + } 55 + 56 + /** 57 + * nfs_end_io_read - declare that the buffered read operation is done 58 + * @inode - file inode 59 + * 60 + * Declare that a buffered read operation is done, and release the shared 61 + * lock on inode->i_rwsem. 62 + */ 63 + void 64 + nfs_end_io_read(struct inode *inode) 65 + { 66 + up_read(&inode->i_rwsem); 67 + } 68 + 69 + /** 70 + * nfs_start_io_write - declare the file is being used for buffered writes 71 + * @inode - file inode 72 + * 73 + * Declare that a buffered read operation is about to start, and ensure 74 + * that we block all direct I/O. 75 + */ 76 + void 77 + nfs_start_io_write(struct inode *inode) 78 + { 79 + down_write(&inode->i_rwsem); 80 + nfs_block_o_direct(NFS_I(inode), inode); 81 + } 82 + 83 + /** 84 + * nfs_end_io_write - declare that the buffered write operation is done 85 + * @inode - file inode 86 + * 87 + * Declare that a buffered write operation is done, and release the 88 + * lock on inode->i_rwsem. 89 + */ 90 + void 91 + nfs_end_io_write(struct inode *inode) 92 + { 93 + up_write(&inode->i_rwsem); 94 + } 95 + 96 + /* Call with exclusively locked inode->i_rwsem */ 97 + static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) 98 + { 99 + if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { 100 + set_bit(NFS_INO_ODIRECT, &nfsi->flags); 101 + nfs_wb_all(inode); 102 + } 103 + } 104 + 105 + /** 106 + * nfs_end_io_direct - declare the file is being used for direct i/o 107 + * @inode - file inode 108 + * 109 + * Declare that a direct I/O operation is about to start, and ensure 110 + * that we block all buffered I/O. 111 + * On exit, the function ensures that the NFS_INO_ODIRECT flag is set, 112 + * and holds a shared lock on inode->i_rwsem to ensure that the flag 113 + * cannot be changed. 
114 + * In practice, this means that direct I/O operations are allowed to 115 + * execute in parallel, thanks to the shared lock, whereas buffered I/O 116 + * operations need to wait to grab an exclusive lock in order to clear 117 + * NFS_INO_ODIRECT. 118 + * Note that buffered writes and truncates both take a write lock on 119 + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. 120 + */ 121 + void 122 + nfs_start_io_direct(struct inode *inode) 123 + { 124 + struct nfs_inode *nfsi = NFS_I(inode); 125 + /* Be an optimist! */ 126 + down_read(&inode->i_rwsem); 127 + if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) 128 + return; 129 + up_read(&inode->i_rwsem); 130 + /* Slow path.... */ 131 + down_write(&inode->i_rwsem); 132 + nfs_block_buffered(nfsi, inode); 133 + downgrade_write(&inode->i_rwsem); 134 + } 135 + 136 + /** 137 + * nfs_end_io_direct - declare that the direct i/o operation is done 138 + * @inode - file inode 139 + * 140 + * Declare that a direct I/O operation is done, and release the shared 141 + * lock on inode->i_rwsem. 142 + */ 143 + void 144 + nfs_end_io_direct(struct inode *inode) 145 + { 146 + up_read(&inode->i_rwsem); 147 + }
+10 -4
fs/nfs/nfs3client.c
··· 76 76 * low timeout interval so that if a connection is lost, we retry through 77 77 * the MDS. 78 78 */ 79 - struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, 79 + struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, 80 80 const struct sockaddr *ds_addr, int ds_addrlen, 81 81 int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, 82 82 rpc_authflavor_t au_flavor) 83 83 { 84 + struct rpc_timeout ds_timeout; 85 + struct nfs_client *mds_clp = mds_srv->nfs_client; 84 86 struct nfs_client_initdata cl_init = { 85 87 .addr = ds_addr, 86 88 .addrlen = ds_addrlen, 89 + .nodename = mds_clp->cl_rpcclient->cl_nodename, 90 + .ip_addr = mds_clp->cl_ipaddr, 87 91 .nfs_mod = &nfs_v3, 88 92 .proto = ds_proto, 89 93 .net = mds_clp->cl_net, 94 + .timeparms = &ds_timeout, 90 95 }; 91 - struct rpc_timeout ds_timeout; 92 96 struct nfs_client *clp; 93 97 char buf[INET6_ADDRSTRLEN + 1]; 94 98 ··· 101 97 return ERR_PTR(-EINVAL); 102 98 cl_init.hostname = buf; 103 99 100 + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) 101 + set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 102 + 104 103 /* Use the MDS nfs_client cl_ipaddr. */ 105 104 nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); 106 - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, 107 - au_flavor); 105 + clp = nfs_get_client(&cl_init, au_flavor); 108 106 109 107 return clp; 110 108 }
+19 -5
fs/nfs/nfs42proc.c
··· 113 113 if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) 114 114 return -EOPNOTSUPP; 115 115 116 - nfs_wb_all(inode); 117 116 inode_lock(inode); 117 + err = nfs_sync_inode(inode); 118 + if (err) 119 + goto out_unlock; 118 120 119 121 err = nfs42_proc_fallocate(&msg, filep, offset, len); 120 122 if (err == 0) 121 123 truncate_pagecache_range(inode, offset, (offset + len) -1); 122 124 if (err == -EOPNOTSUPP) 123 125 NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; 124 - 126 + out_unlock: 125 127 inode_unlock(inode); 126 128 return err; 127 129 } ··· 156 154 if (status) 157 155 return status; 158 156 157 + status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, 158 + pos_src, pos_src + (loff_t)count - 1); 159 + if (status) 160 + return status; 161 + 159 162 status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, 160 163 dst_lock, FMODE_WRITE); 164 + if (status) 165 + return status; 166 + 167 + status = nfs_sync_inode(dst_inode); 161 168 if (status) 162 169 return status; 163 170 ··· 269 258 if (status) 270 259 return status; 271 260 272 - nfs_wb_all(inode); 261 + status = nfs_filemap_write_and_wait_range(inode->i_mapping, 262 + offset, LLONG_MAX); 263 + if (status) 264 + return status; 265 + 273 266 status = nfs4_call_sync(server->client, server, &msg, 274 267 &args.seq_args, &res.seq_res, 0); 275 268 if (status == -ENOTSUPP) ··· 351 336 * Mark the bad layout state as invalid, then retry 352 337 * with the current stateid. 353 338 */ 354 - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 355 - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); 339 + pnfs_mark_layout_stateid_invalid(lo, &head); 356 340 spin_unlock(&inode->i_lock); 357 341 pnfs_free_lseg_list(&head); 358 342 } else
+10 -2
fs/nfs/nfs42xdr.c
··· 330 330 struct nfs42_write_res *res) 331 331 { 332 332 __be32 *p; 333 - int stateids; 334 333 335 334 p = xdr_inline_decode(xdr, 4 + 8 + 4); 336 335 if (unlikely(!p)) 337 336 goto out_overflow; 338 337 339 - stateids = be32_to_cpup(p++); 338 + /* 339 + * We never use asynchronous mode, so warn if a server returns 340 + * a stateid. 341 + */ 342 + if (unlikely(*p != 0)) { 343 + pr_err_once("%s: server has set unrequested " 344 + "asynchronous mode\n", __func__); 345 + return -EREMOTEIO; 346 + } 347 + p++; 340 348 p = xdr_decode_hyper(p, &res->count); 341 349 res->verifier.committed = be32_to_cpup(p); 342 350 return decode_verifier(xdr, &res->verifier.verifier);
+1
fs/nfs/nfs4_fs.h
··· 185 185 struct nfs4_exception { 186 186 struct nfs4_state *state; 187 187 struct inode *inode; 188 + nfs4_stateid *stateid; 188 189 long timeout; 189 190 unsigned char delay : 1, 190 191 recovering : 1,
+17 -9
fs/nfs/nfs4client.c
··· 349 349 * Returns pointer to an NFS client, or an ERR_PTR value. 350 350 */ 351 351 struct nfs_client *nfs4_init_client(struct nfs_client *clp, 352 - const struct rpc_timeout *timeparms, 353 - const char *ip_addr) 352 + const struct nfs_client_initdata *cl_init) 354 353 { 355 354 char buf[INET6_ADDRSTRLEN + 1]; 355 + const char *ip_addr = cl_init->ip_addr; 356 356 struct nfs_client *old; 357 357 int error; 358 358 ··· 370 370 __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); 371 371 __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); 372 372 373 - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); 373 + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); 374 374 if (error == -EINVAL) 375 - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); 375 + error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); 376 376 if (error < 0) 377 377 goto error; 378 378 ··· 793 793 .hostname = hostname, 794 794 .addr = addr, 795 795 .addrlen = addrlen, 796 + .ip_addr = ip_addr, 796 797 .nfs_mod = &nfs_v4, 797 798 .proto = proto, 798 799 .minorversion = minorversion, 799 800 .net = net, 801 + .timeparms = timeparms, 800 802 }; 801 803 struct nfs_client *clp; 802 804 int error; ··· 811 809 set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); 812 810 813 811 /* Allocate or find a client reference we can use */ 814 - clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour); 812 + clp = nfs_get_client(&cl_init, authflavour); 815 813 if (IS_ERR(clp)) { 816 814 error = PTR_ERR(clp); 817 815 goto error; ··· 844 842 * low timeout interval so that if a connection is lost, we retry through 845 843 * the MDS. 
846 844 */ 847 - struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, 845 + struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, 848 846 const struct sockaddr *ds_addr, int ds_addrlen, 849 847 int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, 850 848 u32 minor_version, rpc_authflavor_t au_flavor) 851 849 { 850 + struct rpc_timeout ds_timeout; 851 + struct nfs_client *mds_clp = mds_srv->nfs_client; 852 852 struct nfs_client_initdata cl_init = { 853 853 .addr = ds_addr, 854 854 .addrlen = ds_addrlen, 855 + .nodename = mds_clp->cl_rpcclient->cl_nodename, 856 + .ip_addr = mds_clp->cl_ipaddr, 855 857 .nfs_mod = &nfs_v4, 856 858 .proto = ds_proto, 857 859 .minorversion = minor_version, 858 860 .net = mds_clp->cl_net, 861 + .timeparms = &ds_timeout, 859 862 }; 860 - struct rpc_timeout ds_timeout; 861 863 struct nfs_client *clp; 862 864 char buf[INET6_ADDRSTRLEN + 1]; 863 865 ··· 869 863 return ERR_PTR(-EINVAL); 870 864 cl_init.hostname = buf; 871 865 866 + if (mds_srv->flags & NFS_MOUNT_NORESVPORT) 867 + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); 868 + 872 869 /* 873 870 * Set an authflavor equual to the MDS value. Use the MDS nfs_client 874 871 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS 875 872 * (section 13.1 RFC 5661). 876 873 */ 877 874 nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); 878 - clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, 879 - au_flavor); 875 + clp = nfs_get_client(&cl_init, au_flavor); 880 876 881 877 dprintk("<-- %s %p\n", __func__, clp); 882 878 return clp;
+2 -14
fs/nfs/nfs4file.c
··· 66 66 if (openflags & O_TRUNC) { 67 67 attr.ia_valid |= ATTR_SIZE; 68 68 attr.ia_size = 0; 69 - nfs_sync_inode(inode); 69 + filemap_write_and_wait(inode->i_mapping); 70 70 } 71 71 72 72 inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL); ··· 133 133 struct file *file_out, loff_t pos_out, 134 134 size_t count, unsigned int flags) 135 135 { 136 - struct inode *in_inode = file_inode(file_in); 137 - struct inode *out_inode = file_inode(file_out); 138 - int ret; 139 - 140 - if (in_inode == out_inode) 136 + if (file_inode(file_in) == file_inode(file_out)) 141 137 return -EINVAL; 142 - 143 - /* flush any pending writes */ 144 - ret = nfs_sync_inode(in_inode); 145 - if (ret) 146 - return ret; 147 - ret = nfs_sync_inode(out_inode); 148 - if (ret) 149 - return ret; 150 138 151 139 return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); 152 140 }
+108 -101
fs/nfs/nfs4proc.c
··· 363 363 { 364 364 struct nfs_client *clp = server->nfs_client; 365 365 struct nfs4_state *state = exception->state; 366 + const nfs4_stateid *stateid = exception->stateid; 366 367 struct inode *inode = exception->inode; 367 368 int ret = errorcode; 368 369 ··· 377 376 case -NFS4ERR_DELEG_REVOKED: 378 377 case -NFS4ERR_ADMIN_REVOKED: 379 378 case -NFS4ERR_BAD_STATEID: 380 - if (inode && nfs_async_inode_return_delegation(inode, 381 - NULL) == 0) 382 - goto wait_on_recovery; 379 + if (inode) { 380 + int err; 381 + 382 + err = nfs_async_inode_return_delegation(inode, 383 + stateid); 384 + if (err == 0) 385 + goto wait_on_recovery; 386 + if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) { 387 + exception->retry = 1; 388 + break; 389 + } 390 + } 383 391 if (state == NULL) 384 392 break; 385 393 ret = nfs4_schedule_stateid_recovery(server, state); ··· 437 427 case -NFS4ERR_DELAY: 438 428 nfs_inc_server_stats(server, NFSIOS_DELAY); 439 429 case -NFS4ERR_GRACE: 430 + case -NFS4ERR_LAYOUTTRYLATER: 440 431 case -NFS4ERR_RECALLCONFLICT: 441 432 exception->delay = 1; 442 433 return 0; ··· 2680 2669 return res; 2681 2670 } 2682 2671 2683 - static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 2684 - struct nfs_fattr *fattr, struct iattr *sattr, 2685 - struct nfs4_state *state, struct nfs4_label *ilabel, 2686 - struct nfs4_label *olabel) 2672 + static int _nfs4_do_setattr(struct inode *inode, 2673 + struct nfs_setattrargs *arg, 2674 + struct nfs_setattrres *res, 2675 + struct rpc_cred *cred, 2676 + struct nfs4_state *state) 2677 + { 2678 + struct nfs_server *server = NFS_SERVER(inode); 2679 + struct rpc_message msg = { 2680 + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], 2681 + .rpc_argp = arg, 2682 + .rpc_resp = res, 2683 + .rpc_cred = cred, 2684 + }; 2685 + struct rpc_cred *delegation_cred = NULL; 2686 + unsigned long timestamp = jiffies; 2687 + fmode_t fmode; 2688 + bool truncate; 2689 + int status; 2690 + 2691 + 
nfs_fattr_init(res->fattr); 2692 + 2693 + /* Servers should only apply open mode checks for file size changes */ 2694 + truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false; 2695 + fmode = truncate ? FMODE_WRITE : FMODE_READ; 2696 + 2697 + if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { 2698 + /* Use that stateid */ 2699 + } else if (truncate && state != NULL) { 2700 + struct nfs_lockowner lockowner = { 2701 + .l_owner = current->files, 2702 + .l_pid = current->tgid, 2703 + }; 2704 + if (!nfs4_valid_open_stateid(state)) 2705 + return -EBADF; 2706 + if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, 2707 + &arg->stateid, &delegation_cred) == -EIO) 2708 + return -EBADF; 2709 + } else 2710 + nfs4_stateid_copy(&arg->stateid, &zero_stateid); 2711 + if (delegation_cred) 2712 + msg.rpc_cred = delegation_cred; 2713 + 2714 + status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); 2715 + 2716 + put_rpccred(delegation_cred); 2717 + if (status == 0 && state != NULL) 2718 + renew_lease(server, timestamp); 2719 + trace_nfs4_setattr(inode, &arg->stateid, status); 2720 + return status; 2721 + } 2722 + 2723 + static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 2724 + struct nfs_fattr *fattr, struct iattr *sattr, 2725 + struct nfs4_state *state, struct nfs4_label *ilabel, 2726 + struct nfs4_label *olabel) 2687 2727 { 2688 2728 struct nfs_server *server = NFS_SERVER(inode); 2689 2729 struct nfs_setattrargs arg = { ··· 2749 2687 .label = olabel, 2750 2688 .server = server, 2751 2689 }; 2752 - struct rpc_message msg = { 2753 - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR], 2754 - .rpc_argp = &arg, 2755 - .rpc_resp = &res, 2756 - .rpc_cred = cred, 2757 - }; 2758 - struct rpc_cred *delegation_cred = NULL; 2759 - unsigned long timestamp = jiffies; 2760 - fmode_t fmode; 2761 - bool truncate; 2762 - int status; 2690 + struct nfs4_exception exception = { 2691 + .state = state, 2692 + 
.inode = inode, 2693 + .stateid = &arg.stateid, 2694 + }; 2695 + int err; 2763 2696 2764 2697 arg.bitmask = nfs4_bitmask(server, ilabel); 2765 2698 if (ilabel) 2766 2699 arg.bitmask = nfs4_bitmask(server, olabel); 2767 2700 2768 - nfs_fattr_init(fattr); 2769 - 2770 - /* Servers should only apply open mode checks for file size changes */ 2771 - truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; 2772 - fmode = truncate ? FMODE_WRITE : FMODE_READ; 2773 - 2774 - if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) { 2775 - /* Use that stateid */ 2776 - } else if (truncate && state != NULL) { 2777 - struct nfs_lockowner lockowner = { 2778 - .l_owner = current->files, 2779 - .l_pid = current->tgid, 2780 - }; 2781 - if (!nfs4_valid_open_stateid(state)) 2782 - return -EBADF; 2783 - if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, 2784 - &arg.stateid, &delegation_cred) == -EIO) 2785 - return -EBADF; 2786 - } else 2787 - nfs4_stateid_copy(&arg.stateid, &zero_stateid); 2788 - if (delegation_cred) 2789 - msg.rpc_cred = delegation_cred; 2790 - 2791 - status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); 2792 - 2793 - put_rpccred(delegation_cred); 2794 - if (status == 0 && state != NULL) 2795 - renew_lease(server, timestamp); 2796 - trace_nfs4_setattr(inode, &arg.stateid, status); 2797 - return status; 2798 - } 2799 - 2800 - static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 2801 - struct nfs_fattr *fattr, struct iattr *sattr, 2802 - struct nfs4_state *state, struct nfs4_label *ilabel, 2803 - struct nfs4_label *olabel) 2804 - { 2805 - struct nfs_server *server = NFS_SERVER(inode); 2806 - struct nfs4_exception exception = { 2807 - .state = state, 2808 - .inode = inode, 2809 - }; 2810 - int err; 2811 2701 do { 2812 - err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); 2702 + err = _nfs4_do_setattr(inode, &arg, &res, cred, state); 2813 2703 switch (err) { 2814 2704 
case -NFS4ERR_OPENMODE: 2815 2705 if (!(sattr->ia_valid & ATTR_SIZE)) { ··· 3281 3267 return status; 3282 3268 } 3283 3269 3284 - static int nfs4_do_find_root_sec(struct nfs_server *server, 3285 - struct nfs_fh *fhandle, struct nfs_fsinfo *info) 3286 - { 3287 - int mv = server->nfs_client->cl_minorversion; 3288 - return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info); 3289 - } 3290 - 3291 3270 /** 3292 3271 * nfs4_proc_get_rootfh - get file handle for server's pseudoroot 3293 3272 * @server: initialized nfs_server handle ··· 3300 3293 status = nfs4_lookup_root(server, fhandle, info); 3301 3294 3302 3295 if (auth_probe || status == NFS4ERR_WRONGSEC) 3303 - status = nfs4_do_find_root_sec(server, fhandle, info); 3296 + status = server->nfs_client->cl_mvops->find_root_sec(server, 3297 + fhandle, info); 3304 3298 3305 3299 if (status == 0) 3306 3300 status = nfs4_server_capabilities(server, fhandle); ··· 4400 4392 struct rpc_message *msg) 4401 4393 { 4402 4394 hdr->timestamp = jiffies; 4403 - hdr->pgio_done_cb = nfs4_read_done_cb; 4395 + if (!hdr->pgio_done_cb) 4396 + hdr->pgio_done_cb = nfs4_read_done_cb; 4404 4397 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 4405 4398 nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0); 4406 4399 } ··· 7878 7869 struct inode *inode = lgp->args.inode; 7879 7870 struct nfs_server *server = NFS_SERVER(inode); 7880 7871 struct pnfs_layout_hdr *lo; 7881 - int status = task->tk_status; 7872 + int nfs4err = task->tk_status; 7873 + int err, status = 0; 7874 + LIST_HEAD(head); 7882 7875 7883 7876 dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); 7884 7877 7885 - switch (status) { 7878 + switch (nfs4err) { 7886 7879 case 0: 7887 7880 goto out; 7888 7881 ··· 7916 7905 status = -EOVERFLOW; 7917 7906 goto out; 7918 7907 } 7919 - /* Fallthrough */ 7908 + status = -EBUSY; 7909 + break; 7920 7910 case -NFS4ERR_RECALLCONFLICT: 7921 - nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT, 7922 - exception); 
7923 7911 status = -ERECALLCONFLICT; 7924 - goto out; 7912 + break; 7925 7913 case -NFS4ERR_EXPIRED: 7926 7914 case -NFS4ERR_BAD_STATEID: 7927 7915 exception->timeout = 0; 7928 7916 spin_lock(&inode->i_lock); 7929 - if (nfs4_stateid_match(&lgp->args.stateid, 7917 + lo = NFS_I(inode)->layout; 7918 + /* If the open stateid was bad, then recover it. */ 7919 + if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || 7920 + nfs4_stateid_match_other(&lgp->args.stateid, 7930 7921 &lgp->args.ctx->state->stateid)) { 7931 7922 spin_unlock(&inode->i_lock); 7932 - /* If the open stateid was bad, then recover it. */ 7933 7923 exception->state = lgp->args.ctx->state; 7934 7924 break; 7935 7925 } 7936 - lo = NFS_I(inode)->layout; 7937 - if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) && 7938 - nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { 7939 - LIST_HEAD(head); 7940 7926 7941 - /* 7942 - * Mark the bad layout state as invalid, then retry 7943 - * with the current stateid. 
7944 - */ 7945 - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 7946 - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); 7947 - spin_unlock(&inode->i_lock); 7948 - pnfs_free_lseg_list(&head); 7949 - status = -EAGAIN; 7950 - goto out; 7951 - } else 7952 - spin_unlock(&inode->i_lock); 7927 + /* 7928 + * Mark the bad layout state as invalid, then retry 7929 + */ 7930 + pnfs_mark_layout_stateid_invalid(lo, &head); 7931 + spin_unlock(&inode->i_lock); 7932 + pnfs_free_lseg_list(&head); 7933 + status = -EAGAIN; 7934 + goto out; 7953 7935 } 7954 7936 7955 - status = nfs4_handle_exception(server, status, exception); 7956 - if (exception->retry) 7957 - status = -EAGAIN; 7937 + err = nfs4_handle_exception(server, nfs4err, exception); 7938 + if (!status) { 7939 + if (exception->retry) 7940 + status = -EAGAIN; 7941 + else 7942 + status = err; 7943 + } 7958 7944 out: 7959 7945 dprintk("<-- %s\n", __func__); 7960 7946 return status; ··· 8137 8129 spin_lock(&lo->plh_inode->i_lock); 8138 8130 pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, 8139 8131 be32_to_cpu(lrp->args.stateid.seqid)); 8140 - pnfs_mark_layout_returned_if_empty(lo); 8141 - if (lrp->res.lrs_present) 8132 + if (lrp->res.lrs_present && pnfs_layout_is_valid(lo)) 8142 8133 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); 8143 8134 pnfs_clear_layoutreturn_waitbit(lo); 8144 8135 spin_unlock(&lo->plh_inode->i_lock); ··· 8842 8835 #endif 8843 8836 }; 8844 8837 8845 - ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) 8838 + static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) 8846 8839 { 8847 8840 ssize_t error, error2; 8848 8841
+8 -3
fs/nfs/nfs4xdr.c
··· 1985 1985 p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */ 1986 1986 *p = cpu_to_be32(0); /* reclaim */ 1987 1987 encode_nfs4_stateid(xdr, &args->stateid); 1988 - p = reserve_space(xdr, 20); 1989 - *p++ = cpu_to_be32(1); /* newoffset = TRUE */ 1990 - p = xdr_encode_hyper(p, args->lastbytewritten); 1988 + if (args->lastbytewritten != U64_MAX) { 1989 + p = reserve_space(xdr, 20); 1990 + *p++ = cpu_to_be32(1); /* newoffset = TRUE */ 1991 + p = xdr_encode_hyper(p, args->lastbytewritten); 1992 + } else { 1993 + p = reserve_space(xdr, 12); 1994 + *p++ = cpu_to_be32(0); /* newoffset = FALSE */ 1995 + } 1991 1996 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1992 1997 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ 1993 1998
-1
fs/nfs/nfstrace.h
··· 37 37 { 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \ 38 38 { 1 << NFS_INO_STALE, "STALE" }, \ 39 39 { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ 40 - { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ 41 40 { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ 42 41 { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ 43 42 { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
+116 -75
fs/nfs/pnfs.c
··· 259 259 * is required. 260 260 * Note that caller must hold inode->i_lock. 261 261 */ 262 - static int 262 + int 263 263 pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, 264 264 struct list_head *lseg_list) 265 265 { ··· 334 334 } 335 335 336 336 static void 337 - init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) 337 + pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, 338 + const struct pnfs_layout_range *range, 339 + const nfs4_stateid *stateid) 338 340 { 339 341 INIT_LIST_HEAD(&lseg->pls_list); 340 342 INIT_LIST_HEAD(&lseg->pls_lc_list); 341 343 atomic_set(&lseg->pls_refcount, 1); 342 - smp_mb(); 343 344 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 344 345 lseg->pls_layout = lo; 346 + lseg->pls_range = *range; 347 + lseg->pls_seq = be32_to_cpu(stateid->seqid); 345 348 } 346 349 347 350 static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) ··· 489 486 (end2 == NFS4_MAX_UINT64 || end2 > start1); 490 487 } 491 488 492 - static bool 493 - should_free_lseg(const struct pnfs_layout_range *lseg_range, 494 - const struct pnfs_layout_range *recall_range) 495 - { 496 - return (recall_range->iomode == IOMODE_ANY || 497 - lseg_range->iomode == recall_range->iomode) && 498 - pnfs_lseg_range_intersecting(lseg_range, recall_range); 499 - } 500 - 501 489 static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, 502 490 struct list_head *tmp_list) 503 491 { ··· 527 533 return (s32)(s1 - s2) > 0; 528 534 } 529 535 536 + static bool 537 + pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, 538 + const struct pnfs_layout_range *recall_range) 539 + { 540 + return (recall_range->iomode == IOMODE_ANY || 541 + lseg_range->iomode == recall_range->iomode) && 542 + pnfs_lseg_range_intersecting(lseg_range, recall_range); 543 + } 544 + 545 + static bool 546 + pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg, 547 + const struct pnfs_layout_range *recall_range, 548 + u32 seq) 549 
+ { 550 + if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq)) 551 + return false; 552 + if (recall_range == NULL) 553 + return true; 554 + return pnfs_should_free_range(&lseg->pls_range, recall_range); 555 + } 556 + 530 557 /** 531 558 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later 532 559 * @lo: layout header containing the lsegs ··· 577 562 if (list_empty(&lo->plh_segs)) 578 563 return 0; 579 564 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 580 - if (!recall_range || 581 - should_free_lseg(&lseg->pls_range, recall_range)) { 582 - if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq)) 583 - continue; 565 + if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { 584 566 dprintk("%s: freeing lseg %p iomode %d seq %u" 585 567 "offset %llu length %llu\n", __func__, 586 568 lseg, lseg->pls_range.iomode, lseg->pls_seq, ··· 773 761 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 774 762 bool update_barrier) 775 763 { 776 - u32 oldseq, newseq, new_barrier; 777 - int empty = list_empty(&lo->plh_segs); 764 + u32 oldseq, newseq, new_barrier = 0; 765 + bool invalid = !pnfs_layout_is_valid(lo); 778 766 779 767 oldseq = be32_to_cpu(lo->plh_stateid.seqid); 780 768 newseq = be32_to_cpu(new->seqid); 781 - if (empty || pnfs_seqid_is_newer(newseq, oldseq)) { 769 + if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) { 782 770 nfs4_stateid_copy(&lo->plh_stateid, new); 783 - if (update_barrier) { 784 - new_barrier = be32_to_cpu(new->seqid); 785 - } else { 786 - /* Because of wraparound, we want to keep the barrier 787 - * "close" to the current seqids. 788 - */ 789 - new_barrier = newseq - atomic_read(&lo->plh_outstanding); 790 - } 791 - if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) 792 - lo->plh_barrier = new_barrier; 771 + /* 772 + * Because of wraparound, we want to keep the barrier 773 + * "close" to the current seqids. 
774 + */ 775 + new_barrier = newseq - atomic_read(&lo->plh_outstanding); 793 776 } 777 + if (update_barrier) 778 + new_barrier = be32_to_cpu(new->seqid); 779 + else if (new_barrier == 0) 780 + return; 781 + if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier)) 782 + lo->plh_barrier = new_barrier; 794 783 } 795 784 796 785 static bool ··· 886 873 rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); 887 874 } 888 875 876 + static void 877 + pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) 878 + { 879 + lo->plh_return_iomode = 0; 880 + lo->plh_return_seq = 0; 881 + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); 882 + } 883 + 889 884 static bool 890 - pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) 885 + pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, 886 + nfs4_stateid *stateid, 887 + enum pnfs_iomode *iomode) 891 888 { 892 889 if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) 893 890 return false; 894 - lo->plh_return_iomode = 0; 895 - lo->plh_return_seq = 0; 896 891 pnfs_get_layout_hdr(lo); 897 - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); 892 + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { 893 + if (stateid != NULL) { 894 + nfs4_stateid_copy(stateid, &lo->plh_stateid); 895 + if (lo->plh_return_seq != 0) 896 + stateid->seqid = cpu_to_be32(lo->plh_return_seq); 897 + } 898 + if (iomode != NULL) 899 + *iomode = lo->plh_return_iomode; 900 + pnfs_clear_layoutreturn_info(lo); 901 + return true; 902 + } 903 + if (stateid != NULL) 904 + nfs4_stateid_copy(stateid, &lo->plh_stateid); 905 + if (iomode != NULL) 906 + *iomode = IOMODE_ANY; 898 907 return true; 899 908 } 900 909 ··· 984 949 enum pnfs_iomode iomode; 985 950 bool send; 986 951 987 - nfs4_stateid_copy(&stateid, &lo->plh_stateid); 988 - stateid.seqid = cpu_to_be32(lo->plh_return_seq); 989 - iomode = lo->plh_return_iomode; 990 - send = pnfs_prepare_layoutreturn(lo); 952 + send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); 991 953 
spin_unlock(&inode->i_lock); 992 954 if (send) { 993 955 /* Send an async layoutreturn so we dont deadlock */ ··· 1021 989 dprintk("NFS: %s no layout to return\n", __func__); 1022 990 goto out; 1023 991 } 1024 - nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); 1025 992 /* Reference matched in nfs4_layoutreturn_release */ 1026 993 pnfs_get_layout_hdr(lo); 1027 994 empty = list_empty(&lo->plh_segs); ··· 1043 1012 goto out_put_layout_hdr; 1044 1013 } 1045 1014 1046 - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 1047 - send = pnfs_prepare_layoutreturn(lo); 1015 + send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); 1048 1016 spin_unlock(&ino->i_lock); 1049 1017 pnfs_free_lseg_list(&tmp_list); 1050 1018 if (send) ··· 1110 1080 goto out_noroc; 1111 1081 } 1112 1082 1113 - nfs4_stateid_copy(&stateid, &lo->plh_stateid); 1114 1083 /* always send layoutreturn if being marked so */ 1115 - if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, 1116 - &lo->plh_flags)) 1117 - layoutreturn = pnfs_prepare_layoutreturn(lo); 1084 + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) 1085 + layoutreturn = pnfs_prepare_layoutreturn(lo, 1086 + &stateid, NULL); 1118 1087 1119 1088 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1120 1089 /* If we are sending layoutreturn, invalidate all valid lsegs */ ··· 1161 1132 1162 1133 spin_lock(&ino->i_lock); 1163 1134 lo = NFS_I(ino)->layout; 1164 - pnfs_mark_layout_returned_if_empty(lo); 1165 1135 if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) 1166 1136 lo->plh_barrier = barrier; 1167 1137 spin_unlock(&ino->i_lock); ··· 1533 1505 struct pnfs_layout_segment *lseg = NULL; 1534 1506 nfs4_stateid stateid; 1535 1507 long timeout = 0; 1536 - unsigned long giveup = jiffies + rpc_get_timeout(server->client); 1508 + unsigned long giveup = jiffies + (clp->cl_lease_time << 1); 1537 1509 bool first; 1538 1510 1539 1511 if (!pnfs_enabled_sb(NFS_SERVER(ino))) { ··· 1673 1645 lseg = send_layoutget(lo, ctx, &stateid, &arg, 
&timeout, gfp_flags); 1674 1646 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 1675 1647 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); 1648 + atomic_dec(&lo->plh_outstanding); 1676 1649 if (IS_ERR(lseg)) { 1677 1650 switch(PTR_ERR(lseg)) { 1678 - case -ERECALLCONFLICT: 1651 + case -EBUSY: 1679 1652 if (time_after(jiffies, giveup)) 1680 1653 lseg = NULL; 1654 + break; 1655 + case -ERECALLCONFLICT: 1656 + /* Huh? We hold no layouts, how is there a recall? */ 1657 + if (first) { 1658 + lseg = NULL; 1659 + break; 1660 + } 1661 + /* Destroy the existing layout and start over */ 1662 + if (time_after(jiffies, giveup)) 1663 + pnfs_destroy_layout(NFS_I(ino)); 1681 1664 /* Fallthrough */ 1682 1665 case -EAGAIN: 1683 - pnfs_put_layout_hdr(lo); 1684 - if (first) 1685 - pnfs_clear_first_layoutget(lo); 1686 - if (lseg) { 1687 - trace_pnfs_update_layout(ino, pos, count, 1688 - iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); 1689 - goto lookup_again; 1690 - } 1691 - /* Fallthrough */ 1666 + break; 1692 1667 default: 1693 1668 if (!nfs_error_is_fatal(PTR_ERR(lseg))) { 1694 1669 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 1695 1670 lseg = NULL; 1696 1671 } 1672 + goto out_put_layout_hdr; 1673 + } 1674 + if (lseg) { 1675 + if (first) 1676 + pnfs_clear_first_layoutget(lo); 1677 + trace_pnfs_update_layout(ino, pos, count, 1678 + iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); 1679 + pnfs_put_layout_hdr(lo); 1680 + goto lookup_again; 1697 1681 } 1698 1682 } else { 1699 1683 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 1700 1684 } 1701 1685 1702 - atomic_dec(&lo->plh_outstanding); 1703 1686 out_put_layout_hdr: 1704 1687 if (first) 1705 1688 pnfs_clear_first_layoutget(lo); ··· 1774 1735 return lseg; 1775 1736 } 1776 1737 1777 - init_lseg(lo, lseg); 1778 - lseg->pls_range = res->range; 1779 - lseg->pls_seq = be32_to_cpu(res->stateid.seqid); 1738 + pnfs_init_lseg(lo, lseg, &res->range, &res->stateid); 1780 1739 1781 1740 spin_lock(&ino->i_lock); 1782 
1741 if (pnfs_layoutgets_blocked(lo)) { ··· 1795 1758 * inode invalid, and don't bother validating the stateid 1796 1759 * sequence number. 1797 1760 */ 1798 - pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); 1761 + pnfs_mark_layout_stateid_invalid(lo, &free_me); 1799 1762 1800 1763 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); 1801 1764 lo->plh_barrier = be32_to_cpu(res->stateid.seqid); 1802 1765 } 1803 1766 1804 - clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 1805 - 1806 1767 pnfs_get_lseg(lseg); 1807 1768 pnfs_layout_insert_lseg(lo, lseg, &free_me); 1769 + if (!pnfs_layout_is_valid(lo)) { 1770 + pnfs_clear_layoutreturn_info(lo); 1771 + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 1772 + } 1773 + 1808 1774 1809 1775 if (res->return_on_close) 1810 1776 set_bit(NFS_LSEG_ROC, &lseg->pls_flags); ··· 1827 1787 pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, 1828 1788 u32 seq) 1829 1789 { 1830 - if (lo->plh_return_iomode == iomode) 1831 - return; 1832 - if (lo->plh_return_iomode != 0) 1790 + if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) 1833 1791 iomode = IOMODE_ANY; 1834 1792 lo->plh_return_iomode = iomode; 1835 1793 set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); 1836 - if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) 1794 + if (seq != 0) { 1795 + WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); 1837 1796 lo->plh_return_seq = seq; 1797 + } 1838 1798 } 1839 1799 1840 1800 /** ··· 1864 1824 assert_spin_locked(&lo->plh_inode->i_lock); 1865 1825 1866 1826 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 1867 - if (should_free_lseg(&lseg->pls_range, return_range)) { 1827 + if (pnfs_match_lseg_recall(lseg, return_range, seq)) { 1868 1828 dprintk("%s: marking lseg %p iomode %d " 1869 1829 "offset %llu length %llu\n", __func__, 1870 1830 lseg, lseg->pls_range.iomode, ··· 1895 1855 bool return_now = false; 1896 1856 1897 1857 
spin_lock(&inode->i_lock); 1898 - pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); 1858 + pnfs_set_plh_return_info(lo, range.iomode, 0); 1899 1859 /* 1900 1860 * mark all matching lsegs so that we are sure to have no live 1901 1861 * segments at hand when sending layoutreturn. See pnfs_put_lseg() 1902 1862 * for how it works. 1903 1863 */ 1904 - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, 1905 - &range, lseg->pls_seq)) { 1864 + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) { 1906 1865 nfs4_stateid stateid; 1907 - enum pnfs_iomode iomode = lo->plh_return_iomode; 1866 + enum pnfs_iomode iomode; 1908 1867 1909 - nfs4_stateid_copy(&stateid, &lo->plh_stateid); 1910 - return_now = pnfs_prepare_layoutreturn(lo); 1868 + return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); 1911 1869 spin_unlock(&inode->i_lock); 1912 1870 if (return_now) 1913 1871 pnfs_send_layoutreturn(lo, &stateid, iomode, false); ··· 2420 2382 nfs_fattr_init(&data->fattr); 2421 2383 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 2422 2384 data->res.fattr = &data->fattr; 2423 - data->args.lastbytewritten = end_pos - 1; 2385 + if (end_pos != 0) 2386 + data->args.lastbytewritten = end_pos - 1; 2387 + else 2388 + data->args.lastbytewritten = U64_MAX; 2424 2389 data->res.server = NFS_SERVER(inode); 2425 2390 2426 2391 if (ld->prepare_layoutcommit) {
+14 -20
fs/nfs/pnfs.h
··· 268 268 struct list_head *tmp_list, 269 269 const struct pnfs_layout_range *recall_range, 270 270 u32 seq); 271 + int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, 272 + struct list_head *lseg_list); 271 273 bool pnfs_roc(struct inode *ino); 272 274 void pnfs_roc_release(struct inode *ino); 273 275 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); ··· 375 373 static inline bool nfs_have_layout(struct inode *inode) 376 374 { 377 375 return NFS_I(inode)->layout != NULL; 376 + } 377 + 378 + static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) 379 + { 380 + return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0; 378 381 } 379 382 380 383 static inline struct nfs4_deviceid_node * ··· 552 545 return 1 + end - offset; 553 546 } 554 547 555 - /** 556 - * pnfs_mark_layout_returned_if_empty - marks the layout as returned 557 - * @lo: layout header 558 - * 559 - * Note: Caller must hold inode->i_lock 560 - */ 561 - static inline void 562 - pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) 563 - { 564 - if (list_empty(&lo->plh_segs)) 565 - set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 566 - } 567 - 568 548 static inline void 569 549 pnfs_copy_range(struct pnfs_layout_range *dst, 570 550 const struct pnfs_layout_range *src) ··· 621 627 { 622 628 return 0; 623 629 } 630 + 631 + static inline bool 632 + pnfs_layoutcommit_outstanding(struct inode *inode) 633 + { 634 + return false; 635 + } 636 + 624 637 625 638 static inline bool 626 639 pnfs_roc(struct inode *ino) ··· 716 715 { 717 716 return false; 718 717 } 719 - 720 - static inline bool 721 - pnfs_layoutcommit_outstanding(struct inode *inode) 722 - { 723 - return false; 724 - } 725 - 726 718 727 719 static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 728 720 {
+10 -3
fs/nfs/pnfs_nfs.c
··· 595 595 } 596 596 597 597 static struct nfs_client *(*get_v3_ds_connect)( 598 - struct nfs_client *mds_clp, 598 + struct nfs_server *mds_srv, 599 599 const struct sockaddr *ds_addr, 600 600 int ds_addrlen, 601 601 int ds_proto, ··· 654 654 rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, 655 655 rpc_clnt_test_and_add_xprt, NULL); 656 656 } else 657 - clp = get_v3_ds_connect(mds_srv->nfs_client, 657 + clp = get_v3_ds_connect(mds_srv, 658 658 (struct sockaddr *)&da->da_addr, 659 659 da->da_addrlen, IPPROTO_TCP, 660 660 timeo, retrans, au_flavor); ··· 690 690 dprintk("%s: DS %s: trying address %s\n", 691 691 __func__, ds->ds_remotestr, da->da_remotestr); 692 692 693 - clp = nfs4_set_ds_client(mds_srv->nfs_client, 693 + clp = nfs4_set_ds_client(mds_srv, 694 694 (struct sockaddr *)&da->da_addr, 695 695 da->da_addrlen, IPPROTO_TCP, 696 696 timeo, retrans, minor_version, ··· 940 940 int 941 941 pnfs_nfs_generic_sync(struct inode *inode, bool datasync) 942 942 { 943 + int ret; 944 + 945 + if (!pnfs_layoutcommit_outstanding(inode)) 946 + return 0; 947 + ret = nfs_commit_inode(inode, FLUSH_SYNC); 948 + if (ret < 0) 949 + return ret; 943 950 if (datasync) 944 951 return 0; 945 952 return pnfs_layoutcommit_inode(inode, true);
+11 -3
fs/nfs/super.c
··· 1684 1684 { 1685 1685 rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; 1686 1686 unsigned int i; 1687 + int use_auth_null = false; 1687 1688 1688 1689 /* 1689 1690 * If the sec= mount option is used, the specified flavor or AUTH_NULL ··· 1692 1691 * 1693 1692 * AUTH_NULL has a special meaning when it's in the server list - it 1694 1693 * means that the server will ignore the rpc creds, so any flavor 1695 - * can be used. 1694 + * can be used but still use the sec= that was specified. 1696 1695 */ 1697 1696 for (i = 0; i < count; i++) { 1698 1697 flavor = server_authlist[i]; 1699 1698 1700 - if (nfs_auth_info_match(&args->auth_info, flavor) || 1701 - flavor == RPC_AUTH_NULL) 1699 + if (nfs_auth_info_match(&args->auth_info, flavor)) 1702 1700 goto out; 1701 + 1702 + if (flavor == RPC_AUTH_NULL) 1703 + use_auth_null = true; 1704 + } 1705 + 1706 + if (use_auth_null) { 1707 + flavor = RPC_AUTH_NULL; 1708 + goto out; 1703 1709 } 1704 1710 1705 1711 dfprintk(MOUNT,
+28 -16
fs/nfs/write.c
··· 625 625 int err; 626 626 627 627 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); 628 - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), 628 + nfs_pageio_init_write(&pgio, inode, 0, 629 629 false, &nfs_async_write_completion_ops); 630 630 err = nfs_do_writepage(page, wbc, &pgio, launder); 631 631 nfs_pageio_complete(&pgio); ··· 657 657 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) 658 658 { 659 659 struct inode *inode = mapping->host; 660 - unsigned long *bitlock = &NFS_I(inode)->flags; 661 660 struct nfs_pageio_descriptor pgio; 662 661 int err; 663 - 664 - /* Stop dirtying of new pages while we sync */ 665 - err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING, 666 - nfs_wait_bit_killable, TASK_KILLABLE); 667 - if (err) 668 - goto out_err; 669 662 670 663 nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); 671 664 ··· 666 673 &nfs_async_write_completion_ops); 667 674 err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); 668 675 nfs_pageio_complete(&pgio); 669 - 670 - clear_bit_unlock(NFS_INO_FLUSHING, bitlock); 671 - smp_mb__after_atomic(); 672 - wake_up_bit(bitlock, NFS_INO_FLUSHING); 673 676 674 677 if (err < 0) 675 678 goto out_err; ··· 1184 1195 /* 1185 1196 * Test if the open context credential key is marked to expire soon. 
1186 1197 */ 1187 - bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx) 1198 + bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) 1188 1199 { 1189 - return rpcauth_cred_key_to_expire(ctx->cred); 1200 + struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth; 1201 + 1202 + return rpcauth_cred_key_to_expire(auth, ctx->cred); 1190 1203 } 1191 1204 1192 1205 /* ··· 1280 1289 dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n", 1281 1290 file, count, (long long)(page_file_offset(page) + offset)); 1282 1291 1292 + if (!count) 1293 + goto out; 1294 + 1283 1295 if (nfs_can_extend_write(file, page, inode)) { 1284 1296 count = max(count + offset, nfs_page_length(page)); 1285 1297 offset = 0; ··· 1293 1299 nfs_set_pageerror(page); 1294 1300 else 1295 1301 __set_page_dirty_nobuffers(page); 1296 - 1302 + out: 1297 1303 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", 1298 1304 status, (long long)i_size_read(inode)); 1299 1305 return status; ··· 1794 1800 1795 1801 /* Okay, COMMIT succeeded, apparently. Check the verifier 1796 1802 * returned by the server against all stored verfs. */ 1797 - if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) { 1803 + if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { 1798 1804 /* We have a match */ 1799 1805 nfs_inode_remove_request(req); 1800 1806 dprintk(" OK\n"); ··· 1916 1922 return ret; 1917 1923 } 1918 1924 EXPORT_SYMBOL_GPL(nfs_write_inode); 1925 + 1926 + /* 1927 + * Wrapper for filemap_write_and_wait_range() 1928 + * 1929 + * Needed for pNFS in order to ensure data becomes visible to the 1930 + * client. 
1931 + */ 1932 + int nfs_filemap_write_and_wait_range(struct address_space *mapping, 1933 + loff_t lstart, loff_t lend) 1934 + { 1935 + int ret; 1936 + 1937 + ret = filemap_write_and_wait_range(mapping, lstart, lend); 1938 + if (ret == 0) 1939 + ret = pnfs_sync_inode(mapping->host, true); 1940 + return ret; 1941 + } 1942 + EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range); 1919 1943 1920 1944 /* 1921 1945 * flush the inode to disk.
+1 -2
include/linux/nfs_fs.h
··· 205 205 #define NFS_INO_STALE (1) /* possible stale inode */ 206 206 #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ 207 207 #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ 208 - #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ 209 208 #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ 210 209 #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ 211 210 #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ 212 211 #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ 213 212 #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ 213 + #define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */ 214 214 215 215 static inline struct nfs_inode *NFS_I(const struct inode *inode) 216 216 { ··· 351 351 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); 352 352 extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); 353 353 extern int nfs_revalidate_mapping_rcu(struct inode *inode); 354 - extern int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping); 355 354 extern int nfs_setattr(struct dentry *, struct iattr *); 356 355 extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); 357 356 extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+2 -3
include/linux/nfs_xdr.h
··· 1596 1596 int (*have_delegation)(struct inode *, fmode_t); 1597 1597 int (*return_delegation)(struct inode *); 1598 1598 struct nfs_client *(*alloc_client) (const struct nfs_client_initdata *); 1599 - struct nfs_client * 1600 - (*init_client) (struct nfs_client *, const struct rpc_timeout *, 1601 - const char *); 1599 + struct nfs_client *(*init_client) (struct nfs_client *, 1600 + const struct nfs_client_initdata *); 1602 1601 void (*free_client) (struct nfs_client *); 1603 1602 struct nfs_server *(*create_server)(struct nfs_mount_info *, struct nfs_subversion *); 1604 1603 struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *,
+7 -2
include/linux/sunrpc/auth.h
··· 37 37 38 38 /* auth_cred ac_flags bits */ 39 39 enum { 40 - RPC_CRED_NO_CRKEY_TIMEOUT = 0, /* underlying cred has no key timeout */ 41 40 RPC_CRED_KEY_EXPIRE_SOON = 1, /* underlying cred key will expire soon */ 42 41 RPC_CRED_NOTIFY_TIMEOUT = 2, /* nofity generic cred when underlying 43 42 key will expire soon */ ··· 81 82 82 83 #define RPCAUTH_CRED_MAGIC 0x0f4aa4f0 83 84 85 + /* rpc_auth au_flags */ 86 + #define RPCAUTH_AUTH_NO_CRKEY_TIMEOUT 0x0001 /* underlying cred has no key timeout */ 87 + 84 88 /* 85 89 * Client authentication handle 86 90 */ ··· 108 106 struct rpc_cred_cache * au_credcache; 109 107 /* per-flavor data */ 110 108 }; 109 + 110 + /* rpc_auth au_flags */ 111 + #define RPCAUTH_AUTH_DATATOUCH 0x00000002 111 112 112 113 struct rpc_auth_create_args { 113 114 rpc_authflavor_t pseudoflavor; ··· 201 196 void rpcauth_clear_credcache(struct rpc_cred_cache *); 202 197 int rpcauth_key_timeout_notify(struct rpc_auth *, 203 198 struct rpc_cred *); 204 - bool rpcauth_cred_key_to_expire(struct rpc_cred *); 199 + bool rpcauth_cred_key_to_expire(struct rpc_auth *, struct rpc_cred *); 205 200 char * rpcauth_stringify_acceptor(struct rpc_cred *); 206 201 207 202 static inline
+2
include/linux/sunrpc/gss_api.h
··· 73 73 rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop, 74 74 u32 service); 75 75 u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor); 76 + bool gss_pseudoflavor_to_datatouch(struct gss_api_mech *, u32 pseudoflavor); 76 77 char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); 77 78 78 79 struct pf_desc { ··· 82 81 u32 service; 83 82 char *name; 84 83 char *auth_domain_name; 84 + bool datatouch; 85 85 }; 86 86 87 87 /* Different mechanisms (e.g., krb5 or spkm3) may implement gss-api, and
+5
include/linux/sunrpc/sched.h
··· 230 230 struct rpc_task *); 231 231 void rpc_wake_up(struct rpc_wait_queue *); 232 232 struct rpc_task *rpc_wake_up_next(struct rpc_wait_queue *); 233 + struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, 234 + struct rpc_wait_queue *, 235 + bool (*)(struct rpc_task *, void *), 236 + void *); 233 237 struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *, 234 238 bool (*)(struct rpc_task *, void *), 235 239 void *); ··· 251 247 int rpc_init_mempool(void); 252 248 void rpc_destroy_mempool(void); 253 249 extern struct workqueue_struct *rpciod_workqueue; 250 + extern struct workqueue_struct *xprtiod_workqueue; 254 251 void rpc_prepare_task(struct rpc_task *task); 255 252 256 253 static inline int rpc_wait_for_completion_task(struct rpc_task *task)
+1
include/linux/sunrpc/xprtsock.h
··· 80 80 #define TCP_RPC_REPLY (1UL << 6) 81 81 82 82 #define XPRT_SOCK_CONNECTING 1U 83 + #define XPRT_SOCK_DATA_READY (2) 83 84 84 85 #endif /* __KERNEL__ */ 85 86
+4 -4
net/sunrpc/auth.c
··· 51 51 ret = kstrtoul(val, 0, &num); 52 52 if (ret == -EINVAL) 53 53 goto out_inval; 54 - nbits = fls(num); 55 - if (num > (1U << nbits)) 56 - nbits++; 54 + nbits = fls(num - 1); 57 55 if (nbits > MAX_HASHTABLE_BITS || nbits < 2) 58 56 goto out_inval; 59 57 *(unsigned int *)kp->arg = nbits; ··· 357 359 EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); 358 360 359 361 bool 360 - rpcauth_cred_key_to_expire(struct rpc_cred *cred) 362 + rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) 361 363 { 364 + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) 365 + return false; 362 366 if (!cred->cr_ops->crkey_to_expire) 363 367 return false; 364 368 return cred->cr_ops->crkey_to_expire(cred);
+1 -8
net/sunrpc/auth_generic.c
··· 224 224 225 225 226 226 /* Fast track for non crkey_timeout (no key) underlying credentials */ 227 - if (test_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags)) 227 + if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) 228 228 return 0; 229 229 230 230 /* Fast track for the normal case */ ··· 235 235 tcred = auth->au_ops->lookup_cred(auth, acred, 0); 236 236 if (IS_ERR(tcred)) 237 237 return -EACCES; 238 - 239 - if (!tcred->cr_ops->crkey_timeout) { 240 - set_bit(RPC_CRED_NO_CRKEY_TIMEOUT, &acred->ac_flags); 241 - ret = 0; 242 - goto out_put; 243 - } 244 238 245 239 /* Test for the almost error case */ 246 240 ret = tcred->cr_ops->crkey_timeout(tcred); ··· 251 257 set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); 252 258 } 253 259 254 - out_put: 255 260 put_rpccred(tcred); 256 261 return ret; 257 262 }
+3
net/sunrpc/auth_gss/auth_gss.c
··· 1015 1015 auth = &gss_auth->rpc_auth; 1016 1016 auth->au_cslack = GSS_CRED_SLACK >> 2; 1017 1017 auth->au_rslack = GSS_VERF_SLACK >> 2; 1018 + auth->au_flags = 0; 1018 1019 auth->au_ops = &authgss_ops; 1019 1020 auth->au_flavor = flavor; 1021 + if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) 1022 + auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; 1020 1023 atomic_set(&auth->au_count, 1); 1021 1024 kref_init(&gss_auth->kref); 1022 1025
+2
net/sunrpc/auth_gss/gss_krb5_mech.c
··· 745 745 .qop = GSS_C_QOP_DEFAULT, 746 746 .service = RPC_GSS_SVC_INTEGRITY, 747 747 .name = "krb5i", 748 + .datatouch = true, 748 749 }, 749 750 [2] = { 750 751 .pseudoflavor = RPC_AUTH_GSS_KRB5P, 751 752 .qop = GSS_C_QOP_DEFAULT, 752 753 .service = RPC_GSS_SVC_PRIVACY, 753 754 .name = "krb5p", 755 + .datatouch = true, 754 756 }, 755 757 }; 756 758
+12
net/sunrpc/auth_gss/gss_mech_switch.c
··· 361 361 } 362 362 EXPORT_SYMBOL(gss_pseudoflavor_to_service); 363 363 364 + bool 365 + gss_pseudoflavor_to_datatouch(struct gss_api_mech *gm, u32 pseudoflavor) 366 + { 367 + int i; 368 + 369 + for (i = 0; i < gm->gm_pf_num; i++) { 370 + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) 371 + return gm->gm_pfs[i].datatouch; 372 + } 373 + return false; 374 + } 375 + 364 376 char * 365 377 gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) 366 378 {
+1
net/sunrpc/auth_null.c
··· 115 115 struct rpc_auth null_auth = { 116 116 .au_cslack = NUL_CALLSLACK, 117 117 .au_rslack = NUL_REPLYSLACK, 118 + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, 118 119 .au_ops = &authnull_ops, 119 120 .au_flavor = RPC_AUTH_NULL, 120 121 .au_count = ATOMIC_INIT(0),
+1
net/sunrpc/auth_unix.c
··· 228 228 struct rpc_auth unix_auth = { 229 229 .au_cslack = UNX_CALLSLACK, 230 230 .au_rslack = NUL_REPLYSLACK, 231 + .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, 231 232 .au_ops = &authunix_ops, 232 233 .au_flavor = RPC_AUTH_UNIX, 233 234 .au_count = ATOMIC_INIT(0),
+1 -1
net/sunrpc/clnt.c
··· 2577 2577 kfree(data); 2578 2578 } 2579 2579 2580 - const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = { 2580 + static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { 2581 2581 .rpc_call_done = rpc_cb_add_xprt_done, 2582 2582 .rpc_release = rpc_cb_add_xprt_release, 2583 2583 };
+53 -14
net/sunrpc/sched.c
··· 54 54 /* 55 55 * rpciod-related stuff 56 56 */ 57 - struct workqueue_struct *rpciod_workqueue; 57 + struct workqueue_struct *rpciod_workqueue __read_mostly; 58 + struct workqueue_struct *xprtiod_workqueue __read_mostly; 58 59 59 60 /* 60 61 * Disable the timer for a given RPC task. Should be called with ··· 330 329 * lockless RPC_IS_QUEUED() test) before we've had a chance to test 331 330 * the RPC_TASK_RUNNING flag. 332 331 */ 333 - static void rpc_make_runnable(struct rpc_task *task) 332 + static void rpc_make_runnable(struct workqueue_struct *wq, 333 + struct rpc_task *task) 334 334 { 335 335 bool need_wakeup = !rpc_test_and_set_running(task); 336 336 ··· 340 338 return; 341 339 if (RPC_IS_ASYNC(task)) { 342 340 INIT_WORK(&task->u.tk_work, rpc_async_schedule); 343 - queue_work(rpciod_workqueue, &task->u.tk_work); 341 + queue_work(wq, &task->u.tk_work); 344 342 } else 345 343 wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); 346 344 } ··· 409 407 EXPORT_SYMBOL_GPL(rpc_sleep_on_priority); 410 408 411 409 /** 412 - * __rpc_do_wake_up_task - wake up a single rpc_task 410 + * __rpc_do_wake_up_task_on_wq - wake up a single rpc_task 411 + * @wq: workqueue on which to run task 413 412 * @queue: wait queue 414 413 * @task: task to be woken up 415 414 * 416 415 * Caller must hold queue->lock, and have cleared the task queued flag. 
417 416 */ 418 - static void __rpc_do_wake_up_task(struct rpc_wait_queue *queue, struct rpc_task *task) 417 + static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, 418 + struct rpc_wait_queue *queue, 419 + struct rpc_task *task) 419 420 { 420 421 dprintk("RPC: %5u __rpc_wake_up_task (now %lu)\n", 421 422 task->tk_pid, jiffies); ··· 433 428 434 429 __rpc_remove_wait_queue(queue, task); 435 430 436 - rpc_make_runnable(task); 431 + rpc_make_runnable(wq, task); 437 432 438 433 dprintk("RPC: __rpc_wake_up_task done\n"); 439 434 } ··· 441 436 /* 442 437 * Wake up a queued task while the queue lock is being held 443 438 */ 444 - static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) 439 + static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, 440 + struct rpc_wait_queue *queue, struct rpc_task *task) 445 441 { 446 442 if (RPC_IS_QUEUED(task)) { 447 443 smp_rmb(); 448 444 if (task->tk_waitqueue == queue) 449 - __rpc_do_wake_up_task(queue, task); 445 + __rpc_do_wake_up_task_on_wq(wq, queue, task); 450 446 } 447 + } 448 + 449 + /* 450 + * Wake up a queued task while the queue lock is being held 451 + */ 452 + static void rpc_wake_up_task_queue_locked(struct rpc_wait_queue *queue, struct rpc_task *task) 453 + { 454 + rpc_wake_up_task_on_wq_queue_locked(rpciod_workqueue, queue, task); 451 455 } 452 456 453 457 /* ··· 532 518 /* 533 519 * Wake up the first task on the wait queue. 
534 520 */ 535 - struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, 521 + struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, 522 + struct rpc_wait_queue *queue, 536 523 bool (*func)(struct rpc_task *, void *), void *data) 537 524 { 538 525 struct rpc_task *task = NULL; ··· 544 529 task = __rpc_find_next_queued(queue); 545 530 if (task != NULL) { 546 531 if (func(task, data)) 547 - rpc_wake_up_task_queue_locked(queue, task); 532 + rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); 548 533 else 549 534 task = NULL; 550 535 } 551 536 spin_unlock_bh(&queue->lock); 552 537 553 538 return task; 539 + } 540 + 541 + /* 542 + * Wake up the first task on the wait queue. 543 + */ 544 + struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *queue, 545 + bool (*func)(struct rpc_task *, void *), void *data) 546 + { 547 + return rpc_wake_up_first_on_wq(rpciod_workqueue, queue, func, data); 554 548 } 555 549 EXPORT_SYMBOL_GPL(rpc_wake_up_first); 556 550 ··· 838 814 bool is_async = RPC_IS_ASYNC(task); 839 815 840 816 rpc_set_active(task); 841 - rpc_make_runnable(task); 817 + rpc_make_runnable(rpciod_workqueue, task); 842 818 if (!is_async) 843 819 __rpc_execute(task); 844 820 } ··· 1095 1071 * Create the rpciod thread and wait for it to start. 
1096 1072 */ 1097 1073 dprintk("RPC: creating workqueue rpciod\n"); 1098 - /* Note: highpri because network receive is latency sensitive */ 1099 - wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); 1074 + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); 1075 + if (!wq) 1076 + goto out_failed; 1100 1077 rpciod_workqueue = wq; 1101 - return rpciod_workqueue != NULL; 1078 + /* Note: highpri because network receive is latency sensitive */ 1079 + wq = alloc_workqueue("xprtiod", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); 1080 + if (!wq) 1081 + goto free_rpciod; 1082 + xprtiod_workqueue = wq; 1083 + return 1; 1084 + free_rpciod: 1085 + wq = rpciod_workqueue; 1086 + rpciod_workqueue = NULL; 1087 + destroy_workqueue(wq); 1088 + out_failed: 1089 + return 0; 1102 1090 } 1103 1091 1104 1092 static void rpciod_stop(void) ··· 1123 1087 1124 1088 wq = rpciod_workqueue; 1125 1089 rpciod_workqueue = NULL; 1090 + destroy_workqueue(wq); 1091 + wq = xprtiod_workqueue; 1092 + xprtiod_workqueue = NULL; 1126 1093 destroy_workqueue(wq); 1127 1094 } 1128 1095
+7 -1
net/sunrpc/svc.c
··· 1188 1188 *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 1189 1189 1190 1190 /* Encode reply */ 1191 - if (test_bit(RQ_DROPME, &rqstp->rq_flags)) { 1191 + if (*statp == rpc_drop_reply || 1192 + test_bit(RQ_DROPME, &rqstp->rq_flags)) { 1192 1193 if (procp->pc_release) 1193 1194 procp->pc_release(rqstp, NULL, rqstp->rq_resp); 1194 1195 goto dropit; 1196 + } 1197 + if (*statp == rpc_autherr_badcred) { 1198 + if (procp->pc_release) 1199 + procp->pc_release(rqstp, NULL, rqstp->rq_resp); 1200 + goto err_bad_auth; 1195 1201 } 1196 1202 if (*statp == rpc_success && 1197 1203 (xdr = procp->pc_encode) &&
+8 -6
net/sunrpc/xprt.c
··· 220 220 clear_bit(XPRT_LOCKED, &xprt->state); 221 221 smp_mb__after_atomic(); 222 222 } else 223 - queue_work(rpciod_workqueue, &xprt->task_cleanup); 223 + queue_work(xprtiod_workqueue, &xprt->task_cleanup); 224 224 } 225 225 226 226 /* ··· 295 295 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) 296 296 return; 297 297 298 - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_func, xprt)) 298 + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, 299 + __xprt_lock_write_func, xprt)) 299 300 return; 300 301 xprt_clear_locked(xprt); 301 302 } ··· 325 324 return; 326 325 if (RPCXPRT_CONGESTED(xprt)) 327 326 goto out_unlock; 328 - if (rpc_wake_up_first(&xprt->sending, __xprt_lock_write_cong_func, xprt)) 327 + if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, 328 + __xprt_lock_write_cong_func, xprt)) 329 329 return; 330 330 out_unlock: 331 331 xprt_clear_locked(xprt); ··· 647 645 set_bit(XPRT_CLOSE_WAIT, &xprt->state); 648 646 /* Try to schedule an autoclose RPC call */ 649 647 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) 650 - queue_work(rpciod_workqueue, &xprt->task_cleanup); 648 + queue_work(xprtiod_workqueue, &xprt->task_cleanup); 651 649 xprt_wake_pending_tasks(xprt, -EAGAIN); 652 650 spin_unlock_bh(&xprt->transport_lock); 653 651 } ··· 674 672 set_bit(XPRT_CLOSE_WAIT, &xprt->state); 675 673 /* Try to schedule an autoclose RPC call */ 676 674 if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) 677 - queue_work(rpciod_workqueue, &xprt->task_cleanup); 675 + queue_work(xprtiod_workqueue, &xprt->task_cleanup); 678 676 xprt_wake_pending_tasks(xprt, -EAGAIN); 679 677 out: 680 678 spin_unlock_bh(&xprt->transport_lock); ··· 691 689 if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) 692 690 goto out_abort; 693 691 spin_unlock(&xprt->transport_lock); 694 - queue_work(rpciod_workqueue, &xprt->task_cleanup); 692 + queue_work(xprtiod_workqueue, &xprt->task_cleanup); 695 693 return; 696 694 out_abort: 697 695 
spin_unlock(&xprt->transport_lock);
+3 -5
net/sunrpc/xprtmultipath.c
··· 271 271 xprt_switch_find_xprt_t find_next) 272 272 { 273 273 struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); 274 - struct list_head *head; 275 274 276 275 if (xps == NULL) 277 276 return NULL; 278 - head = &xps->xps_xprt_list; 279 - if (xps->xps_nxprts < 2) 280 - return xprt_switch_find_first_entry(head); 281 - return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next); 277 + return xprt_switch_set_next_cursor(&xps->xps_xprt_list, 278 + &xpi->xpi_cursor, 279 + find_next); 282 280 } 283 281 284 282 static
+1 -1
net/sunrpc/xprtrdma/Makefile
··· 1 1 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o 2 2 3 3 rpcrdma-y := transport.o rpc_rdma.o verbs.o \ 4 - fmr_ops.o frwr_ops.o physical_ops.o \ 4 + fmr_ops.o frwr_ops.o \ 5 5 svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ 6 6 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ 7 7 module.o
+177 -203
net/sunrpc/xprtrdma/fmr_ops.c
··· 19 19 * verb (fmr_op_unmap). 20 20 */ 21 21 22 - /* Transport recovery 23 - * 24 - * After a transport reconnect, fmr_op_map re-uses the MR already 25 - * allocated for the RPC, but generates a fresh rkey then maps the 26 - * MR again. This process is synchronous. 27 - */ 28 - 29 22 #include "xprt_rdma.h" 30 23 31 24 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) ··· 28 35 /* Maximum scatter/gather per FMR */ 29 36 #define RPCRDMA_MAX_FMR_SGES (64) 30 37 31 - static struct workqueue_struct *fmr_recovery_wq; 38 + /* Access mode of externally registered pages */ 39 + enum { 40 + RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | 41 + IB_ACCESS_REMOTE_READ, 42 + }; 32 43 33 - #define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) 34 - 35 - int 36 - fmr_alloc_recovery_wq(void) 44 + bool 45 + fmr_is_supported(struct rpcrdma_ia *ia) 37 46 { 38 - fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); 39 - return !fmr_recovery_wq ? -ENOMEM : 0; 47 + if (!ia->ri_device->alloc_fmr) { 48 + pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", 49 + ia->ri_device->name); 50 + return false; 51 + } 52 + return true; 40 53 } 41 54 42 - void 43 - fmr_destroy_recovery_wq(void) 55 + static int 56 + fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *mw) 44 57 { 45 - struct workqueue_struct *wq; 58 + static struct ib_fmr_attr fmr_attr = { 59 + .max_pages = RPCRDMA_MAX_FMR_SGES, 60 + .max_maps = 1, 61 + .page_shift = PAGE_SHIFT 62 + }; 46 63 47 - if (!fmr_recovery_wq) 48 - return; 64 + mw->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, 65 + sizeof(u64), GFP_KERNEL); 66 + if (!mw->fmr.fm_physaddrs) 67 + goto out_free; 49 68 50 - wq = fmr_recovery_wq; 51 - fmr_recovery_wq = NULL; 52 - destroy_workqueue(wq); 69 + mw->mw_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, 70 + sizeof(*mw->mw_sg), GFP_KERNEL); 71 + if (!mw->mw_sg) 72 + goto out_free; 73 + 74 + sg_init_table(mw->mw_sg, RPCRDMA_MAX_FMR_SGES); 75 + 76 + mw->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, 77 + &fmr_attr); 
78 + if (IS_ERR(mw->fmr.fm_mr)) 79 + goto out_fmr_err; 80 + 81 + return 0; 82 + 83 + out_fmr_err: 84 + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, 85 + PTR_ERR(mw->fmr.fm_mr)); 86 + 87 + out_free: 88 + kfree(mw->mw_sg); 89 + kfree(mw->fmr.fm_physaddrs); 90 + return -ENOMEM; 53 91 } 54 92 55 93 static int 56 94 __fmr_unmap(struct rpcrdma_mw *mw) 57 95 { 58 96 LIST_HEAD(l); 97 + int rc; 59 98 60 - list_add(&mw->fmr.fmr->list, &l); 61 - return ib_unmap_fmr(&l); 99 + list_add(&mw->fmr.fm_mr->list, &l); 100 + rc = ib_unmap_fmr(&l); 101 + list_del_init(&mw->fmr.fm_mr->list); 102 + return rc; 62 103 } 63 104 64 - /* Deferred reset of a single FMR. Generate a fresh rkey by 65 - * replacing the MR. There's no recovery if this fails. 105 + static void 106 + fmr_op_release_mr(struct rpcrdma_mw *r) 107 + { 108 + LIST_HEAD(unmap_list); 109 + int rc; 110 + 111 + /* Ensure MW is not on any rl_registered list */ 112 + if (!list_empty(&r->mw_list)) 113 + list_del(&r->mw_list); 114 + 115 + kfree(r->fmr.fm_physaddrs); 116 + kfree(r->mw_sg); 117 + 118 + /* In case this one was left mapped, try to unmap it 119 + * to prevent dealloc_fmr from failing with EBUSY 120 + */ 121 + rc = __fmr_unmap(r); 122 + if (rc) 123 + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", 124 + r, rc); 125 + 126 + rc = ib_dealloc_fmr(r->fmr.fm_mr); 127 + if (rc) 128 + pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", 129 + r, rc); 130 + 131 + kfree(r); 132 + } 133 + 134 + /* Reset of a single FMR. 
66 135 */ 67 136 static void 68 - __fmr_recovery_worker(struct work_struct *work) 137 + fmr_op_recover_mr(struct rpcrdma_mw *mw) 69 138 { 70 - struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, 71 - mw_work); 72 139 struct rpcrdma_xprt *r_xprt = mw->mw_xprt; 140 + int rc; 73 141 74 - __fmr_unmap(mw); 142 + /* ORDER: invalidate first */ 143 + rc = __fmr_unmap(mw); 144 + 145 + /* ORDER: then DMA unmap */ 146 + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, 147 + mw->mw_sg, mw->mw_nents, mw->mw_dir); 148 + if (rc) 149 + goto out_release; 150 + 75 151 rpcrdma_put_mw(r_xprt, mw); 152 + r_xprt->rx_stats.mrs_recovered++; 76 153 return; 77 - } 78 154 79 - /* A broken MR was discovered in a context that can't sleep. 80 - * Defer recovery to the recovery worker. 81 - */ 82 - static void 83 - __fmr_queue_recovery(struct rpcrdma_mw *mw) 84 - { 85 - INIT_WORK(&mw->mw_work, __fmr_recovery_worker); 86 - queue_work(fmr_recovery_wq, &mw->mw_work); 155 + out_release: 156 + pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mw); 157 + r_xprt->rx_stats.mrs_orphaned++; 158 + 159 + spin_lock(&r_xprt->rx_buf.rb_mwlock); 160 + list_del(&mw->mw_all); 161 + spin_unlock(&r_xprt->rx_buf.rb_mwlock); 162 + 163 + fmr_op_release_mr(mw); 87 164 } 88 165 89 166 static int ··· 175 112 RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); 176 113 } 177 114 178 - static int 179 - fmr_op_init(struct rpcrdma_xprt *r_xprt) 180 - { 181 - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 182 - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; 183 - struct ib_fmr_attr fmr_attr = { 184 - .max_pages = RPCRDMA_MAX_FMR_SGES, 185 - .max_maps = 1, 186 - .page_shift = PAGE_SHIFT 187 - }; 188 - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; 189 - struct rpcrdma_mw *r; 190 - int i, rc; 191 - 192 - spin_lock_init(&buf->rb_mwlock); 193 - INIT_LIST_HEAD(&buf->rb_mws); 194 - INIT_LIST_HEAD(&buf->rb_all); 195 - 196 - i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1); 197 - i += 2; /* head + 
tail */ 198 - i *= buf->rb_max_requests; /* one set for each RPC slot */ 199 - dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); 200 - 201 - rc = -ENOMEM; 202 - while (i--) { 203 - r = kzalloc(sizeof(*r), GFP_KERNEL); 204 - if (!r) 205 - goto out; 206 - 207 - r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES * 208 - sizeof(u64), GFP_KERNEL); 209 - if (!r->fmr.physaddrs) 210 - goto out_free; 211 - 212 - r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); 213 - if (IS_ERR(r->fmr.fmr)) 214 - goto out_fmr_err; 215 - 216 - r->mw_xprt = r_xprt; 217 - list_add(&r->mw_list, &buf->rb_mws); 218 - list_add(&r->mw_all, &buf->rb_all); 219 - } 220 - return 0; 221 - 222 - out_fmr_err: 223 - rc = PTR_ERR(r->fmr.fmr); 224 - dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); 225 - kfree(r->fmr.physaddrs); 226 - out_free: 227 - kfree(r); 228 - out: 229 - return rc; 230 - } 231 - 232 115 /* Use the ib_map_phys_fmr() verb to register a memory region 233 116 * for remote access via RDMA READ or RDMA WRITE. 
234 117 */ 235 118 static int 236 119 fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 237 - int nsegs, bool writing) 120 + int nsegs, bool writing, struct rpcrdma_mw **out) 238 121 { 239 - struct rpcrdma_ia *ia = &r_xprt->rx_ia; 240 - struct ib_device *device = ia->ri_device; 241 - enum dma_data_direction direction = rpcrdma_data_dir(writing); 242 122 struct rpcrdma_mr_seg *seg1 = seg; 243 123 int len, pageoff, i, rc; 244 124 struct rpcrdma_mw *mw; 125 + u64 *dma_pages; 245 126 246 - mw = seg1->rl_mw; 247 - seg1->rl_mw = NULL; 248 - if (!mw) { 249 - mw = rpcrdma_get_mw(r_xprt); 250 - if (!mw) 251 - return -ENOMEM; 252 - } else { 253 - /* this is a retransmit; generate a fresh rkey */ 254 - rc = __fmr_unmap(mw); 255 - if (rc) 256 - return rc; 257 - } 127 + mw = rpcrdma_get_mw(r_xprt); 128 + if (!mw) 129 + return -ENOBUFS; 258 130 259 131 pageoff = offset_in_page(seg1->mr_offset); 260 132 seg1->mr_offset -= pageoff; /* start of page */ ··· 198 200 if (nsegs > RPCRDMA_MAX_FMR_SGES) 199 201 nsegs = RPCRDMA_MAX_FMR_SGES; 200 202 for (i = 0; i < nsegs;) { 201 - rpcrdma_map_one(device, seg, direction); 202 - mw->fmr.physaddrs[i] = seg->mr_dma; 203 + if (seg->mr_page) 204 + sg_set_page(&mw->mw_sg[i], 205 + seg->mr_page, 206 + seg->mr_len, 207 + offset_in_page(seg->mr_offset)); 208 + else 209 + sg_set_buf(&mw->mw_sg[i], seg->mr_offset, 210 + seg->mr_len); 203 211 len += seg->mr_len; 204 212 ++seg; 205 213 ++i; ··· 214 210 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 215 211 break; 216 212 } 213 + mw->mw_nents = i; 214 + mw->mw_dir = rpcrdma_data_dir(writing); 215 + if (i == 0) 216 + goto out_dmamap_err; 217 217 218 - rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs, 219 - i, seg1->mr_dma); 218 + if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device, 219 + mw->mw_sg, mw->mw_nents, mw->mw_dir)) 220 + goto out_dmamap_err; 221 + 222 + for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) 223 + dma_pages[i] = sg_dma_address(&mw->mw_sg[i]); 224 
+ rc = ib_map_phys_fmr(mw->fmr.fm_mr, dma_pages, mw->mw_nents, 225 + dma_pages[0]); 220 226 if (rc) 221 227 goto out_maperr; 222 228 223 - seg1->rl_mw = mw; 224 - seg1->mr_rkey = mw->fmr.fmr->rkey; 225 - seg1->mr_base = seg1->mr_dma + pageoff; 226 - seg1->mr_nsegs = i; 227 - seg1->mr_len = len; 228 - return i; 229 + mw->mw_handle = mw->fmr.fm_mr->rkey; 230 + mw->mw_length = len; 231 + mw->mw_offset = dma_pages[0] + pageoff; 232 + 233 + *out = mw; 234 + return mw->mw_nents; 235 + 236 + out_dmamap_err: 237 + pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", 238 + mw->mw_sg, mw->mw_nents); 239 + rpcrdma_defer_mr_recovery(mw); 240 + return -EIO; 229 241 230 242 out_maperr: 231 - dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", 232 - __func__, len, (unsigned long long)seg1->mr_dma, 233 - pageoff, i, rc); 234 - while (i--) 235 - rpcrdma_unmap_one(device, --seg); 236 - return rc; 237 - } 238 - 239 - static void 240 - __fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) 241 - { 242 - struct ib_device *device = r_xprt->rx_ia.ri_device; 243 - int nsegs = seg->mr_nsegs; 244 - 245 - while (nsegs--) 246 - rpcrdma_unmap_one(device, seg++); 243 + pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", 244 + len, (unsigned long long)dma_pages[0], 245 + pageoff, mw->mw_nents, rc); 246 + rpcrdma_defer_mr_recovery(mw); 247 + return -EIO; 247 248 } 248 249 249 250 /* Invalidate all memory regions that were registered for "req". 250 251 * 251 252 * Sleeps until it is safe for the host CPU to access the 252 253 * previously mapped memory regions. 254 + * 255 + * Caller ensures that req->rl_registered is not empty. 
253 256 */ 254 257 static void 255 258 fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 256 259 { 257 - struct rpcrdma_mr_seg *seg; 258 - unsigned int i, nchunks; 259 - struct rpcrdma_mw *mw; 260 + struct rpcrdma_mw *mw, *tmp; 260 261 LIST_HEAD(unmap_list); 261 262 int rc; 262 263 ··· 270 261 /* ORDER: Invalidate all of the req's MRs first 271 262 * 272 263 * ib_unmap_fmr() is slow, so use a single call instead 273 - * of one call per mapped MR. 264 + * of one call per mapped FMR. 274 265 */ 275 - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 276 - seg = &req->rl_segments[i]; 277 - mw = seg->rl_mw; 278 - 279 - list_add(&mw->fmr.fmr->list, &unmap_list); 280 - 281 - i += seg->mr_nsegs; 282 - } 266 + list_for_each_entry(mw, &req->rl_registered, mw_list) 267 + list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); 283 268 rc = ib_unmap_fmr(&unmap_list); 284 269 if (rc) 285 - pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc); 270 + goto out_reset; 286 271 287 272 /* ORDER: Now DMA unmap all of the req's MRs, and return 288 273 * them to the free MW list. 
289 274 */ 290 - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 291 - seg = &req->rl_segments[i]; 292 - 293 - __fmr_dma_unmap(r_xprt, seg); 294 - rpcrdma_put_mw(r_xprt, seg->rl_mw); 295 - 296 - i += seg->mr_nsegs; 297 - seg->mr_nsegs = 0; 298 - seg->rl_mw = NULL; 275 + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { 276 + list_del_init(&mw->mw_list); 277 + list_del_init(&mw->fmr.fm_mr->list); 278 + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, 279 + mw->mw_sg, mw->mw_nents, mw->mw_dir); 280 + rpcrdma_put_mw(r_xprt, mw); 299 281 } 300 282 301 - req->rl_nchunks = 0; 283 + return; 284 + 285 + out_reset: 286 + pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); 287 + 288 + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { 289 + list_del_init(&mw->fmr.fm_mr->list); 290 + fmr_op_recover_mr(mw); 291 + } 302 292 } 303 293 304 294 /* Use a slow, safe mechanism to invalidate all memory regions 305 295 * that were registered for "req". 306 - * 307 - * In the asynchronous case, DMA unmapping occurs first here 308 - * because the rpcrdma_mr_seg is released immediately after this 309 - * call. It's contents won't be available in __fmr_dma_unmap later. 310 - * FIXME. 
311 296 */ 312 297 static void 313 298 fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 314 299 bool sync) 315 300 { 316 - struct rpcrdma_mr_seg *seg; 317 301 struct rpcrdma_mw *mw; 318 - unsigned int i; 319 302 320 - for (i = 0; req->rl_nchunks; req->rl_nchunks--) { 321 - seg = &req->rl_segments[i]; 322 - mw = seg->rl_mw; 303 + while (!list_empty(&req->rl_registered)) { 304 + mw = list_first_entry(&req->rl_registered, 305 + struct rpcrdma_mw, mw_list); 306 + list_del_init(&mw->mw_list); 323 307 324 - if (sync) { 325 - /* ORDER */ 326 - __fmr_unmap(mw); 327 - __fmr_dma_unmap(r_xprt, seg); 328 - rpcrdma_put_mw(r_xprt, mw); 329 - } else { 330 - __fmr_dma_unmap(r_xprt, seg); 331 - __fmr_queue_recovery(mw); 332 - } 333 - 334 - i += seg->mr_nsegs; 335 - seg->mr_nsegs = 0; 336 - seg->rl_mw = NULL; 337 - } 338 - } 339 - 340 - static void 341 - fmr_op_destroy(struct rpcrdma_buffer *buf) 342 - { 343 - struct rpcrdma_mw *r; 344 - int rc; 345 - 346 - while (!list_empty(&buf->rb_all)) { 347 - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); 348 - list_del(&r->mw_all); 349 - kfree(r->fmr.physaddrs); 350 - 351 - rc = ib_dealloc_fmr(r->fmr.fmr); 352 - if (rc) 353 - dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", 354 - __func__, rc); 355 - 356 - kfree(r); 308 + if (sync) 309 + fmr_op_recover_mr(mw); 310 + else 311 + rpcrdma_defer_mr_recovery(mw); 357 312 } 358 313 } 359 314 ··· 325 352 .ro_map = fmr_op_map, 326 353 .ro_unmap_sync = fmr_op_unmap_sync, 327 354 .ro_unmap_safe = fmr_op_unmap_safe, 355 + .ro_recover_mr = fmr_op_recover_mr, 328 356 .ro_open = fmr_op_open, 329 357 .ro_maxpages = fmr_op_maxpages, 330 - .ro_init = fmr_op_init, 331 - .ro_destroy = fmr_op_destroy, 358 + .ro_init_mr = fmr_op_init_mr, 359 + .ro_release_mr = fmr_op_release_mr, 332 360 .ro_displayname = "fmr", 333 361 };
+143 -232
net/sunrpc/xprtrdma/frwr_ops.c
··· 73 73 # define RPCDBG_FACILITY RPCDBG_TRANS 74 74 #endif 75 75 76 - static struct workqueue_struct *frwr_recovery_wq; 77 - 78 - #define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM) 79 - 80 - int 81 - frwr_alloc_recovery_wq(void) 76 + bool 77 + frwr_is_supported(struct rpcrdma_ia *ia) 82 78 { 83 - frwr_recovery_wq = alloc_workqueue("frwr_recovery", 84 - FRWR_RECOVERY_WQ_FLAGS, 0); 85 - return !frwr_recovery_wq ? -ENOMEM : 0; 79 + struct ib_device_attr *attrs = &ia->ri_device->attrs; 80 + 81 + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) 82 + goto out_not_supported; 83 + if (attrs->max_fast_reg_page_list_len == 0) 84 + goto out_not_supported; 85 + return true; 86 + 87 + out_not_supported: 88 + pr_info("rpcrdma: 'frwr' mode is not supported by device %s\n", 89 + ia->ri_device->name); 90 + return false; 86 91 } 87 92 88 - void 89 - frwr_destroy_recovery_wq(void) 93 + static int 94 + frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) 90 95 { 91 - struct workqueue_struct *wq; 96 + unsigned int depth = ia->ri_max_frmr_depth; 97 + struct rpcrdma_frmr *f = &r->frmr; 98 + int rc; 92 99 93 - if (!frwr_recovery_wq) 94 - return; 100 + f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, depth); 101 + if (IS_ERR(f->fr_mr)) 102 + goto out_mr_err; 95 103 96 - wq = frwr_recovery_wq; 97 - frwr_recovery_wq = NULL; 98 - destroy_workqueue(wq); 104 + r->mw_sg = kcalloc(depth, sizeof(*r->mw_sg), GFP_KERNEL); 105 + if (!r->mw_sg) 106 + goto out_list_err; 107 + 108 + sg_init_table(r->mw_sg, depth); 109 + init_completion(&f->fr_linv_done); 110 + return 0; 111 + 112 + out_mr_err: 113 + rc = PTR_ERR(f->fr_mr); 114 + dprintk("RPC: %s: ib_alloc_mr status %i\n", 115 + __func__, rc); 116 + return rc; 117 + 118 + out_list_err: 119 + rc = -ENOMEM; 120 + dprintk("RPC: %s: sg allocation failure\n", 121 + __func__); 122 + ib_dereg_mr(f->fr_mr); 123 + return rc; 124 + } 125 + 126 + static void 127 + frwr_op_release_mr(struct rpcrdma_mw *r) 128 + { 129 + int rc; 
130 + 131 + /* Ensure MW is not on any rl_registered list */ 132 + if (!list_empty(&r->mw_list)) 133 + list_del(&r->mw_list); 134 + 135 + rc = ib_dereg_mr(r->frmr.fr_mr); 136 + if (rc) 137 + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", 138 + r, rc); 139 + kfree(r->mw_sg); 140 + kfree(r); 99 141 } 100 142 101 143 static int ··· 166 124 return 0; 167 125 } 168 126 169 - static void 170 - __frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) 171 - { 172 - struct rpcrdma_ia *ia = &r_xprt->rx_ia; 173 - struct rpcrdma_frmr *f = &mw->frmr; 174 - int rc; 175 - 176 - rc = __frwr_reset_mr(ia, mw); 177 - ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir); 178 - if (rc) 179 - return; 180 - 181 - rpcrdma_put_mw(r_xprt, mw); 182 - } 183 - 184 - /* Deferred reset of a single FRMR. Generate a fresh rkey by 185 - * replacing the MR. 127 + /* Reset of a single FRMR. Generate a fresh rkey by replacing the MR. 186 128 * 187 129 * There's no recovery if this fails. The FRMR is abandoned, but 188 130 * remains in rb_all. It will be cleaned up when the transport is 189 131 * destroyed. 190 132 */ 191 133 static void 192 - __frwr_recovery_worker(struct work_struct *work) 134 + frwr_op_recover_mr(struct rpcrdma_mw *mw) 193 135 { 194 - struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, 195 - mw_work); 196 - 197 - __frwr_reset_and_unmap(r->mw_xprt, r); 198 - return; 199 - } 200 - 201 - /* A broken MR was discovered in a context that can't sleep. 202 - * Defer recovery to the recovery worker. 
203 - */ 204 - static void 205 - __frwr_queue_recovery(struct rpcrdma_mw *r) 206 - { 207 - INIT_WORK(&r->mw_work, __frwr_recovery_worker); 208 - queue_work(frwr_recovery_wq, &r->mw_work); 209 - } 210 - 211 - static int 212 - __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, 213 - unsigned int depth) 214 - { 215 - struct rpcrdma_frmr *f = &r->frmr; 136 + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; 137 + struct rpcrdma_ia *ia = &r_xprt->rx_ia; 216 138 int rc; 217 139 218 - f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); 219 - if (IS_ERR(f->fr_mr)) 220 - goto out_mr_err; 221 - 222 - f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL); 223 - if (!f->fr_sg) 224 - goto out_list_err; 225 - 226 - sg_init_table(f->fr_sg, depth); 227 - 228 - init_completion(&f->fr_linv_done); 229 - 230 - return 0; 231 - 232 - out_mr_err: 233 - rc = PTR_ERR(f->fr_mr); 234 - dprintk("RPC: %s: ib_alloc_mr status %i\n", 235 - __func__, rc); 236 - return rc; 237 - 238 - out_list_err: 239 - rc = -ENOMEM; 240 - dprintk("RPC: %s: sg allocation failure\n", 241 - __func__); 242 - ib_dereg_mr(f->fr_mr); 243 - return rc; 244 - } 245 - 246 - static void 247 - __frwr_release(struct rpcrdma_mw *r) 248 - { 249 - int rc; 250 - 251 - rc = ib_dereg_mr(r->frmr.fr_mr); 140 + rc = __frwr_reset_mr(ia, mw); 141 + ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); 252 142 if (rc) 253 - dprintk("RPC: %s: ib_dereg_mr status %i\n", 254 - __func__, rc); 255 - kfree(r->frmr.fr_sg); 143 + goto out_release; 144 + 145 + rpcrdma_put_mw(r_xprt, mw); 146 + r_xprt->rx_stats.mrs_recovered++; 147 + return; 148 + 149 + out_release: 150 + pr_err("rpcrdma: FRMR reset failed %d, %p release\n", rc, mw); 151 + r_xprt->rx_stats.mrs_orphaned++; 152 + 153 + spin_lock(&r_xprt->rx_buf.rb_mwlock); 154 + list_del(&mw->mw_all); 155 + spin_unlock(&r_xprt->rx_buf.rb_mwlock); 156 + 157 + frwr_op_release_mr(mw); 256 158 } 257 159 258 160 static int ··· 332 346 
complete_all(&frmr->fr_linv_done); 333 347 } 334 348 335 - static int 336 - frwr_op_init(struct rpcrdma_xprt *r_xprt) 337 - { 338 - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 339 - struct ib_device *device = r_xprt->rx_ia.ri_device; 340 - unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; 341 - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; 342 - int i; 343 - 344 - spin_lock_init(&buf->rb_mwlock); 345 - INIT_LIST_HEAD(&buf->rb_mws); 346 - INIT_LIST_HEAD(&buf->rb_all); 347 - 348 - i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1); 349 - i += 2; /* head + tail */ 350 - i *= buf->rb_max_requests; /* one set for each RPC slot */ 351 - dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); 352 - 353 - while (i--) { 354 - struct rpcrdma_mw *r; 355 - int rc; 356 - 357 - r = kzalloc(sizeof(*r), GFP_KERNEL); 358 - if (!r) 359 - return -ENOMEM; 360 - 361 - rc = __frwr_init(r, pd, device, depth); 362 - if (rc) { 363 - kfree(r); 364 - return rc; 365 - } 366 - 367 - r->mw_xprt = r_xprt; 368 - list_add(&r->mw_list, &buf->rb_mws); 369 - list_add(&r->mw_all, &buf->rb_all); 370 - } 371 - 372 - return 0; 373 - } 374 - 375 - /* Post a FAST_REG Work Request to register a memory region 349 + /* Post a REG_MR Work Request to register a memory region 376 350 * for remote access via RDMA READ or RDMA WRITE. 
377 351 */ 378 352 static int 379 353 frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 380 - int nsegs, bool writing) 354 + int nsegs, bool writing, struct rpcrdma_mw **out) 381 355 { 382 356 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 383 - struct ib_device *device = ia->ri_device; 384 - enum dma_data_direction direction = rpcrdma_data_dir(writing); 385 - struct rpcrdma_mr_seg *seg1 = seg; 386 357 struct rpcrdma_mw *mw; 387 358 struct rpcrdma_frmr *frmr; 388 359 struct ib_mr *mr; ··· 348 405 int rc, i, n, dma_nents; 349 406 u8 key; 350 407 351 - mw = seg1->rl_mw; 352 - seg1->rl_mw = NULL; 408 + mw = NULL; 353 409 do { 354 410 if (mw) 355 - __frwr_queue_recovery(mw); 411 + rpcrdma_defer_mr_recovery(mw); 356 412 mw = rpcrdma_get_mw(r_xprt); 357 413 if (!mw) 358 - return -ENOMEM; 414 + return -ENOBUFS; 359 415 } while (mw->frmr.fr_state != FRMR_IS_INVALID); 360 416 frmr = &mw->frmr; 361 417 frmr->fr_state = FRMR_IS_VALID; ··· 363 421 364 422 if (nsegs > ia->ri_max_frmr_depth) 365 423 nsegs = ia->ri_max_frmr_depth; 366 - 367 424 for (i = 0; i < nsegs;) { 368 425 if (seg->mr_page) 369 - sg_set_page(&frmr->fr_sg[i], 426 + sg_set_page(&mw->mw_sg[i], 370 427 seg->mr_page, 371 428 seg->mr_len, 372 429 offset_in_page(seg->mr_offset)); 373 430 else 374 - sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, 431 + sg_set_buf(&mw->mw_sg[i], seg->mr_offset, 375 432 seg->mr_len); 376 433 377 434 ++seg; ··· 381 440 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 382 441 break; 383 442 } 384 - frmr->fr_nents = i; 385 - frmr->fr_dir = direction; 443 + mw->mw_nents = i; 444 + mw->mw_dir = rpcrdma_data_dir(writing); 445 + if (i == 0) 446 + goto out_dmamap_err; 386 447 387 - dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); 388 - if (!dma_nents) { 389 - pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", 390 - __func__, frmr->fr_sg, frmr->fr_nents); 391 - return -ENOMEM; 392 - } 448 + dma_nents = ib_dma_map_sg(ia->ri_device, 449 + mw->mw_sg, 
mw->mw_nents, mw->mw_dir); 450 + if (!dma_nents) 451 + goto out_dmamap_err; 393 452 394 - n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE); 395 - if (unlikely(n != frmr->fr_nents)) { 396 - pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", 397 - __func__, frmr->fr_mr, n, frmr->fr_nents); 398 - rc = n < 0 ? n : -EINVAL; 399 - goto out_senderr; 400 - } 453 + n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); 454 + if (unlikely(n != mw->mw_nents)) 455 + goto out_mapmr_err; 401 456 402 457 dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", 403 - __func__, mw, frmr->fr_nents, mr->length); 458 + __func__, mw, mw->mw_nents, mr->length); 404 459 405 460 key = (u8)(mr->rkey & 0x000000FF); 406 461 ib_update_fast_reg_key(mr, ++key); ··· 418 481 if (rc) 419 482 goto out_senderr; 420 483 421 - seg1->rl_mw = mw; 422 - seg1->mr_rkey = mr->rkey; 423 - seg1->mr_base = mr->iova; 424 - seg1->mr_nsegs = frmr->fr_nents; 425 - seg1->mr_len = mr->length; 484 + mw->mw_handle = mr->rkey; 485 + mw->mw_length = mr->length; 486 + mw->mw_offset = mr->iova; 426 487 427 - return frmr->fr_nents; 488 + *out = mw; 489 + return mw->mw_nents; 490 + 491 + out_dmamap_err: 492 + pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", 493 + mw->mw_sg, mw->mw_nents); 494 + rpcrdma_defer_mr_recovery(mw); 495 + return -EIO; 496 + 497 + out_mapmr_err: 498 + pr_err("rpcrdma: failed to map mr %p (%u/%u)\n", 499 + frmr->fr_mr, n, mw->mw_nents); 500 + rpcrdma_defer_mr_recovery(mw); 501 + return -EIO; 428 502 429 503 out_senderr: 430 - dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); 431 - __frwr_queue_recovery(mw); 432 - return rc; 504 + pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); 505 + rpcrdma_defer_mr_recovery(mw); 506 + return -ENOTCONN; 433 507 } 434 508 435 509 static struct ib_send_wr * 436 - __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) 510 + __frwr_prepare_linv_wr(struct rpcrdma_mw *mw) 437 511 { 438 - struct rpcrdma_mw 
*mw = seg->rl_mw; 439 512 struct rpcrdma_frmr *f = &mw->frmr; 440 513 struct ib_send_wr *invalidate_wr; 441 514 ··· 465 518 * 466 519 * Sleeps until it is safe for the host CPU to access the 467 520 * previously mapped memory regions. 521 + * 522 + * Caller ensures that req->rl_registered is not empty. 468 523 */ 469 524 static void 470 525 frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 471 526 { 472 527 struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr; 473 528 struct rpcrdma_ia *ia = &r_xprt->rx_ia; 474 - struct rpcrdma_mr_seg *seg; 475 - unsigned int i, nchunks; 529 + struct rpcrdma_mw *mw, *tmp; 476 530 struct rpcrdma_frmr *f; 477 - struct rpcrdma_mw *mw; 478 531 int rc; 479 532 480 533 dprintk("RPC: %s: req %p\n", __func__, req); ··· 484 537 * Chain the LOCAL_INV Work Requests and post them with 485 538 * a single ib_post_send() call. 486 539 */ 540 + f = NULL; 487 541 invalidate_wrs = pos = prev = NULL; 488 - seg = NULL; 489 - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 490 - seg = &req->rl_segments[i]; 491 - 492 - pos = __frwr_prepare_linv_wr(seg); 542 + list_for_each_entry(mw, &req->rl_registered, mw_list) { 543 + pos = __frwr_prepare_linv_wr(mw); 493 544 494 545 if (!invalidate_wrs) 495 546 invalidate_wrs = pos; 496 547 else 497 548 prev->next = pos; 498 549 prev = pos; 499 - 500 - i += seg->mr_nsegs; 550 + f = &mw->frmr; 501 551 } 502 - f = &seg->rl_mw->frmr; 503 552 504 553 /* Strong send queue ordering guarantees that when the 505 554 * last WR in the chain completes, all WRs in the chain ··· 520 577 * them to the free MW list. 
521 578 */ 522 579 unmap: 523 - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 524 - seg = &req->rl_segments[i]; 525 - mw = seg->rl_mw; 526 - seg->rl_mw = NULL; 527 - 528 - ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, 529 - f->fr_dir); 580 + list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { 581 + list_del_init(&mw->mw_list); 582 + ib_dma_unmap_sg(ia->ri_device, 583 + mw->mw_sg, mw->mw_nents, mw->mw_dir); 530 584 rpcrdma_put_mw(r_xprt, mw); 531 - 532 - i += seg->mr_nsegs; 533 - seg->mr_nsegs = 0; 534 585 } 535 - 536 - req->rl_nchunks = 0; 537 586 return; 538 587 539 588 reset_mrs: 540 - pr_warn("%s: ib_post_send failed %i\n", __func__, rc); 589 + pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); 590 + rdma_disconnect(ia->ri_id); 541 591 542 592 /* Find and reset the MRs in the LOCAL_INV WRs that did not 543 593 * get posted. This is synchronous, and slow. 544 594 */ 545 - for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { 546 - seg = &req->rl_segments[i]; 547 - mw = seg->rl_mw; 595 + list_for_each_entry(mw, &req->rl_registered, mw_list) { 548 596 f = &mw->frmr; 549 - 550 597 if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { 551 598 __frwr_reset_mr(ia, mw); 552 599 bad_wr = bad_wr->next; 553 600 } 554 - 555 - i += seg->mr_nsegs; 556 601 } 557 602 goto unmap; 558 603 } ··· 552 621 frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 553 622 bool sync) 554 623 { 555 - struct rpcrdma_mr_seg *seg; 556 624 struct rpcrdma_mw *mw; 557 - unsigned int i; 558 625 559 - for (i = 0; req->rl_nchunks; req->rl_nchunks--) { 560 - seg = &req->rl_segments[i]; 561 - mw = seg->rl_mw; 626 + while (!list_empty(&req->rl_registered)) { 627 + mw = list_first_entry(&req->rl_registered, 628 + struct rpcrdma_mw, mw_list); 629 + list_del_init(&mw->mw_list); 562 630 563 631 if (sync) 564 - __frwr_reset_and_unmap(r_xprt, mw); 632 + frwr_op_recover_mr(mw); 565 633 else 566 - __frwr_queue_recovery(mw); 567 
- 568 - i += seg->mr_nsegs; 569 - seg->mr_nsegs = 0; 570 - seg->rl_mw = NULL; 571 - } 572 - } 573 - 574 - static void 575 - frwr_op_destroy(struct rpcrdma_buffer *buf) 576 - { 577 - struct rpcrdma_mw *r; 578 - 579 - /* Ensure stale MWs for "buf" are no longer in flight */ 580 - flush_workqueue(frwr_recovery_wq); 581 - 582 - while (!list_empty(&buf->rb_all)) { 583 - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); 584 - list_del(&r->mw_all); 585 - __frwr_release(r); 586 - kfree(r); 634 + rpcrdma_defer_mr_recovery(mw); 587 635 } 588 636 } 589 637 ··· 570 660 .ro_map = frwr_op_map, 571 661 .ro_unmap_sync = frwr_op_unmap_sync, 572 662 .ro_unmap_safe = frwr_op_unmap_safe, 663 + .ro_recover_mr = frwr_op_recover_mr, 573 664 .ro_open = frwr_op_open, 574 665 .ro_maxpages = frwr_op_maxpages, 575 - .ro_init = frwr_op_init, 576 - .ro_destroy = frwr_op_destroy, 666 + .ro_init_mr = frwr_op_init_mr, 667 + .ro_release_mr = frwr_op_release_mr, 577 668 .ro_displayname = "frwr", 578 669 };
-122
net/sunrpc/xprtrdma/physical_ops.c
··· 1 - /* 2 - * Copyright (c) 2015 Oracle. All rights reserved. 3 - * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 4 - */ 5 - 6 - /* No-op chunk preparation. All client memory is pre-registered. 7 - * Sometimes referred to as ALLPHYSICAL mode. 8 - * 9 - * Physical registration is simple because all client memory is 10 - * pre-registered and never deregistered. This mode is good for 11 - * adapter bring up, but is considered not safe: the server is 12 - * trusted not to abuse its access to client memory not involved 13 - * in RDMA I/O. 14 - */ 15 - 16 - #include "xprt_rdma.h" 17 - 18 - #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 19 - # define RPCDBG_FACILITY RPCDBG_TRANS 20 - #endif 21 - 22 - static int 23 - physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, 24 - struct rpcrdma_create_data_internal *cdata) 25 - { 26 - struct ib_mr *mr; 27 - 28 - /* Obtain an rkey to use for RPC data payloads. 29 - */ 30 - mr = ib_get_dma_mr(ia->ri_pd, 31 - IB_ACCESS_LOCAL_WRITE | 32 - IB_ACCESS_REMOTE_WRITE | 33 - IB_ACCESS_REMOTE_READ); 34 - if (IS_ERR(mr)) { 35 - pr_err("%s: ib_get_dma_mr for failed with %lX\n", 36 - __func__, PTR_ERR(mr)); 37 - return -ENOMEM; 38 - } 39 - ia->ri_dma_mr = mr; 40 - 41 - rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int, 42 - RPCRDMA_MAX_DATA_SEGS, 43 - RPCRDMA_MAX_HDR_SEGS)); 44 - return 0; 45 - } 46 - 47 - /* PHYSICAL memory registration conveys one page per chunk segment. 48 - */ 49 - static size_t 50 - physical_op_maxpages(struct rpcrdma_xprt *r_xprt) 51 - { 52 - return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, 53 - RPCRDMA_MAX_HDR_SEGS); 54 - } 55 - 56 - static int 57 - physical_op_init(struct rpcrdma_xprt *r_xprt) 58 - { 59 - return 0; 60 - } 61 - 62 - /* The client's physical memory is already exposed for 63 - * remote access via RDMA READ or RDMA WRITE. 
64 - */ 65 - static int 66 - physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 67 - int nsegs, bool writing) 68 - { 69 - struct rpcrdma_ia *ia = &r_xprt->rx_ia; 70 - 71 - rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing)); 72 - seg->mr_rkey = ia->ri_dma_mr->rkey; 73 - seg->mr_base = seg->mr_dma; 74 - return 1; 75 - } 76 - 77 - /* DMA unmap all memory regions that were mapped for "req". 78 - */ 79 - static void 80 - physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 81 - { 82 - struct ib_device *device = r_xprt->rx_ia.ri_device; 83 - unsigned int i; 84 - 85 - for (i = 0; req->rl_nchunks; --req->rl_nchunks) 86 - rpcrdma_unmap_one(device, &req->rl_segments[i++]); 87 - } 88 - 89 - /* Use a slow, safe mechanism to invalidate all memory regions 90 - * that were registered for "req". 91 - * 92 - * For physical memory registration, there is no good way to 93 - * fence a single MR that has been advertised to the server. The 94 - * client has already handed the server an R_key that cannot be 95 - * invalidated and is shared by all MRs on this connection. 96 - * Tearing down the PD might be the only safe choice, but it's 97 - * not clear that a freshly acquired DMA R_key would be different 98 - * than the one used by the PD that was just destroyed. 99 - * FIXME. 
100 - */ 101 - static void 102 - physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 103 - bool sync) 104 - { 105 - physical_op_unmap_sync(r_xprt, req); 106 - } 107 - 108 - static void 109 - physical_op_destroy(struct rpcrdma_buffer *buf) 110 - { 111 - } 112 - 113 - const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { 114 - .ro_map = physical_op_map, 115 - .ro_unmap_sync = physical_op_unmap_sync, 116 - .ro_unmap_safe = physical_op_unmap_safe, 117 - .ro_open = physical_op_open, 118 - .ro_maxpages = physical_op_maxpages, 119 - .ro_init = physical_op_init, 120 - .ro_destroy = physical_op_destroy, 121 - .ro_displayname = "physical", 122 - };
+145 -131
net/sunrpc/xprtrdma/rpc_rdma.c
··· 196 196 * MR when they can. 197 197 */ 198 198 static int 199 - rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, 200 - int n, int nsegs) 199 + rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) 201 200 { 202 201 size_t page_offset; 203 202 u32 remaining; ··· 205 206 base = vec->iov_base; 206 207 page_offset = offset_in_page(base); 207 208 remaining = vec->iov_len; 208 - while (remaining && n < nsegs) { 209 + while (remaining && n < RPCRDMA_MAX_SEGS) { 209 210 seg[n].mr_page = NULL; 210 211 seg[n].mr_offset = base; 211 212 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); ··· 229 230 230 231 static int 231 232 rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, 232 - enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) 233 + enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) 233 234 { 234 - int len, n = 0, p; 235 - int page_base; 235 + int len, n, p, page_base; 236 236 struct page **ppages; 237 237 238 + n = 0; 238 239 if (pos == 0) { 239 - n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs); 240 - if (n == nsegs) 241 - return -EIO; 240 + n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); 241 + if (n == RPCRDMA_MAX_SEGS) 242 + goto out_overflow; 242 243 } 243 244 244 245 len = xdrbuf->page_len; 245 246 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 246 247 page_base = xdrbuf->page_base & ~PAGE_MASK; 247 248 p = 0; 248 - while (len && n < nsegs) { 249 + while (len && n < RPCRDMA_MAX_SEGS) { 249 250 if (!ppages[p]) { 250 251 /* alloc the pagelist for receiving buffer */ 251 252 ppages[p] = alloc_page(GFP_ATOMIC); 252 253 if (!ppages[p]) 253 - return -ENOMEM; 254 + return -EAGAIN; 254 255 } 255 256 seg[n].mr_page = ppages[p]; 256 257 seg[n].mr_offset = (void *)(unsigned long) page_base; 257 258 seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); 258 259 if (seg[n].mr_len > PAGE_SIZE) 259 - return -EIO; 260 + goto out_overflow; 260 261 len -= seg[n].mr_len; 
261 262 ++n; 262 263 ++p; ··· 264 265 } 265 266 266 267 /* Message overflows the seg array */ 267 - if (len && n == nsegs) 268 - return -EIO; 268 + if (len && n == RPCRDMA_MAX_SEGS) 269 + goto out_overflow; 269 270 270 271 /* When encoding the read list, the tail is always sent inline */ 271 272 if (type == rpcrdma_readch) ··· 276 277 * xdr pad bytes, saving the server an RDMA operation. */ 277 278 if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) 278 279 return n; 279 - n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs); 280 - if (n == nsegs) 281 - return -EIO; 280 + n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); 281 + if (n == RPCRDMA_MAX_SEGS) 282 + goto out_overflow; 282 283 } 283 284 284 285 return n; 286 + 287 + out_overflow: 288 + pr_err("rpcrdma: segment array overflow\n"); 289 + return -EIO; 285 290 } 286 291 287 292 static inline __be32 * 288 - xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) 293 + xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) 289 294 { 290 - *iptr++ = cpu_to_be32(seg->mr_rkey); 291 - *iptr++ = cpu_to_be32(seg->mr_len); 292 - return xdr_encode_hyper(iptr, seg->mr_base); 295 + *iptr++ = cpu_to_be32(mw->mw_handle); 296 + *iptr++ = cpu_to_be32(mw->mw_length); 297 + return xdr_encode_hyper(iptr, mw->mw_offset); 293 298 } 294 299 295 300 /* XDR-encode the Read list. 
Supports encoding a list of read ··· 313 310 struct rpcrdma_req *req, struct rpc_rqst *rqst, 314 311 __be32 *iptr, enum rpcrdma_chunktype rtype) 315 312 { 316 - struct rpcrdma_mr_seg *seg = req->rl_nextseg; 313 + struct rpcrdma_mr_seg *seg; 314 + struct rpcrdma_mw *mw; 317 315 unsigned int pos; 318 316 int n, nsegs; 319 317 ··· 326 322 pos = rqst->rq_snd_buf.head[0].iov_len; 327 323 if (rtype == rpcrdma_areadch) 328 324 pos = 0; 329 - nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, 330 - RPCRDMA_MAX_SEGS - req->rl_nchunks); 325 + seg = req->rl_segments; 326 + nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg); 331 327 if (nsegs < 0) 332 328 return ERR_PTR(nsegs); 333 329 334 330 do { 335 - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); 336 - if (n <= 0) 331 + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 332 + false, &mw); 333 + if (n < 0) 337 334 return ERR_PTR(n); 335 + list_add(&mw->mw_list, &req->rl_registered); 338 336 339 337 *iptr++ = xdr_one; /* item present */ 340 338 ··· 344 338 * have the same "position". 345 339 */ 346 340 *iptr++ = cpu_to_be32(pos); 347 - iptr = xdr_encode_rdma_segment(iptr, seg); 341 + iptr = xdr_encode_rdma_segment(iptr, mw); 348 342 349 - dprintk("RPC: %5u %s: read segment pos %u " 350 - "%d@0x%016llx:0x%08x (%s)\n", 343 + dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", 351 344 rqst->rq_task->tk_pid, __func__, pos, 352 - seg->mr_len, (unsigned long long)seg->mr_base, 353 - seg->mr_rkey, n < nsegs ? "more" : "last"); 345 + mw->mw_length, (unsigned long long)mw->mw_offset, 346 + mw->mw_handle, n < nsegs ? 
"more" : "last"); 354 347 355 348 r_xprt->rx_stats.read_chunk_count++; 356 - req->rl_nchunks++; 357 349 seg += n; 358 350 nsegs -= n; 359 351 } while (nsegs); 360 - req->rl_nextseg = seg; 361 352 362 353 /* Finish Read list */ 363 354 *iptr++ = xdr_zero; /* Next item not present */ ··· 378 375 struct rpc_rqst *rqst, __be32 *iptr, 379 376 enum rpcrdma_chunktype wtype) 380 377 { 381 - struct rpcrdma_mr_seg *seg = req->rl_nextseg; 378 + struct rpcrdma_mr_seg *seg; 379 + struct rpcrdma_mw *mw; 382 380 int n, nsegs, nchunks; 383 381 __be32 *segcount; 384 382 ··· 388 384 return iptr; 389 385 } 390 386 387 + seg = req->rl_segments; 391 388 nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 392 389 rqst->rq_rcv_buf.head[0].iov_len, 393 - wtype, seg, 394 - RPCRDMA_MAX_SEGS - req->rl_nchunks); 390 + wtype, seg); 395 391 if (nsegs < 0) 396 392 return ERR_PTR(nsegs); 397 393 ··· 400 396 401 397 nchunks = 0; 402 398 do { 403 - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); 404 - if (n <= 0) 399 + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 400 + true, &mw); 401 + if (n < 0) 405 402 return ERR_PTR(n); 403 + list_add(&mw->mw_list, &req->rl_registered); 406 404 407 - iptr = xdr_encode_rdma_segment(iptr, seg); 405 + iptr = xdr_encode_rdma_segment(iptr, mw); 408 406 409 - dprintk("RPC: %5u %s: write segment " 410 - "%d@0x016%llx:0x%08x (%s)\n", 407 + dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", 411 408 rqst->rq_task->tk_pid, __func__, 412 - seg->mr_len, (unsigned long long)seg->mr_base, 413 - seg->mr_rkey, n < nsegs ? "more" : "last"); 409 + mw->mw_length, (unsigned long long)mw->mw_offset, 410 + mw->mw_handle, n < nsegs ? 
"more" : "last"); 414 411 415 412 r_xprt->rx_stats.write_chunk_count++; 416 413 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 417 - req->rl_nchunks++; 418 414 nchunks++; 419 415 seg += n; 420 416 nsegs -= n; 421 417 } while (nsegs); 422 - req->rl_nextseg = seg; 423 418 424 419 /* Update count of segments in this Write chunk */ 425 420 *segcount = cpu_to_be32(nchunks); ··· 445 442 struct rpcrdma_req *req, struct rpc_rqst *rqst, 446 443 __be32 *iptr, enum rpcrdma_chunktype wtype) 447 444 { 448 - struct rpcrdma_mr_seg *seg = req->rl_nextseg; 445 + struct rpcrdma_mr_seg *seg; 446 + struct rpcrdma_mw *mw; 449 447 int n, nsegs, nchunks; 450 448 __be32 *segcount; 451 449 ··· 455 451 return iptr; 456 452 } 457 453 458 - nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, 459 - RPCRDMA_MAX_SEGS - req->rl_nchunks); 454 + seg = req->rl_segments; 455 + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg); 460 456 if (nsegs < 0) 461 457 return ERR_PTR(nsegs); 462 458 ··· 465 461 466 462 nchunks = 0; 467 463 do { 468 - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); 469 - if (n <= 0) 464 + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 465 + true, &mw); 466 + if (n < 0) 470 467 return ERR_PTR(n); 468 + list_add(&mw->mw_list, &req->rl_registered); 471 469 472 - iptr = xdr_encode_rdma_segment(iptr, seg); 470 + iptr = xdr_encode_rdma_segment(iptr, mw); 473 471 474 - dprintk("RPC: %5u %s: reply segment " 475 - "%d@0x%016llx:0x%08x (%s)\n", 472 + dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", 476 473 rqst->rq_task->tk_pid, __func__, 477 - seg->mr_len, (unsigned long long)seg->mr_base, 478 - seg->mr_rkey, n < nsegs ? "more" : "last"); 474 + mw->mw_length, (unsigned long long)mw->mw_offset, 475 + mw->mw_handle, n < nsegs ? 
"more" : "last"); 479 476 480 477 r_xprt->rx_stats.reply_chunk_count++; 481 478 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 482 - req->rl_nchunks++; 483 479 nchunks++; 484 480 seg += n; 485 481 nsegs -= n; 486 482 } while (nsegs); 487 - req->rl_nextseg = seg; 488 483 489 484 /* Update count of segments in the Reply chunk */ 490 485 *segcount = cpu_to_be32(nchunks); ··· 570 567 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 571 568 enum rpcrdma_chunktype rtype, wtype; 572 569 struct rpcrdma_msg *headerp; 570 + bool ddp_allowed; 573 571 ssize_t hdrlen; 574 572 size_t rpclen; 575 573 __be32 *iptr; ··· 587 583 headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 588 584 headerp->rm_type = rdma_msg; 589 585 586 + /* When the ULP employs a GSS flavor that guarantees integrity 587 + * or privacy, direct data placement of individual data items 588 + * is not allowed. 589 + */ 590 + ddp_allowed = !(rqst->rq_cred->cr_auth->au_flags & 591 + RPCAUTH_AUTH_DATATOUCH); 592 + 590 593 /* 591 594 * Chunks needed for results? 592 595 * ··· 605 594 */ 606 595 if (rpcrdma_results_inline(r_xprt, rqst)) 607 596 wtype = rpcrdma_noch; 608 - else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) 597 + else if (ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) 609 598 wtype = rpcrdma_writech; 610 599 else 611 600 wtype = rpcrdma_replych; ··· 628 617 rtype = rpcrdma_noch; 629 618 rpcrdma_inline_pullup(rqst); 630 619 rpclen = rqst->rq_svec[0].iov_len; 631 - } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 620 + } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 632 621 rtype = rpcrdma_readch; 633 622 rpclen = rqst->rq_svec[0].iov_len; 634 623 rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); ··· 661 650 * send a Call message with a Position Zero Read chunk and a 662 651 * regular Read chunk at the same time. 
663 652 */ 664 - req->rl_nchunks = 0; 665 - req->rl_nextseg = req->rl_segments; 666 653 iptr = headerp->rm_body.rm_chunks; 667 654 iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); 668 655 if (IS_ERR(iptr)) ··· 699 690 out_overflow: 700 691 pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", 701 692 hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); 702 - /* Terminate this RPC. Chunks registered above will be 703 - * released by xprt_release -> xprt_rmda_free . 704 - */ 705 - return -EIO; 693 + iptr = ERR_PTR(-EIO); 706 694 707 695 out_unmap: 708 696 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); ··· 711 705 * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) 712 706 */ 713 707 static int 714 - rpcrdma_count_chunks(struct rpcrdma_rep *rep, unsigned int max, int wrchunk, __be32 **iptrp) 708 + rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) 715 709 { 716 710 unsigned int i, total_len; 717 711 struct rpcrdma_write_chunk *cur_wchunk; 718 712 char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); 719 713 720 714 i = be32_to_cpu(**iptrp); 721 - if (i > max) 722 - return -1; 723 715 cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); 724 716 total_len = 0; 725 717 while (i--) { ··· 748 744 return total_len; 749 745 } 750 746 751 - /* 752 - * Scatter inline received data back into provided iov's. 747 + /** 748 + * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs 749 + * @rqst: controlling RPC request 750 + * @srcp: points to RPC message payload in receive buffer 751 + * @copy_len: remaining length of receive buffer content 752 + * @pad: Write chunk pad bytes needed (zero for pure inline) 753 + * 754 + * The upper layer has set the maximum number of bytes it can 755 + * receive in each component of rq_rcv_buf. These values are set in 756 + * the head.iov_len, page_len, tail.iov_len, and buflen fields. 
757 + * 758 + * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in 759 + * many cases this function simply updates iov_base pointers in 760 + * rq_rcv_buf to point directly to the received reply data, to 761 + * avoid copying reply data. 762 + * 763 + * Returns the count of bytes which had to be memcopied. 753 764 */ 754 - static void 765 + static unsigned long 755 766 rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) 756 767 { 757 - int i, npages, curlen, olen; 768 + unsigned long fixup_copy_count; 769 + int i, npages, curlen; 758 770 char *destp; 759 771 struct page **ppages; 760 772 int page_base; 761 773 762 - curlen = rqst->rq_rcv_buf.head[0].iov_len; 763 - if (curlen > copy_len) { /* write chunk header fixup */ 764 - curlen = copy_len; 765 - rqst->rq_rcv_buf.head[0].iov_len = curlen; 766 - } 774 + /* The head iovec is redirected to the RPC reply message 775 + * in the receive buffer, to avoid a memcopy. 776 + */ 777 + rqst->rq_rcv_buf.head[0].iov_base = srcp; 778 + rqst->rq_private_buf.head[0].iov_base = srcp; 767 779 780 + /* The contents of the receive buffer that follow 781 + * head.iov_len bytes are copied into the page list. 
782 + */ 783 + curlen = rqst->rq_rcv_buf.head[0].iov_len; 784 + if (curlen > copy_len) 785 + curlen = copy_len; 768 786 dprintk("RPC: %s: srcp 0x%p len %d hdrlen %d\n", 769 787 __func__, srcp, copy_len, curlen); 770 - 771 - /* Shift pointer for first receive segment only */ 772 - rqst->rq_rcv_buf.head[0].iov_base = srcp; 773 788 srcp += curlen; 774 789 copy_len -= curlen; 775 790 776 - olen = copy_len; 777 - i = 0; 778 - rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; 779 791 page_base = rqst->rq_rcv_buf.page_base; 780 792 ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); 781 793 page_base &= ~PAGE_MASK; 782 - 794 + fixup_copy_count = 0; 783 795 if (copy_len && rqst->rq_rcv_buf.page_len) { 784 - npages = PAGE_ALIGN(page_base + 785 - rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; 786 - for (; i < npages; i++) { 796 + int pagelist_len; 797 + 798 + pagelist_len = rqst->rq_rcv_buf.page_len; 799 + if (pagelist_len > copy_len) 800 + pagelist_len = copy_len; 801 + npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT; 802 + for (i = 0; i < npages; i++) { 787 803 curlen = PAGE_SIZE - page_base; 788 - if (curlen > copy_len) 789 - curlen = copy_len; 804 + if (curlen > pagelist_len) 805 + curlen = pagelist_len; 806 + 790 807 dprintk("RPC: %s: page %d" 791 808 " srcp 0x%p len %d curlen %d\n", 792 809 __func__, i, srcp, copy_len, curlen); ··· 817 792 kunmap_atomic(destp); 818 793 srcp += curlen; 819 794 copy_len -= curlen; 820 - if (copy_len == 0) 795 + fixup_copy_count += curlen; 796 + pagelist_len -= curlen; 797 + if (!pagelist_len) 821 798 break; 822 799 page_base = 0; 823 800 } 801 + 802 + /* Implicit padding for the last segment in a Write 803 + * chunk is inserted inline at the front of the tail 804 + * iovec. The upper layer ignores the content of 805 + * the pad. Simply ensure inline content in the tail 806 + * that follows the Write chunk is properly aligned. 
807 + */ 808 + if (pad) 809 + srcp -= pad; 824 810 } 825 811 826 - if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { 827 - curlen = copy_len; 828 - if (curlen > rqst->rq_rcv_buf.tail[0].iov_len) 829 - curlen = rqst->rq_rcv_buf.tail[0].iov_len; 830 - if (rqst->rq_rcv_buf.tail[0].iov_base != srcp) 831 - memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen); 832 - dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n", 833 - __func__, srcp, copy_len, curlen); 834 - rqst->rq_rcv_buf.tail[0].iov_len = curlen; 835 - copy_len -= curlen; ++i; 836 - } else 837 - rqst->rq_rcv_buf.tail[0].iov_len = 0; 838 - 839 - if (pad) { 840 - /* implicit padding on terminal chunk */ 841 - unsigned char *p = rqst->rq_rcv_buf.tail[0].iov_base; 842 - while (pad--) 843 - p[rqst->rq_rcv_buf.tail[0].iov_len++] = 0; 812 + /* The tail iovec is redirected to the remaining data 813 + * in the receive buffer, to avoid a memcopy. 814 + */ 815 + if (copy_len || pad) { 816 + rqst->rq_rcv_buf.tail[0].iov_base = srcp; 817 + rqst->rq_private_buf.tail[0].iov_base = srcp; 844 818 } 845 819 846 - if (copy_len) 847 - dprintk("RPC: %s: %d bytes in" 848 - " %d extra segments (%d lost)\n", 849 - __func__, olen, i, copy_len); 850 - 851 - /* TBD avoid a warning from call_decode() */ 852 - rqst->rq_private_buf = rqst->rq_rcv_buf; 820 + return fixup_copy_count; 853 821 } 854 822 855 823 void ··· 978 960 (headerp->rm_body.rm_chunks[1] == xdr_zero && 979 961 headerp->rm_body.rm_chunks[2] != xdr_zero) || 980 962 (headerp->rm_body.rm_chunks[1] != xdr_zero && 981 - req->rl_nchunks == 0)) 963 + list_empty(&req->rl_registered))) 982 964 goto badheader; 983 965 if (headerp->rm_body.rm_chunks[1] != xdr_zero) { 984 966 /* count any expected write chunks in read reply */ 985 967 /* start at write chunk array count */ 986 968 iptr = &headerp->rm_body.rm_chunks[2]; 987 - rdmalen = rpcrdma_count_chunks(rep, 988 - req->rl_nchunks, 1, &iptr); 969 + rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); 989 970 /* check for validity, and 
no reply chunk after */ 990 971 if (rdmalen < 0 || *iptr++ != xdr_zero) 991 972 goto badheader; ··· 1005 988 rep->rr_len -= RPCRDMA_HDRLEN_MIN; 1006 989 status = rep->rr_len; 1007 990 } 1008 - /* Fix up the rpc results for upper layer */ 1009 - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, rdmalen); 991 + 992 + r_xprt->rx_stats.fixup_copy_count += 993 + rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, 994 + rdmalen); 1010 995 break; 1011 996 1012 997 case rdma_nomsg: ··· 1016 997 if (headerp->rm_body.rm_chunks[0] != xdr_zero || 1017 998 headerp->rm_body.rm_chunks[1] != xdr_zero || 1018 999 headerp->rm_body.rm_chunks[2] != xdr_one || 1019 - req->rl_nchunks == 0) 1000 + list_empty(&req->rl_registered)) 1020 1001 goto badheader; 1021 1002 iptr = (__be32 *)((unsigned char *)headerp + 1022 1003 RPCRDMA_HDRLEN_MIN); 1023 - rdmalen = rpcrdma_count_chunks(rep, req->rl_nchunks, 0, &iptr); 1004 + rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); 1024 1005 if (rdmalen < 0) 1025 1006 goto badheader; 1026 1007 r_xprt->rx_stats.total_rdma_reply += rdmalen; ··· 1033 1014 1034 1015 badheader: 1035 1016 default: 1036 - dprintk("%s: invalid rpcrdma reply header (type %d):" 1037 - " chunks[012] == %d %d %d" 1038 - " expected chunks <= %d\n", 1039 - __func__, be32_to_cpu(headerp->rm_type), 1040 - headerp->rm_body.rm_chunks[0], 1041 - headerp->rm_body.rm_chunks[1], 1042 - headerp->rm_body.rm_chunks[2], 1043 - req->rl_nchunks); 1017 + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", 1018 + rqst->rq_task->tk_pid, __func__, 1019 + be32_to_cpu(headerp->rm_type)); 1044 1020 status = -EIO; 1045 1021 r_xprt->rx_stats.bad_reply_count++; 1046 1022 break; ··· 1049 1035 * control: waking the next RPC waits until this RPC has 1050 1036 * relinquished all its Send Queue entries. 1051 1037 */ 1052 - if (req->rl_nchunks) 1038 + if (!list_empty(&req->rl_registered)) 1053 1039 r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); 1054 1040 1055 1041 spin_lock_bh(&xprt->transport_lock);
+24 -16
net/sunrpc/xprtrdma/transport.c
··· 558 558 559 559 out_fail: 560 560 rpcrdma_buffer_put(req); 561 - r_xprt->rx_stats.failed_marshal_count++; 562 561 return NULL; 563 562 } 564 563 ··· 589 590 rpcrdma_buffer_put(req); 590 591 } 591 592 592 - /* 593 + /** 594 + * xprt_rdma_send_request - marshal and send an RPC request 595 + * @task: RPC task with an RPC message in rq_snd_buf 596 + * 597 + * Return values: 598 + * 0: The request has been sent 599 + * ENOTCONN: Caller needs to invoke connect logic then call again 600 + * ENOBUFS: Call again later to send the request 601 + * EIO: A permanent error occurred. The request was not sent, 602 + * and don't try it again 603 + * 593 604 * send_request invokes the meat of RPC RDMA. It must do the following: 605 + * 594 606 * 1. Marshal the RPC request into an RPC RDMA request, which means 595 607 * putting a header in front of data, and creating IOVs for RDMA 596 608 * from those in the request. ··· 610 600 * the request (rpcrdma_ep_post). 611 601 * 4. No partial sends are possible in the RPC-RDMA protocol (as in UDP). 
612 602 */ 613 - 614 603 static int 615 604 xprt_rdma_send_request(struct rpc_task *task) 616 605 { ··· 618 609 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 619 610 struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 620 611 int rc = 0; 612 + 613 + /* On retransmit, remove any previously registered chunks */ 614 + r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); 621 615 622 616 rc = rpcrdma_marshal_req(rqst); 623 617 if (rc < 0) ··· 642 630 return 0; 643 631 644 632 failed_marshal: 645 - r_xprt->rx_stats.failed_marshal_count++; 646 633 dprintk("RPC: %s: rpcrdma_marshal_req failed, status %i\n", 647 634 __func__, rc); 648 635 if (rc == -EIO) 649 - return -EIO; 636 + r_xprt->rx_stats.failed_marshal_count++; 637 + if (rc != -ENOTCONN) 638 + return rc; 650 639 drop_connection: 651 640 xprt_disconnect_done(xprt); 652 641 return -ENOTCONN; /* implies disconnect */ ··· 673 660 xprt->stat.bad_xids, 674 661 xprt->stat.req_u, 675 662 xprt->stat.bklog_u); 676 - seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n", 663 + seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ", 677 664 r_xprt->rx_stats.read_chunk_count, 678 665 r_xprt->rx_stats.write_chunk_count, 679 666 r_xprt->rx_stats.reply_chunk_count, ··· 685 672 r_xprt->rx_stats.failed_marshal_count, 686 673 r_xprt->rx_stats.bad_reply_count, 687 674 r_xprt->rx_stats.nomsg_call_count); 675 + seq_printf(seq, "%lu %lu %lu\n", 676 + r_xprt->rx_stats.mrs_recovered, 677 + r_xprt->rx_stats.mrs_orphaned, 678 + r_xprt->rx_stats.mrs_allocated); 688 679 } 689 680 690 681 static int ··· 758 741 __func__, rc); 759 742 760 743 rpcrdma_destroy_wq(); 761 - frwr_destroy_recovery_wq(); 762 744 763 745 rc = xprt_unregister_transport(&xprt_rdma_bc); 764 746 if (rc) ··· 769 753 { 770 754 int rc; 771 755 772 - rc = frwr_alloc_recovery_wq(); 756 + rc = rpcrdma_alloc_wq(); 773 757 if (rc) 774 758 return rc; 775 - 776 - rc = rpcrdma_alloc_wq(); 777 - if (rc) { 778 - frwr_destroy_recovery_wq(); 779 - return rc; 
780 - } 781 759 782 760 rc = xprt_register_transport(&xprt_rdma); 783 761 if (rc) { 784 762 rpcrdma_destroy_wq(); 785 - frwr_destroy_recovery_wq(); 786 763 return rc; 787 764 } 788 765 ··· 783 774 if (rc) { 784 775 xprt_unregister_transport(&xprt_rdma); 785 776 rpcrdma_destroy_wq(); 786 - frwr_destroy_recovery_wq(); 787 777 return rc; 788 778 } 789 779
+167 -75
net/sunrpc/xprtrdma/verbs.c
··· 379 379 struct rpcrdma_ia *ia = &xprt->rx_ia; 380 380 int rc; 381 381 382 - ia->ri_dma_mr = NULL; 383 - 384 382 ia->ri_id = rpcrdma_create_id(xprt, ia, addr); 385 383 if (IS_ERR(ia->ri_id)) { 386 384 rc = PTR_ERR(ia->ri_id); ··· 389 391 ia->ri_pd = ib_alloc_pd(ia->ri_device); 390 392 if (IS_ERR(ia->ri_pd)) { 391 393 rc = PTR_ERR(ia->ri_pd); 392 - dprintk("RPC: %s: ib_alloc_pd() failed %i\n", 393 - __func__, rc); 394 + pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); 394 395 goto out2; 395 - } 396 - 397 - if (memreg == RPCRDMA_FRMR) { 398 - if (!(ia->ri_device->attrs.device_cap_flags & 399 - IB_DEVICE_MEM_MGT_EXTENSIONS) || 400 - (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { 401 - dprintk("RPC: %s: FRMR registration " 402 - "not supported by HCA\n", __func__); 403 - memreg = RPCRDMA_MTHCAFMR; 404 - } 405 - } 406 - if (memreg == RPCRDMA_MTHCAFMR) { 407 - if (!ia->ri_device->alloc_fmr) { 408 - dprintk("RPC: %s: MTHCAFMR registration " 409 - "not supported by HCA\n", __func__); 410 - rc = -EINVAL; 411 - goto out3; 412 - } 413 396 } 414 397 415 398 switch (memreg) { 416 399 case RPCRDMA_FRMR: 417 - ia->ri_ops = &rpcrdma_frwr_memreg_ops; 418 - break; 419 - case RPCRDMA_ALLPHYSICAL: 420 - ia->ri_ops = &rpcrdma_physical_memreg_ops; 421 - break; 400 + if (frwr_is_supported(ia)) { 401 + ia->ri_ops = &rpcrdma_frwr_memreg_ops; 402 + break; 403 + } 404 + /*FALLTHROUGH*/ 422 405 case RPCRDMA_MTHCAFMR: 423 - ia->ri_ops = &rpcrdma_fmr_memreg_ops; 424 - break; 406 + if (fmr_is_supported(ia)) { 407 + ia->ri_ops = &rpcrdma_fmr_memreg_ops; 408 + break; 409 + } 410 + /*FALLTHROUGH*/ 425 411 default: 426 - printk(KERN_ERR "RPC: Unsupported memory " 427 - "registration mode: %d\n", memreg); 428 - rc = -ENOMEM; 412 + pr_err("rpcrdma: Unsupported memory registration mode: %d\n", 413 + memreg); 414 + rc = -EINVAL; 429 415 goto out3; 430 416 } 431 - dprintk("RPC: %s: memory registration strategy is '%s'\n", 432 - __func__, ia->ri_ops->ro_displayname); 433 417 434 418 
return 0; 435 419 ··· 565 585 out2: 566 586 ib_free_cq(sendcq); 567 587 out1: 568 - if (ia->ri_dma_mr) 569 - ib_dereg_mr(ia->ri_dma_mr); 570 588 return rc; 571 589 } 572 590 ··· 578 600 void 579 601 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) 580 602 { 581 - int rc; 582 - 583 603 dprintk("RPC: %s: entering, connected is %d\n", 584 604 __func__, ep->rep_connected); 585 605 ··· 591 615 592 616 ib_free_cq(ep->rep_attr.recv_cq); 593 617 ib_free_cq(ep->rep_attr.send_cq); 594 - 595 - if (ia->ri_dma_mr) { 596 - rc = ib_dereg_mr(ia->ri_dma_mr); 597 - dprintk("RPC: %s: ib_dereg_mr returned %i\n", 598 - __func__, rc); 599 - } 600 618 } 601 619 602 620 /* ··· 747 777 ib_drain_qp(ia->ri_id->qp); 748 778 } 749 779 780 + static void 781 + rpcrdma_mr_recovery_worker(struct work_struct *work) 782 + { 783 + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, 784 + rb_recovery_worker.work); 785 + struct rpcrdma_mw *mw; 786 + 787 + spin_lock(&buf->rb_recovery_lock); 788 + while (!list_empty(&buf->rb_stale_mrs)) { 789 + mw = list_first_entry(&buf->rb_stale_mrs, 790 + struct rpcrdma_mw, mw_list); 791 + list_del_init(&mw->mw_list); 792 + spin_unlock(&buf->rb_recovery_lock); 793 + 794 + dprintk("RPC: %s: recovering MR %p\n", __func__, mw); 795 + mw->mw_xprt->rx_ia.ri_ops->ro_recover_mr(mw); 796 + 797 + spin_lock(&buf->rb_recovery_lock); 798 + } 799 + spin_unlock(&buf->rb_recovery_lock); 800 + } 801 + 802 + void 803 + rpcrdma_defer_mr_recovery(struct rpcrdma_mw *mw) 804 + { 805 + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; 806 + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 807 + 808 + spin_lock(&buf->rb_recovery_lock); 809 + list_add(&mw->mw_list, &buf->rb_stale_mrs); 810 + spin_unlock(&buf->rb_recovery_lock); 811 + 812 + schedule_delayed_work(&buf->rb_recovery_worker, 0); 813 + } 814 + 815 + static void 816 + rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt) 817 + { 818 + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 819 + struct rpcrdma_ia *ia = 
&r_xprt->rx_ia; 820 + unsigned int count; 821 + LIST_HEAD(free); 822 + LIST_HEAD(all); 823 + 824 + for (count = 0; count < 32; count++) { 825 + struct rpcrdma_mw *mw; 826 + int rc; 827 + 828 + mw = kzalloc(sizeof(*mw), GFP_KERNEL); 829 + if (!mw) 830 + break; 831 + 832 + rc = ia->ri_ops->ro_init_mr(ia, mw); 833 + if (rc) { 834 + kfree(mw); 835 + break; 836 + } 837 + 838 + mw->mw_xprt = r_xprt; 839 + 840 + list_add(&mw->mw_list, &free); 841 + list_add(&mw->mw_all, &all); 842 + } 843 + 844 + spin_lock(&buf->rb_mwlock); 845 + list_splice(&free, &buf->rb_mws); 846 + list_splice(&all, &buf->rb_all); 847 + r_xprt->rx_stats.mrs_allocated += count; 848 + spin_unlock(&buf->rb_mwlock); 849 + 850 + dprintk("RPC: %s: created %u MRs\n", __func__, count); 851 + } 852 + 853 + static void 854 + rpcrdma_mr_refresh_worker(struct work_struct *work) 855 + { 856 + struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, 857 + rb_refresh_worker.work); 858 + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, 859 + rx_buf); 860 + 861 + rpcrdma_create_mrs(r_xprt); 862 + } 863 + 750 864 struct rpcrdma_req * 751 865 rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) 752 866 { ··· 847 793 spin_unlock(&buffer->rb_reqslock); 848 794 req->rl_cqe.done = rpcrdma_wc_send; 849 795 req->rl_buffer = &r_xprt->rx_buf; 796 + INIT_LIST_HEAD(&req->rl_registered); 850 797 return req; 851 798 } 852 799 ··· 887 832 rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) 888 833 { 889 834 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 890 - struct rpcrdma_ia *ia = &r_xprt->rx_ia; 891 835 int i, rc; 892 836 893 837 buf->rb_max_requests = r_xprt->rx_data.max_requests; 894 838 buf->rb_bc_srv_max_requests = 0; 895 - spin_lock_init(&buf->rb_lock); 896 839 atomic_set(&buf->rb_credits, 1); 840 + spin_lock_init(&buf->rb_mwlock); 841 + spin_lock_init(&buf->rb_lock); 842 + spin_lock_init(&buf->rb_recovery_lock); 843 + INIT_LIST_HEAD(&buf->rb_mws); 844 + INIT_LIST_HEAD(&buf->rb_all); 845 + 
INIT_LIST_HEAD(&buf->rb_stale_mrs); 846 + INIT_DELAYED_WORK(&buf->rb_refresh_worker, 847 + rpcrdma_mr_refresh_worker); 848 + INIT_DELAYED_WORK(&buf->rb_recovery_worker, 849 + rpcrdma_mr_recovery_worker); 897 850 898 - rc = ia->ri_ops->ro_init(r_xprt); 899 - if (rc) 900 - goto out; 851 + rpcrdma_create_mrs(r_xprt); 901 852 902 853 INIT_LIST_HEAD(&buf->rb_send_bufs); 903 854 INIT_LIST_HEAD(&buf->rb_allreqs); ··· 923 862 } 924 863 925 864 INIT_LIST_HEAD(&buf->rb_recv_bufs); 926 - for (i = 0; i < buf->rb_max_requests + 2; i++) { 865 + for (i = 0; i < buf->rb_max_requests; i++) { 927 866 struct rpcrdma_rep *rep; 928 867 929 868 rep = rpcrdma_create_rep(r_xprt); ··· 979 918 kfree(req); 980 919 } 981 920 921 + static void 922 + rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf) 923 + { 924 + struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, 925 + rx_buf); 926 + struct rpcrdma_ia *ia = rdmab_to_ia(buf); 927 + struct rpcrdma_mw *mw; 928 + unsigned int count; 929 + 930 + count = 0; 931 + spin_lock(&buf->rb_mwlock); 932 + while (!list_empty(&buf->rb_all)) { 933 + mw = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); 934 + list_del(&mw->mw_all); 935 + 936 + spin_unlock(&buf->rb_mwlock); 937 + ia->ri_ops->ro_release_mr(mw); 938 + count++; 939 + spin_lock(&buf->rb_mwlock); 940 + } 941 + spin_unlock(&buf->rb_mwlock); 942 + r_xprt->rx_stats.mrs_allocated = 0; 943 + 944 + dprintk("RPC: %s: released %u MRs\n", __func__, count); 945 + } 946 + 982 947 void 983 948 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 984 949 { 985 950 struct rpcrdma_ia *ia = rdmab_to_ia(buf); 951 + 952 + cancel_delayed_work_sync(&buf->rb_recovery_worker); 986 953 987 954 while (!list_empty(&buf->rb_recv_bufs)) { 988 955 struct rpcrdma_rep *rep; ··· 1033 944 } 1034 945 spin_unlock(&buf->rb_reqslock); 1035 946 1036 - ia->ri_ops->ro_destroy(buf); 947 + rpcrdma_destroy_mrs(buf); 1037 948 } 1038 949 1039 950 struct rpcrdma_mw * ··· 1051 962 spin_unlock(&buf->rb_mwlock); 1052 963 1053 
964 if (!mw) 1054 - pr_err("RPC: %s: no MWs available\n", __func__); 965 + goto out_nomws; 1055 966 return mw; 967 + 968 + out_nomws: 969 + dprintk("RPC: %s: no MWs available\n", __func__); 970 + schedule_delayed_work(&buf->rb_refresh_worker, 0); 971 + 972 + /* Allow the reply handler and refresh worker to run */ 973 + cond_resched(); 974 + 975 + return NULL; 1056 976 } 1057 977 1058 978 void ··· 1076 978 1077 979 /* 1078 980 * Get a set of request/reply buffers. 1079 - * 1080 - * Reply buffer (if available) is attached to send buffer upon return. 1081 981 */ 1082 982 struct rpcrdma_req * 1083 983 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) ··· 1094 998 1095 999 out_reqbuf: 1096 1000 spin_unlock(&buffers->rb_lock); 1097 - pr_warn("RPC: %s: out of request buffers\n", __func__); 1001 + pr_warn("rpcrdma: out of request buffers (%p)\n", buffers); 1098 1002 return NULL; 1099 1003 out_repbuf: 1004 + list_add(&req->rl_free, &buffers->rb_send_bufs); 1100 1005 spin_unlock(&buffers->rb_lock); 1101 - pr_warn("RPC: %s: out of reply buffers\n", __func__); 1102 - req->rl_reply = NULL; 1103 - return req; 1006 + pr_warn("rpcrdma: out of reply buffers (%p)\n", buffers); 1007 + return NULL; 1104 1008 } 1105 1009 1106 1010 /* ··· 1155 1059 /* 1156 1060 * Wrappers for internal-use kmalloc memory registration, used by buffer code. 
1157 1061 */ 1158 - 1159 - void 1160 - rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) 1161 - { 1162 - dprintk("RPC: map_one: offset %p iova %llx len %zu\n", 1163 - seg->mr_offset, 1164 - (unsigned long long)seg->mr_dma, seg->mr_dmalen); 1165 - } 1166 1062 1167 1063 /** 1168 1064 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers ··· 1238 1150 if (rep) { 1239 1151 rc = rpcrdma_ep_post_recv(ia, ep, rep); 1240 1152 if (rc) 1241 - goto out; 1153 + return rc; 1242 1154 req->rl_reply = NULL; 1243 1155 } 1244 1156 ··· 1263 1175 1264 1176 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail); 1265 1177 if (rc) 1266 - dprintk("RPC: %s: ib_post_send returned %i\n", __func__, 1267 - rc); 1268 - out: 1269 - return rc; 1178 + goto out_postsend_err; 1179 + return 0; 1180 + 1181 + out_postsend_err: 1182 + pr_err("rpcrdma: RDMA Send ib_post_send returned %i\n", rc); 1183 + return -ENOTCONN; 1270 1184 } 1271 1185 1272 1186 /* ··· 1293 1203 DMA_BIDIRECTIONAL); 1294 1204 1295 1205 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail); 1296 - 1297 1206 if (rc) 1298 - dprintk("RPC: %s: ib_post_recv returned %i\n", __func__, 1299 - rc); 1300 - return rc; 1207 + goto out_postrecv; 1208 + return 0; 1209 + 1210 + out_postrecv: 1211 + pr_err("rpcrdma: ib_post_recv returned %i\n", rc); 1212 + return -ENOTCONN; 1301 1213 } 1302 1214 1303 1215 /**
+43 -75
net/sunrpc/xprtrdma/xprt_rdma.h
··· 68 68 struct ib_device *ri_device; 69 69 struct rdma_cm_id *ri_id; 70 70 struct ib_pd *ri_pd; 71 - struct ib_mr *ri_dma_mr; 72 71 struct completion ri_done; 73 72 int ri_async_rc; 74 73 unsigned int ri_max_frmr_depth; ··· 171 172 * o recv buffer (posted to provider) 172 173 * o ib_sge (also donated to provider) 173 174 * o status of reply (length, success or not) 174 - * o bookkeeping state to get run by tasklet (list, etc) 175 + * o bookkeeping state to get run by reply handler (list, etc) 175 176 * 176 - * These are allocated during initialization, per-transport instance; 177 - * however, the tasklet execution list itself is global, as it should 178 - * always be pretty short. 177 + * These are allocated during initialization, per-transport instance. 179 178 * 180 179 * N of these are associated with a transport instance, and stored in 181 180 * struct rpcrdma_buffer. N is the max number of outstanding requests. 182 181 */ 183 - 184 - #define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) 185 - 186 - /* data segments + head/tail for Call + head/tail for Reply */ 187 - #define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4) 188 - 189 - struct rpcrdma_buffer; 190 182 191 183 struct rpcrdma_rep { 192 184 struct ib_cqe rr_cqe; ··· 211 221 }; 212 222 213 223 struct rpcrdma_frmr { 214 - struct scatterlist *fr_sg; 215 - int fr_nents; 216 - enum dma_data_direction fr_dir; 217 224 struct ib_mr *fr_mr; 218 225 struct ib_cqe fr_cqe; 219 226 enum rpcrdma_frmr_state fr_state; ··· 222 235 }; 223 236 224 237 struct rpcrdma_fmr { 225 - struct ib_fmr *fmr; 226 - u64 *physaddrs; 238 + struct ib_fmr *fm_mr; 239 + u64 *fm_physaddrs; 227 240 }; 228 241 229 242 struct rpcrdma_mw { 243 + struct list_head mw_list; 244 + struct scatterlist *mw_sg; 245 + int mw_nents; 246 + enum dma_data_direction mw_dir; 230 247 union { 231 248 struct rpcrdma_fmr fmr; 232 249 struct rpcrdma_frmr frmr; 233 250 }; 234 - struct work_struct mw_work; 235 251 struct rpcrdma_xprt *mw_xprt; 236 - struct 
list_head mw_list; 252 + u32 mw_handle; 253 + u32 mw_length; 254 + u64 mw_offset; 237 255 struct list_head mw_all; 238 256 }; 239 257 ··· 258 266 * of iovs for send operations. The reason is that the iovs passed to 259 267 * ib_post_{send,recv} must not be modified until the work request 260 268 * completes. 261 - * 262 - * NOTES: 263 - * o RPCRDMA_MAX_SEGS is the max number of addressible chunk elements we 264 - * marshal. The number needed varies depending on the iov lists that 265 - * are passed to us, the memory registration mode we are in, and if 266 - * physical addressing is used, the layout. 267 269 */ 268 270 271 + /* Maximum number of page-sized "segments" per chunk list to be 272 + * registered or invalidated. Must handle a Reply chunk: 273 + */ 274 + enum { 275 + RPCRDMA_MAX_IOV_SEGS = 3, 276 + RPCRDMA_MAX_DATA_SEGS = ((1 * 1024 * 1024) / PAGE_SIZE) + 1, 277 + RPCRDMA_MAX_SEGS = RPCRDMA_MAX_DATA_SEGS + 278 + RPCRDMA_MAX_IOV_SEGS, 279 + }; 280 + 269 281 struct rpcrdma_mr_seg { /* chunk descriptors */ 270 - struct rpcrdma_mw *rl_mw; /* registered MR */ 271 - u64 mr_base; /* registration result */ 272 - u32 mr_rkey; /* registration result */ 273 282 u32 mr_len; /* length of chunk or segment */ 274 - int mr_nsegs; /* number of segments in chunk or 0 */ 275 - enum dma_data_direction mr_dir; /* segment mapping direction */ 276 - dma_addr_t mr_dma; /* segment mapping address */ 277 - size_t mr_dmalen; /* segment mapping length */ 278 283 struct page *mr_page; /* owning page, if any */ 279 284 char *mr_offset; /* kva if no page, else offset */ 280 285 }; 281 286 282 287 #define RPCRDMA_MAX_IOVS (2) 283 288 289 + struct rpcrdma_buffer; 284 290 struct rpcrdma_req { 285 291 struct list_head rl_free; 286 292 unsigned int rl_niovs; 287 - unsigned int rl_nchunks; 288 293 unsigned int rl_connect_cookie; 289 294 struct rpc_task *rl_task; 290 295 struct rpcrdma_buffer *rl_buffer; ··· 289 300 struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; 290 301 struct rpcrdma_regbuf 
*rl_rdmabuf; 291 302 struct rpcrdma_regbuf *rl_sendbuf; 292 - struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; 293 - struct rpcrdma_mr_seg *rl_nextseg; 294 303 295 304 struct ib_cqe rl_cqe; 296 305 struct list_head rl_all; 297 306 bool rl_backchannel; 307 + 308 + struct list_head rl_registered; /* registered segments */ 309 + struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; 298 310 }; 299 311 300 312 static inline struct rpcrdma_req * ··· 331 341 struct list_head rb_allreqs; 332 342 333 343 u32 rb_bc_max_requests; 344 + 345 + spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ 346 + struct list_head rb_stale_mrs; 347 + struct delayed_work rb_recovery_worker; 348 + struct delayed_work rb_refresh_worker; 334 349 }; 335 350 #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) 336 351 ··· 382 387 unsigned long bad_reply_count; 383 388 unsigned long nomsg_call_count; 384 389 unsigned long bcall_count; 390 + unsigned long mrs_recovered; 391 + unsigned long mrs_orphaned; 392 + unsigned long mrs_allocated; 385 393 }; 386 394 387 395 /* ··· 393 395 struct rpcrdma_xprt; 394 396 struct rpcrdma_memreg_ops { 395 397 int (*ro_map)(struct rpcrdma_xprt *, 396 - struct rpcrdma_mr_seg *, int, bool); 398 + struct rpcrdma_mr_seg *, int, bool, 399 + struct rpcrdma_mw **); 397 400 void (*ro_unmap_sync)(struct rpcrdma_xprt *, 398 401 struct rpcrdma_req *); 399 402 void (*ro_unmap_safe)(struct rpcrdma_xprt *, 400 403 struct rpcrdma_req *, bool); 404 + void (*ro_recover_mr)(struct rpcrdma_mw *); 401 405 int (*ro_open)(struct rpcrdma_ia *, 402 406 struct rpcrdma_ep *, 403 407 struct rpcrdma_create_data_internal *); 404 408 size_t (*ro_maxpages)(struct rpcrdma_xprt *); 405 - int (*ro_init)(struct rpcrdma_xprt *); 406 - void (*ro_destroy)(struct rpcrdma_buffer *); 409 + int (*ro_init_mr)(struct rpcrdma_ia *, 410 + struct rpcrdma_mw *); 411 + void (*ro_release_mr)(struct rpcrdma_mw *); 407 412 const char *ro_displayname; 408 413 }; 409 414 410 415 extern 
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; 411 416 extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; 412 - extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; 413 417 414 418 /* 415 419 * RPCRDMA transport -- encapsulates the structures above for ··· 446 446 */ 447 447 int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); 448 448 void rpcrdma_ia_close(struct rpcrdma_ia *); 449 + bool frwr_is_supported(struct rpcrdma_ia *); 450 + bool fmr_is_supported(struct rpcrdma_ia *); 449 451 450 452 /* 451 453 * Endpoint calls - xprtrdma/verbs.c ··· 479 477 void rpcrdma_recv_buffer_get(struct rpcrdma_req *); 480 478 void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); 481 479 480 + void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *); 481 + 482 482 struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, 483 483 size_t, gfp_t); 484 484 void rpcrdma_free_regbuf(struct rpcrdma_ia *, 485 485 struct rpcrdma_regbuf *); 486 486 487 487 int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); 488 - 489 - int frwr_alloc_recovery_wq(void); 490 - void frwr_destroy_recovery_wq(void); 491 488 492 489 int rpcrdma_alloc_wq(void); 493 490 void rpcrdma_destroy_wq(void); ··· 495 494 * Wrappers for chunk registration, shared by read/write chunk code. 496 495 */ 497 496 498 - void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); 499 - 500 497 static inline enum dma_data_direction 501 498 rpcrdma_data_dir(bool writing) 502 499 { 503 500 return writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; 504 - } 505 - 506 - static inline void 507 - rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, 508 - enum dma_data_direction direction) 509 - { 510 - seg->mr_dir = direction; 511 - seg->mr_dmalen = seg->mr_len; 512 - 513 - if (seg->mr_page) 514 - seg->mr_dma = ib_dma_map_page(device, 515 - seg->mr_page, offset_in_page(seg->mr_offset), 516 - seg->mr_dmalen, seg->mr_dir); 517 - else 518 - seg->mr_dma = ib_dma_map_single(device, 519 - seg->mr_offset, 520 - seg->mr_dmalen, seg->mr_dir); 521 - 522 - if (ib_dma_mapping_error(device, seg->mr_dma)) 523 - rpcrdma_mapping_error(seg); 524 - } 525 - 526 - static inline void 527 - rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) 528 - { 529 - if (seg->mr_page) 530 - ib_dma_unmap_page(device, 531 - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); 532 - else 533 - ib_dma_unmap_single(device, 534 - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); 535 501 } 536 502 537 503 /*
+73 -52
net/sunrpc/xprtsock.c
··· 124 124 .mode = 0644, 125 125 .proc_handler = proc_dointvec_minmax, 126 126 .extra1 = &xprt_min_resvport_limit, 127 - .extra2 = &xprt_max_resvport_limit 127 + .extra2 = &xprt_max_resvport 128 128 }, 129 129 { 130 130 .procname = "max_resvport", ··· 132 132 .maxlen = sizeof(unsigned int), 133 133 .mode = 0644, 134 134 .proc_handler = proc_dointvec_minmax, 135 - .extra1 = &xprt_min_resvport_limit, 135 + .extra1 = &xprt_min_resvport, 136 136 .extra2 = &xprt_max_resvport_limit 137 137 }, 138 138 { ··· 642 642 struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 643 643 struct xdr_buf *xdr = &req->rq_snd_buf; 644 644 bool zerocopy = true; 645 + bool vm_wait = false; 645 646 int status; 646 647 int sent; 647 648 ··· 678 677 return 0; 679 678 } 680 679 680 + WARN_ON_ONCE(sent == 0 && status == 0); 681 + 682 + if (status == -EAGAIN ) { 683 + /* 684 + * Return EAGAIN if we're sure we're hitting the 685 + * socket send buffer limits. 686 + */ 687 + if (test_bit(SOCK_NOSPACE, &transport->sock->flags)) 688 + break; 689 + /* 690 + * Did we hit a memory allocation failure? 
691 + */ 692 + if (sent == 0) { 693 + status = -ENOBUFS; 694 + if (vm_wait) 695 + break; 696 + /* Retry, knowing now that we're below the 697 + * socket send buffer limit 698 + */ 699 + vm_wait = true; 700 + } 701 + continue; 702 + } 681 703 if (status < 0) 682 704 break; 683 - if (sent == 0) { 684 - status = -EAGAIN; 685 - break; 686 - } 705 + vm_wait = false; 687 706 } 688 - if (status == -EAGAIN && sk_stream_is_writeable(transport->inet)) 689 - status = -ENOBUFS; 690 707 691 708 switch (status) { 692 709 case -ENOTSOCK: ··· 774 755 sk->sk_error_report = transport->old_error_report; 775 756 } 776 757 758 + static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) 759 + { 760 + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); 761 + 762 + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); 763 + } 764 + 777 765 static void xs_sock_reset_connection_flags(struct rpc_xprt *xprt) 778 766 { 779 767 smp_mb__before_atomic(); 780 768 clear_bit(XPRT_CLOSE_WAIT, &xprt->state); 781 769 clear_bit(XPRT_CLOSING, &xprt->state); 770 + xs_sock_reset_state_flags(xprt); 782 771 smp_mb__after_atomic(); 783 772 } 784 773 ··· 989 962 goto out; 990 963 for (;;) { 991 964 skb = skb_recv_datagram(sk, 0, 1, &err); 992 - if (skb == NULL) 965 + if (skb != NULL) { 966 + xs_local_data_read_skb(&transport->xprt, sk, skb); 967 + skb_free_datagram(sk, skb); 968 + continue; 969 + } 970 + if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 993 971 break; 994 - xs_local_data_read_skb(&transport->xprt, sk, skb); 995 - skb_free_datagram(sk, skb); 996 972 } 997 973 out: 998 974 mutex_unlock(&transport->recv_mutex); ··· 1073 1043 goto out; 1074 1044 for (;;) { 1075 1045 skb = skb_recv_datagram(sk, 0, 1, &err); 1076 - if (skb == NULL) 1046 + if (skb != NULL) { 1047 + xs_udp_data_read_skb(&transport->xprt, sk, skb); 1048 + skb_free_datagram(sk, skb); 1049 + continue; 1050 + } 1051 + if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 
1077 1052 break; 1078 - xs_udp_data_read_skb(&transport->xprt, sk, skb); 1079 - skb_free_datagram(sk, skb); 1080 1053 } 1081 1054 out: 1082 1055 mutex_unlock(&transport->recv_mutex); ··· 1107 1074 if (xprt != NULL) { 1108 1075 struct sock_xprt *transport = container_of(xprt, 1109 1076 struct sock_xprt, xprt); 1110 - queue_work(rpciod_workqueue, &transport->recv_worker); 1077 + transport->old_data_ready(sk); 1078 + /* Any data means we had a useful conversation, so 1079 + * then we don't need to delay the next reconnect 1080 + */ 1081 + if (xprt->reestablish_timeout) 1082 + xprt->reestablish_timeout = 0; 1083 + if (!test_and_set_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 1084 + queue_work(xprtiod_workqueue, &transport->recv_worker); 1111 1085 } 1112 1086 read_unlock_bh(&sk->sk_callback_lock); 1113 1087 } ··· 1514 1474 for (;;) { 1515 1475 lock_sock(sk); 1516 1476 read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); 1517 - release_sock(sk); 1518 - if (read <= 0) 1519 - break; 1520 - total += read; 1477 + if (read <= 0) { 1478 + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); 1479 + release_sock(sk); 1480 + if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) 1481 + break; 1482 + } else { 1483 + release_sock(sk); 1484 + total += read; 1485 + } 1521 1486 rd_desc.count = 65536; 1522 1487 } 1523 1488 out: ··· 1535 1490 struct sock_xprt *transport = 1536 1491 container_of(work, struct sock_xprt, recv_worker); 1537 1492 xs_tcp_data_receive(transport); 1538 - } 1539 - 1540 - /** 1541 - * xs_tcp_data_ready - "data ready" callback for TCP sockets 1542 - * @sk: socket with data to read 1543 - * 1544 - */ 1545 - static void xs_tcp_data_ready(struct sock *sk) 1546 - { 1547 - struct sock_xprt *transport; 1548 - struct rpc_xprt *xprt; 1549 - 1550 - dprintk("RPC: xs_tcp_data_ready...\n"); 1551 - 1552 - read_lock_bh(&sk->sk_callback_lock); 1553 - if (!(xprt = xprt_from_sock(sk))) 1554 - goto out; 1555 - transport = container_of(xprt, struct sock_xprt, 
xprt); 1556 - 1557 - /* Any data means we had a useful conversation, so 1558 - * the we don't need to delay the next reconnect 1559 - */ 1560 - if (xprt->reestablish_timeout) 1561 - xprt->reestablish_timeout = 0; 1562 - queue_work(rpciod_workqueue, &transport->recv_worker); 1563 - 1564 - out: 1565 - read_unlock_bh(&sk->sk_callback_lock); 1566 1493 } 1567 1494 1568 1495 /** ··· 1731 1714 1732 1715 static unsigned short xs_get_random_port(void) 1733 1716 { 1734 - unsigned short range = xprt_max_resvport - xprt_min_resvport; 1717 + unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; 1735 1718 unsigned short rand = (unsigned short) prandom_u32() % range; 1736 1719 return rand + xprt_min_resvport; 1737 1720 } ··· 2258 2241 xs_save_old_callbacks(transport, sk); 2259 2242 2260 2243 sk->sk_user_data = xprt; 2261 - sk->sk_data_ready = xs_tcp_data_ready; 2244 + sk->sk_data_ready = xs_data_ready; 2262 2245 sk->sk_state_change = xs_tcp_state_change; 2263 2246 sk->sk_write_space = xs_tcp_write_space; 2264 2247 sock_set_flag(sk, SOCK_FASYNC); ··· 2397 2380 /* Start by resetting any existing state */ 2398 2381 xs_reset_transport(transport); 2399 2382 2400 - queue_delayed_work(rpciod_workqueue, 2383 + queue_delayed_work(xprtiod_workqueue, 2401 2384 &transport->connect_worker, 2402 2385 xprt->reestablish_timeout); 2403 2386 xprt->reestablish_timeout <<= 1; ··· 2407 2390 xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; 2408 2391 } else { 2409 2392 dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); 2410 - queue_delayed_work(rpciod_workqueue, 2393 + queue_delayed_work(xprtiod_workqueue, 2411 2394 &transport->connect_worker, 0); 2412 2395 } 2413 2396 } ··· 3170 3153 3171 3154 static int param_set_portnr(const char *val, const struct kernel_param *kp) 3172 3155 { 3173 - return param_set_uint_minmax(val, kp, 3156 + if (kp->arg == &xprt_min_resvport) 3157 + return param_set_uint_minmax(val, kp, 3174 3158 RPC_MIN_RESVPORT, 3159 + xprt_max_resvport); 3160 + return 
param_set_uint_minmax(val, kp, 3161 + xprt_min_resvport, 3175 3162 RPC_MAX_RESVPORT); 3176 3163 } 3177 3164