NFSD: Implement NFSD_IO_DIRECT for NFS WRITE

When NFSD_IO_DIRECT is selected via the
/sys/kernel/debug/nfsd/io_cache_write experimental tunable, split
incoming unaligned NFS WRITE requests into a prefix, middle and
suffix segment, as needed. The middle segment is now DIO-aligned and
the prefix and/or suffix are unaligned. Synchronous buffered IO is
used for the unaligned segments, and IOCB_DIRECT is used for the
middle DIO-aligned extent.

Although IOCB_DIRECT avoids the use of the page cache, by itself it
doesn't guarantee data durability. For UNSTABLE WRITE requests,
durability is obtained by a subsequent NFS COMMIT request.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Co-developed-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

Authored by Mike Snitzer and committed by Chuck Lever (commit 06c5c972, parent e3e8e176).

144 insertions(+), 4 deletions(-)

fs/nfsd/debugfs.c (+1):
··· 108 switch (val) { 109 case NFSD_IO_BUFFERED: 110 case NFSD_IO_DONTCACHE: 111 nfsd_io_cache_write = val; 112 break; 113 default:
··· 108 switch (val) { 109 case NFSD_IO_BUFFERED: 110 case NFSD_IO_DONTCACHE: 111 + case NFSD_IO_DIRECT: 112 nfsd_io_cache_write = val; 113 break; 114 default:
fs/nfsd/trace.h (+2):
··· 469 DEFINE_NFSD_IO_EVENT(read_done); 470 DEFINE_NFSD_IO_EVENT(write_start); 471 DEFINE_NFSD_IO_EVENT(write_opened); 472 DEFINE_NFSD_IO_EVENT(write_io_done); 473 DEFINE_NFSD_IO_EVENT(write_done); 474 DEFINE_NFSD_IO_EVENT(commit_start);
··· 469 DEFINE_NFSD_IO_EVENT(read_done); 470 DEFINE_NFSD_IO_EVENT(write_start); 471 DEFINE_NFSD_IO_EVENT(write_opened); 472 + DEFINE_NFSD_IO_EVENT(write_direct); 473 + DEFINE_NFSD_IO_EVENT(write_vector); 474 DEFINE_NFSD_IO_EVENT(write_io_done); 475 DEFINE_NFSD_IO_EVENT(write_done); 476 DEFINE_NFSD_IO_EVENT(commit_start);
fs/nfsd/vfs.c (+141, -4):
··· 1254 return err; 1255 } 1256 1257 /** 1258 * nfsd_vfs_write - write data to an already-open file 1259 * @rqstp: RPC execution context ··· 1458 } 1459 1460 nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); 1461 - iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); 1462 since = READ_ONCE(file->f_wb_err); 1463 if (verf) 1464 nfsd_copy_write_verifier(verf, nn); 1465 1466 switch (nfsd_io_cache_write) { 1467 - case NFSD_IO_BUFFERED: 1468 break; 1469 case NFSD_IO_DONTCACHE: 1470 if (file->f_op->fop_flags & FOP_DONTCACHE) 1471 kiocb.ki_flags |= IOCB_DONTCACHE; 1472 break; 1473 } 1474 - host_err = vfs_iocb_iter_write(file, &kiocb, &iter); 1475 if (host_err < 0) { 1476 commit_reset_write_verifier(nn, rqstp, host_err); 1477 goto out_nfserr; 1478 } 1479 - *cnt = host_err; 1480 nfsd_stats_io_write_add(nn, exp, *cnt); 1481 fsnotify_modify(file); 1482 host_err = filemap_check_wb_err(file->f_mapping, since);
··· 1254 return err; 1255 } 1256 1257 + struct nfsd_write_dio_seg { 1258 + struct iov_iter iter; 1259 + int flags; 1260 + }; 1261 + 1262 + static unsigned long 1263 + iov_iter_bvec_offset(const struct iov_iter *iter) 1264 + { 1265 + return (unsigned long)(iter->bvec->bv_offset + iter->iov_offset); 1266 + } 1267 + 1268 + static void 1269 + nfsd_write_dio_seg_init(struct nfsd_write_dio_seg *segment, 1270 + struct bio_vec *bvec, unsigned int nvecs, 1271 + unsigned long total, size_t start, size_t len, 1272 + struct kiocb *iocb) 1273 + { 1274 + iov_iter_bvec(&segment->iter, ITER_SOURCE, bvec, nvecs, total); 1275 + if (start) 1276 + iov_iter_advance(&segment->iter, start); 1277 + iov_iter_truncate(&segment->iter, len); 1278 + segment->flags = iocb->ki_flags; 1279 + } 1280 + 1281 + static unsigned int 1282 + nfsd_write_dio_iters_init(struct nfsd_file *nf, struct bio_vec *bvec, 1283 + unsigned int nvecs, struct kiocb *iocb, 1284 + unsigned long total, 1285 + struct nfsd_write_dio_seg segments[3]) 1286 + { 1287 + u32 offset_align = nf->nf_dio_offset_align; 1288 + loff_t prefix_end, orig_end, middle_end; 1289 + u32 mem_align = nf->nf_dio_mem_align; 1290 + size_t prefix, middle, suffix; 1291 + loff_t offset = iocb->ki_pos; 1292 + unsigned int nsegs = 0; 1293 + 1294 + /* 1295 + * Check if direct I/O is feasible for this write request. 1296 + * If alignments are not available, the write is too small, 1297 + * or no alignment can be found, fall back to buffered I/O. 
1298 + */ 1299 + if (unlikely(!mem_align || !offset_align) || 1300 + unlikely(total < max(offset_align, mem_align))) 1301 + goto no_dio; 1302 + 1303 + prefix_end = round_up(offset, offset_align); 1304 + orig_end = offset + total; 1305 + middle_end = round_down(orig_end, offset_align); 1306 + 1307 + prefix = prefix_end - offset; 1308 + middle = middle_end - prefix_end; 1309 + suffix = orig_end - middle_end; 1310 + 1311 + if (!middle) 1312 + goto no_dio; 1313 + 1314 + if (prefix) 1315 + nfsd_write_dio_seg_init(&segments[nsegs++], bvec, 1316 + nvecs, total, 0, prefix, iocb); 1317 + 1318 + nfsd_write_dio_seg_init(&segments[nsegs], bvec, nvecs, 1319 + total, prefix, middle, iocb); 1320 + 1321 + /* 1322 + * Check if the bvec iterator is aligned for direct I/O. 1323 + * 1324 + * bvecs generated from RPC receive buffers are contiguous: After 1325 + * the first bvec, all subsequent bvecs start at bv_offset zero 1326 + * (page-aligned). Therefore, only the first bvec is checked. 1327 + */ 1328 + if (iov_iter_bvec_offset(&segments[nsegs].iter) & (mem_align - 1)) 1329 + goto no_dio; 1330 + segments[nsegs].flags |= IOCB_DIRECT; 1331 + nsegs++; 1332 + 1333 + if (suffix) 1334 + nfsd_write_dio_seg_init(&segments[nsegs++], bvec, nvecs, total, 1335 + prefix + middle, suffix, iocb); 1336 + 1337 + return nsegs; 1338 + 1339 + no_dio: 1340 + /* No DIO alignment possible - pack into single non-DIO segment. 
*/ 1341 + nfsd_write_dio_seg_init(&segments[0], bvec, nvecs, total, 0, 1342 + total, iocb); 1343 + return 1; 1344 + } 1345 + 1346 + static noinline_for_stack int 1347 + nfsd_direct_write(struct svc_rqst *rqstp, struct svc_fh *fhp, 1348 + struct nfsd_file *nf, unsigned int nvecs, 1349 + unsigned long *cnt, struct kiocb *kiocb) 1350 + { 1351 + struct nfsd_write_dio_seg segments[3]; 1352 + struct file *file = nf->nf_file; 1353 + unsigned int nsegs, i; 1354 + ssize_t host_err; 1355 + 1356 + nsegs = nfsd_write_dio_iters_init(nf, rqstp->rq_bvec, nvecs, 1357 + kiocb, *cnt, segments); 1358 + 1359 + *cnt = 0; 1360 + for (i = 0; i < nsegs; i++) { 1361 + kiocb->ki_flags = segments[i].flags; 1362 + if (kiocb->ki_flags & IOCB_DIRECT) 1363 + trace_nfsd_write_direct(rqstp, fhp, kiocb->ki_pos, 1364 + segments[i].iter.count); 1365 + else { 1366 + trace_nfsd_write_vector(rqstp, fhp, kiocb->ki_pos, 1367 + segments[i].iter.count); 1368 + /* 1369 + * Mark the I/O buffer as evict-able to reduce 1370 + * memory contention. 
1371 + */ 1372 + if (nf->nf_file->f_op->fop_flags & FOP_DONTCACHE) 1373 + kiocb->ki_flags |= IOCB_DONTCACHE; 1374 + } 1375 + 1376 + host_err = vfs_iocb_iter_write(file, kiocb, &segments[i].iter); 1377 + if (host_err < 0) 1378 + return host_err; 1379 + *cnt += host_err; 1380 + if (host_err < segments[i].iter.count) 1381 + break; /* partial write */ 1382 + } 1383 + 1384 + return 0; 1385 + } 1386 + 1387 /** 1388 * nfsd_vfs_write - write data to an already-open file 1389 * @rqstp: RPC execution context ··· 1328 } 1329 1330 nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); 1331 + 1332 since = READ_ONCE(file->f_wb_err); 1333 if (verf) 1334 nfsd_copy_write_verifier(verf, nn); 1335 1336 switch (nfsd_io_cache_write) { 1337 + case NFSD_IO_DIRECT: 1338 + host_err = nfsd_direct_write(rqstp, fhp, nf, nvecs, 1339 + cnt, &kiocb); 1340 break; 1341 case NFSD_IO_DONTCACHE: 1342 if (file->f_op->fop_flags & FOP_DONTCACHE) 1343 kiocb.ki_flags |= IOCB_DONTCACHE; 1344 + fallthrough; 1345 + case NFSD_IO_BUFFERED: 1346 + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); 1347 + host_err = vfs_iocb_iter_write(file, &kiocb, &iter); 1348 + if (host_err < 0) 1349 + break; 1350 + *cnt = host_err; 1351 break; 1352 } 1353 if (host_err < 0) { 1354 commit_reset_write_verifier(nn, rqstp, host_err); 1355 goto out_nfserr; 1356 } 1357 nfsd_stats_io_write_add(nn, exp, *cnt); 1358 fsnotify_modify(file); 1359 host_err = filemap_check_wb_err(file->f_mapping, since);