Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xprtrdma: Remove support for FMR memory registration

FMR is not supported on most recent RDMA devices. It is also less
secure than FRWR because an FMR memory registration can expose
adjacent bytes to remote reading or writing. As discussed during the
RDMA BoF at LPC 2018, it is time to remove support for FMR in the
NFS/RDMA client stack.

Note that NFS/RDMA server-side uses either local memory registration
or FRWR. FMR is not used.

There are a few Infiniband/RoCE devices in the kernel tree that do
not appear to support MEM_MGT_EXTENSIONS (FRWR), and therefore will
not support client-side NFS/RDMA after this patch. These are:

- mthca
- qib
- hns (RoCE)

Users of these devices can use NFS/TCP on IPoIB instead.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>

Authored by Chuck Lever; committed by Anna Schumaker.
ba69cd12 a7886849

+2 -359
+1 -2
net/sunrpc/xprtrdma/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o 3 3 4 - rpcrdma-y := transport.o rpc_rdma.o verbs.o \ 5 - fmr_ops.o frwr_ops.o \ 4 + rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ 6 5 svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ 7 6 svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ 8 7 module.o
-340
net/sunrpc/xprtrdma/fmr_ops.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * Copyright (c) 2015, 2017 Oracle. All rights reserved. 4 - * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 5 - */ 6 - 7 - /* Lightweight memory registration using Fast Memory Regions (FMR). 8 - * Referred to sometimes as MTHCAFMR mode. 9 - * 10 - * FMR uses synchronous memory registration and deregistration. 11 - * FMR registration is known to be fast, but FMR deregistration 12 - * can take tens of usecs to complete. 13 - */ 14 - 15 - /* Normal operation 16 - * 17 - * A Memory Region is prepared for RDMA READ or WRITE using the 18 - * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is 19 - * finished, the Memory Region is unmapped using the ib_unmap_fmr 20 - * verb (fmr_op_unmap). 21 - */ 22 - 23 - #include <linux/sunrpc/svc_rdma.h> 24 - 25 - #include "xprt_rdma.h" 26 - #include <trace/events/rpcrdma.h> 27 - 28 - #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 29 - # define RPCDBG_FACILITY RPCDBG_TRANS 30 - #endif 31 - 32 - /* Maximum scatter/gather per FMR */ 33 - #define RPCRDMA_MAX_FMR_SGES (64) 34 - 35 - /* Access mode of externally registered pages */ 36 - enum { 37 - RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | 38 - IB_ACCESS_REMOTE_READ, 39 - }; 40 - 41 - bool 42 - fmr_is_supported(struct rpcrdma_ia *ia) 43 - { 44 - if (!ia->ri_device->alloc_fmr) { 45 - pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", 46 - ia->ri_device->name); 47 - return false; 48 - } 49 - return true; 50 - } 51 - 52 - static void 53 - __fmr_unmap(struct rpcrdma_mr *mr) 54 - { 55 - LIST_HEAD(l); 56 - int rc; 57 - 58 - list_add(&mr->fmr.fm_mr->list, &l); 59 - rc = ib_unmap_fmr(&l); 60 - list_del(&mr->fmr.fm_mr->list); 61 - if (rc) 62 - pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", 63 - mr, rc); 64 - } 65 - 66 - /* Release an MR. 
67 - */ 68 - static void 69 - fmr_op_release_mr(struct rpcrdma_mr *mr) 70 - { 71 - int rc; 72 - 73 - kfree(mr->fmr.fm_physaddrs); 74 - kfree(mr->mr_sg); 75 - 76 - /* In case this one was left mapped, try to unmap it 77 - * to prevent dealloc_fmr from failing with EBUSY 78 - */ 79 - __fmr_unmap(mr); 80 - 81 - rc = ib_dealloc_fmr(mr->fmr.fm_mr); 82 - if (rc) 83 - pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", 84 - mr, rc); 85 - 86 - kfree(mr); 87 - } 88 - 89 - /* MRs are dynamically allocated, so simply clean up and release the MR. 90 - * A replacement MR will subsequently be allocated on demand. 91 - */ 92 - static void 93 - fmr_mr_recycle_worker(struct work_struct *work) 94 - { 95 - struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); 96 - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; 97 - 98 - trace_xprtrdma_mr_recycle(mr); 99 - 100 - trace_xprtrdma_mr_unmap(mr); 101 - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, 102 - mr->mr_sg, mr->mr_nents, mr->mr_dir); 103 - 104 - spin_lock(&r_xprt->rx_buf.rb_mrlock); 105 - list_del(&mr->mr_all); 106 - r_xprt->rx_stats.mrs_recycled++; 107 - spin_unlock(&r_xprt->rx_buf.rb_mrlock); 108 - fmr_op_release_mr(mr); 109 - } 110 - 111 - static int 112 - fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) 113 - { 114 - static struct ib_fmr_attr fmr_attr = { 115 - .max_pages = RPCRDMA_MAX_FMR_SGES, 116 - .max_maps = 1, 117 - .page_shift = PAGE_SHIFT 118 - }; 119 - 120 - mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, 121 - sizeof(u64), GFP_KERNEL); 122 - if (!mr->fmr.fm_physaddrs) 123 - goto out_free; 124 - 125 - mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, 126 - sizeof(*mr->mr_sg), GFP_KERNEL); 127 - if (!mr->mr_sg) 128 - goto out_free; 129 - 130 - sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); 131 - 132 - mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, 133 - &fmr_attr); 134 - if (IS_ERR(mr->fmr.fm_mr)) 135 - goto out_fmr_err; 136 - 137 - INIT_LIST_HEAD(&mr->mr_list); 138 - 
INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker); 139 - return 0; 140 - 141 - out_fmr_err: 142 - dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, 143 - PTR_ERR(mr->fmr.fm_mr)); 144 - 145 - out_free: 146 - kfree(mr->mr_sg); 147 - kfree(mr->fmr.fm_physaddrs); 148 - return -ENOMEM; 149 - } 150 - 151 - /* On success, sets: 152 - * ep->rep_attr.cap.max_send_wr 153 - * ep->rep_attr.cap.max_recv_wr 154 - * cdata->max_requests 155 - * ia->ri_max_segs 156 - */ 157 - static int 158 - fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, 159 - struct rpcrdma_create_data_internal *cdata) 160 - { 161 - int max_qp_wr; 162 - 163 - max_qp_wr = ia->ri_device->attrs.max_qp_wr; 164 - max_qp_wr -= RPCRDMA_BACKWARD_WRS; 165 - max_qp_wr -= 1; 166 - if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) 167 - return -ENOMEM; 168 - if (cdata->max_requests > max_qp_wr) 169 - cdata->max_requests = max_qp_wr; 170 - ep->rep_attr.cap.max_send_wr = cdata->max_requests; 171 - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; 172 - ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ 173 - ep->rep_attr.cap.max_recv_wr = cdata->max_requests; 174 - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; 175 - ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ 176 - 177 - ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / 178 - RPCRDMA_MAX_FMR_SGES); 179 - /* Reply chunks require segments for head and tail buffers */ 180 - ia->ri_max_segs += 2; 181 - if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS) 182 - ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS; 183 - return 0; 184 - } 185 - 186 - /* FMR mode conveys up to 64 pages of payload per chunk segment. 187 - */ 188 - static size_t 189 - fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) 190 - { 191 - return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, 192 - (r_xprt->rx_ia.ri_max_segs - 2) * RPCRDMA_MAX_FMR_SGES); 193 - } 194 - 195 - /* Use the ib_map_phys_fmr() verb to register a memory region 196 - * for remote access via RDMA READ or RDMA WRITE. 
197 - */ 198 - static struct rpcrdma_mr_seg * 199 - fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 200 - int nsegs, bool writing, struct rpcrdma_mr **out) 201 - { 202 - struct rpcrdma_mr_seg *seg1 = seg; 203 - int len, pageoff, i, rc; 204 - struct rpcrdma_mr *mr; 205 - u64 *dma_pages; 206 - 207 - mr = rpcrdma_mr_get(r_xprt); 208 - if (!mr) 209 - return ERR_PTR(-EAGAIN); 210 - 211 - pageoff = offset_in_page(seg1->mr_offset); 212 - seg1->mr_offset -= pageoff; /* start of page */ 213 - seg1->mr_len += pageoff; 214 - len = -pageoff; 215 - if (nsegs > RPCRDMA_MAX_FMR_SGES) 216 - nsegs = RPCRDMA_MAX_FMR_SGES; 217 - for (i = 0; i < nsegs;) { 218 - if (seg->mr_page) 219 - sg_set_page(&mr->mr_sg[i], 220 - seg->mr_page, 221 - seg->mr_len, 222 - offset_in_page(seg->mr_offset)); 223 - else 224 - sg_set_buf(&mr->mr_sg[i], seg->mr_offset, 225 - seg->mr_len); 226 - len += seg->mr_len; 227 - ++seg; 228 - ++i; 229 - /* Check for holes */ 230 - if ((i < nsegs && offset_in_page(seg->mr_offset)) || 231 - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) 232 - break; 233 - } 234 - mr->mr_dir = rpcrdma_data_dir(writing); 235 - 236 - mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, 237 - mr->mr_sg, i, mr->mr_dir); 238 - if (!mr->mr_nents) 239 - goto out_dmamap_err; 240 - trace_xprtrdma_mr_map(mr); 241 - 242 - for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) 243 - dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); 244 - rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents, 245 - dma_pages[0]); 246 - if (rc) 247 - goto out_maperr; 248 - 249 - mr->mr_handle = mr->fmr.fm_mr->rkey; 250 - mr->mr_length = len; 251 - mr->mr_offset = dma_pages[0] + pageoff; 252 - 253 - *out = mr; 254 - return seg; 255 - 256 - out_dmamap_err: 257 - pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", 258 - mr->mr_sg, i); 259 - rpcrdma_mr_put(mr); 260 - return ERR_PTR(-EIO); 261 - 262 - out_maperr: 263 - pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) 
status %i\n", 264 - len, (unsigned long long)dma_pages[0], 265 - pageoff, mr->mr_nents, rc); 266 - rpcrdma_mr_unmap_and_put(mr); 267 - return ERR_PTR(-EIO); 268 - } 269 - 270 - /* Post Send WR containing the RPC Call message. 271 - */ 272 - static int 273 - fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) 274 - { 275 - return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, NULL); 276 - } 277 - 278 - /* Invalidate all memory regions that were registered for "req". 279 - * 280 - * Sleeps until it is safe for the host CPU to access the 281 - * previously mapped memory regions. 282 - * 283 - * Caller ensures that @mrs is not empty before the call. This 284 - * function empties the list. 285 - */ 286 - static void 287 - fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) 288 - { 289 - struct rpcrdma_mr *mr; 290 - LIST_HEAD(unmap_list); 291 - int rc; 292 - 293 - /* ORDER: Invalidate all of the req's MRs first 294 - * 295 - * ib_unmap_fmr() is slow, so use a single call instead 296 - * of one call per mapped FMR. 297 - */ 298 - list_for_each_entry(mr, mrs, mr_list) { 299 - dprintk("RPC: %s: unmapping fmr %p\n", 300 - __func__, &mr->fmr); 301 - trace_xprtrdma_mr_localinv(mr); 302 - list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); 303 - } 304 - r_xprt->rx_stats.local_inv_needed++; 305 - rc = ib_unmap_fmr(&unmap_list); 306 - if (rc) 307 - goto out_release; 308 - 309 - /* ORDER: Now DMA unmap all of the req's MRs, and return 310 - * them to the free MW list. 
311 - */ 312 - while (!list_empty(mrs)) { 313 - mr = rpcrdma_mr_pop(mrs); 314 - list_del(&mr->fmr.fm_mr->list); 315 - rpcrdma_mr_unmap_and_put(mr); 316 - } 317 - 318 - return; 319 - 320 - out_release: 321 - pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); 322 - 323 - while (!list_empty(mrs)) { 324 - mr = rpcrdma_mr_pop(mrs); 325 - list_del(&mr->fmr.fm_mr->list); 326 - rpcrdma_mr_recycle(mr); 327 - } 328 - } 329 - 330 - const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { 331 - .ro_map = fmr_op_map, 332 - .ro_send = fmr_op_send, 333 - .ro_unmap_sync = fmr_op_unmap_sync, 334 - .ro_open = fmr_op_open, 335 - .ro_maxpages = fmr_op_maxpages, 336 - .ro_init_mr = fmr_op_init_mr, 337 - .ro_release_mr = fmr_op_release_mr, 338 - .ro_displayname = "fmr", 339 - .ro_send_w_inv_ok = 0, 340 - };
-6
net/sunrpc/xprtrdma/verbs.c
··· 397 397 break; 398 398 } 399 399 /*FALLTHROUGH*/ 400 - case RPCRDMA_MTHCAFMR: 401 - if (fmr_is_supported(ia)) { 402 - ia->ri_ops = &rpcrdma_fmr_memreg_ops; 403 - break; 404 - } 405 - /*FALLTHROUGH*/ 406 400 default: 407 401 pr_err("rpcrdma: Device %s does not support memreg mode %d\n", 408 402 ia->ri_device->name, xprt_rdma_memreg_strategy);
+1 -11
net/sunrpc/xprtrdma/xprt_rdma.h
··· 262 262 }; 263 263 }; 264 264 265 - struct rpcrdma_fmr { 266 - struct ib_fmr *fm_mr; 267 - u64 *fm_physaddrs; 268 - }; 269 - 270 265 struct rpcrdma_mr { 271 266 struct list_head mr_list; 272 267 struct scatterlist *mr_sg; 273 268 int mr_nents; 274 269 enum dma_data_direction mr_dir; 275 - union { 276 - struct rpcrdma_fmr fmr; 277 - struct rpcrdma_frwr frwr; 278 - }; 270 + struct rpcrdma_frwr frwr; 279 271 struct rpcrdma_xprt *mr_xprt; 280 272 u32 mr_handle; 281 273 u32 mr_length; ··· 482 490 const int ro_send_w_inv_ok; 483 491 }; 484 492 485 - extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; 486 493 extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; 487 494 488 495 /* ··· 536 545 void rpcrdma_ia_remove(struct rpcrdma_ia *ia); 537 546 void rpcrdma_ia_close(struct rpcrdma_ia *); 538 547 bool frwr_is_supported(struct rpcrdma_ia *); 539 - bool fmr_is_supported(struct rpcrdma_ia *); 540 548 541 549 /* 542 550 * Endpoint calls - xprtrdma/verbs.c