Merge tag 'nfs-rdma-for-4.14-1' of git://git.linux-nfs.org/projects/anna/linux-nfs into linux-next

NFS-over-RDMA client updates for Linux 4.14

Bugfixes and cleanups:
- Constify rpc_xprt_ops
- Harden RPC call encoding and decoding
- Clean up RPC call decoding to use xdr_streams (a sketch of this pattern follows the list)
- Remove unused variables from various structures
- Refactor code to remove imul instructions
- Rearrange rx_stats structure for better cacheline sharing
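
To illustrate the xdr_stream-based decoding this series adopts, here is a minimal sketch (not part of the series; the function name is illustrative) of pulling the four fixed RPC-over-RDMA header words through an xdr_stream and then consulting the new xdr_stream_remaining() helper. It assumes the xdr_buf has already been initialized over the DMA-synced receive buffer, as the patches do with xdr_buf_init().

#include <linux/kernel.h>
#include <linux/sunrpc/xdr.h>

/* Hypothetical example, not from this series: decode the fixed
 * portion of an RPC-over-RDMA transport header via an xdr_stream.
 */
static int example_decode_rpcrdma_header(struct xdr_stream *xdr,
					 struct xdr_buf *hdrbuf)
{
	__be32 *p;

	/* Point the stream at the start of the receive buffer. */
	xdr_init_decode(xdr, hdrbuf, hdrbuf->head[0].iov_base);

	/* xid, vers, credits, proc: four XDR words, bounds-checked
	 * by the stream rather than by hand-rolled length tests.
	 */
	p = xdr_inline_decode(xdr, 4 * sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	pr_debug("xid=0x%08x vers=%u credits=%u proc=%u, %zu bytes remain\n",
		 be32_to_cpup(p), be32_to_cpup(p + 1), be32_to_cpup(p + 2),
		 be32_to_cpup(p + 3), xdr_stream_remaining(xdr));
	return 0;
}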

+604 -433
+13 -0
include/linux/sunrpc/xdr.h
··· 239 239 extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); 240 240 extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); 241 241 242 + /** 243 + * xdr_stream_remaining - Return the number of bytes remaining in the stream 244 + * @xdr: pointer to struct xdr_stream 245 + * 246 + * Return value: 247 + * Number of bytes remaining in @xdr before xdr->end 248 + */ 249 + static inline size_t 250 + xdr_stream_remaining(const struct xdr_stream *xdr) 251 + { 252 + return xdr->nwords << 2; 253 + } 254 + 242 255 ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, 243 256 size_t maxlen, gfp_t gfp_flags); 244 257 /**
+1 -1
include/linux/sunrpc/xprt.h
··· 174 174 175 175 struct rpc_xprt { 176 176 struct kref kref; /* Reference count */ 177 - struct rpc_xprt_ops * ops; /* transport methods */ 177 + const struct rpc_xprt_ops *ops; /* transport methods */ 178 178 179 179 const struct rpc_timeout *timeout; /* timeout parms */ 180 180 struct sockaddr_storage addr; /* server address */
+26 -43
net/sunrpc/xprtrdma/backchannel.c
··· 49 49 if (IS_ERR(rb)) 50 50 goto out_fail; 51 51 req->rl_rdmabuf = rb; 52 + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); 52 53 53 54 size = r_xprt->rx_data.inline_rsize; 54 55 rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); ··· 203 202 */ 204 203 int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) 205 204 { 206 - struct rpc_xprt *xprt = rqst->rq_xprt; 207 - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 205 + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); 208 206 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 209 - struct rpcrdma_msg *headerp; 207 + __be32 *p; 210 208 211 - headerp = rdmab_to_msg(req->rl_rdmabuf); 212 - headerp->rm_xid = rqst->rq_xid; 213 - headerp->rm_vers = rpcrdma_version; 214 - headerp->rm_credit = 215 - cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); 216 - headerp->rm_type = rdma_msg; 217 - headerp->rm_body.rm_chunks[0] = xdr_zero; 218 - headerp->rm_body.rm_chunks[1] = xdr_zero; 219 - headerp->rm_body.rm_chunks[2] = xdr_zero; 209 + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); 210 + xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, 211 + req->rl_rdmabuf->rg_base); 212 + 213 + p = xdr_reserve_space(&req->rl_stream, 28); 214 + if (unlikely(!p)) 215 + return -EIO; 216 + *p++ = rqst->rq_xid; 217 + *p++ = rpcrdma_version; 218 + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); 219 + *p++ = rdma_msg; 220 + *p++ = xdr_zero; 221 + *p++ = xdr_zero; 222 + *p = xdr_zero; 220 223 221 224 if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, 222 225 &rqst->rq_snd_buf, rpcrdma_noch)) ··· 276 271 * @xprt: transport receiving the call 277 272 * @rep: receive buffer containing the call 278 273 * 279 - * Called in the RPC reply handler, which runs in a tasklet. 280 - * Be quick about it. 281 - * 282 274 * Operational assumptions: 283 275 * o Backchannel credits are ignored, just as the NFS server 284 276 * forechannel currently does ··· 286 284 struct rpcrdma_rep *rep) 287 285 { 288 286 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 289 - struct rpcrdma_msg *headerp; 290 287 struct svc_serv *bc_serv; 291 288 struct rpcrdma_req *req; 292 289 struct rpc_rqst *rqst; ··· 293 292 size_t size; 294 293 __be32 *p; 295 294 296 - headerp = rdmab_to_msg(rep->rr_rdmabuf); 295 + p = xdr_inline_decode(&rep->rr_stream, 0); 296 + size = xdr_stream_remaining(&rep->rr_stream); 297 + 297 298 #ifdef RPCRDMA_BACKCHANNEL_DEBUG 298 299 pr_info("RPC: %s: callback XID %08x, length=%u\n", 299 - __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); 300 - pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp); 300 + __func__, be32_to_cpup(p), size); 301 + pr_info("RPC: %s: %*ph\n", __func__, size, p); 301 302 #endif 302 - 303 - /* Sanity check: 304 - * Need at least enough bytes for RPC/RDMA header, as code 305 - * here references the header fields by array offset. Also, 306 - * backward calls are always inline, so ensure there 307 - * are some bytes beyond the RPC/RDMA header. 
308 - */ 309 - if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24) 310 - goto out_short; 311 - p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); 312 - size = rep->rr_len - RPCRDMA_HDRLEN_MIN; 313 303 314 304 /* Grab a free bc rqst */ 315 305 spin_lock(&xprt->bc_pa_lock); ··· 317 325 /* Prepare rqst */ 318 326 rqst->rq_reply_bytes_recvd = 0; 319 327 rqst->rq_bytes_sent = 0; 320 - rqst->rq_xid = headerp->rm_xid; 328 + rqst->rq_xid = *p; 321 329 322 330 rqst->rq_private_buf.len = size; 323 331 set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); ··· 329 337 buf->len = size; 330 338 331 339 /* The receive buffer has to be hooked to the rpcrdma_req 332 - * so that it can be reposted after the server is done 333 - * parsing it but just before sending the backward 334 - * direction reply. 340 + * so that it is not released while the req is pointing 341 + * to its buffer, and so that it can be reposted after 342 + * the Upper Layer is done decoding it. 335 343 */ 336 344 req = rpcr_to_rdmar(rqst); 337 345 dprintk("RPC: %s: attaching rep %p to req %p\n", ··· 359 367 * when the connection is re-established. 360 368 */ 361 369 return; 362 - 363 - out_short: 364 - pr_warn("RPC/RDMA short backward direction call\n"); 365 - 366 - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) 367 - xprt_disconnect_done(xprt); 368 - else 369 - pr_warn("RPC: %s: reposting rep %p\n", 370 - __func__, rep); 371 370 }
+5 -5
net/sunrpc/xprtrdma/fmr_ops.c
··· 177 177 /* Use the ib_map_phys_fmr() verb to register a memory region 178 178 * for remote access via RDMA READ or RDMA WRITE. 179 179 */ 180 - static int 180 + static struct rpcrdma_mr_seg * 181 181 fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 182 182 int nsegs, bool writing, struct rpcrdma_mw **out) 183 183 { ··· 188 188 189 189 mw = rpcrdma_get_mw(r_xprt); 190 190 if (!mw) 191 - return -ENOBUFS; 191 + return ERR_PTR(-ENOBUFS); 192 192 193 193 pageoff = offset_in_page(seg1->mr_offset); 194 194 seg1->mr_offset -= pageoff; /* start of page */ ··· 232 232 mw->mw_offset = dma_pages[0] + pageoff; 233 233 234 234 *out = mw; 235 - return mw->mw_nents; 235 + return seg; 236 236 237 237 out_dmamap_err: 238 238 pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", 239 239 mw->mw_sg, i); 240 240 rpcrdma_put_mw(r_xprt, mw); 241 - return -EIO; 241 + return ERR_PTR(-EIO); 242 242 243 243 out_maperr: 244 244 pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", ··· 247 247 ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, 248 248 mw->mw_sg, mw->mw_nents, mw->mw_dir); 249 249 rpcrdma_put_mw(r_xprt, mw); 250 - return -EIO; 250 + return ERR_PTR(-EIO); 251 251 } 252 252 253 253 /* Invalidate all memory regions that were registered for "req".
+6 -6
net/sunrpc/xprtrdma/frwr_ops.c
··· 344 344 /* Post a REG_MR Work Request to register a memory region 345 345 * for remote access via RDMA READ or RDMA WRITE. 346 346 */ 347 - static int 347 + static struct rpcrdma_mr_seg * 348 348 frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, 349 349 int nsegs, bool writing, struct rpcrdma_mw **out) 350 350 { ··· 364 364 rpcrdma_defer_mr_recovery(mw); 365 365 mw = rpcrdma_get_mw(r_xprt); 366 366 if (!mw) 367 - return -ENOBUFS; 367 + return ERR_PTR(-ENOBUFS); 368 368 } while (mw->frmr.fr_state != FRMR_IS_INVALID); 369 369 frmr = &mw->frmr; 370 370 frmr->fr_state = FRMR_IS_VALID; ··· 429 429 mw->mw_offset = mr->iova; 430 430 431 431 *out = mw; 432 - return mw->mw_nents; 432 + return seg; 433 433 434 434 out_dmamap_err: 435 435 pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", 436 436 mw->mw_sg, i); 437 437 frmr->fr_state = FRMR_IS_INVALID; 438 438 rpcrdma_put_mw(r_xprt, mw); 439 - return -EIO; 439 + return ERR_PTR(-EIO); 440 440 441 441 out_mapmr_err: 442 442 pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", 443 443 frmr->fr_mr, n, mw->mw_nents); 444 444 rpcrdma_defer_mr_recovery(mw); 445 - return -EIO; 445 + return ERR_PTR(-EIO); 446 446 447 447 out_senderr: 448 448 pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); 449 449 rpcrdma_defer_mr_recovery(mw); 450 - return -ENOTCONN; 450 + return ERR_PTR(-ENOTCONN); 451 451 } 452 452 453 453 /* Invalidate all memory regions that were registered for "req".
+513 -347
net/sunrpc/xprtrdma/rpc_rdma.c
··· 169 169 return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; 170 170 } 171 171 172 - /* Split "vec" on page boundaries into segments. FMR registers pages, 173 - * not a byte range. Other modes coalesce these segments into a single 174 - * MR when they can. 172 + /* Split @vec on page boundaries into SGEs. FMR registers pages, not 173 + * a byte range. Other modes coalesce these SGEs into a single MR 174 + * when they can. 175 + * 176 + * Returns pointer to next available SGE, and bumps the total number 177 + * of SGEs consumed. 175 178 */ 176 - static int 177 - rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) 179 + static struct rpcrdma_mr_seg * 180 + rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, 181 + unsigned int *n) 178 182 { 179 - size_t page_offset; 180 - u32 remaining; 183 + u32 remaining, page_offset; 181 184 char *base; 182 185 183 186 base = vec->iov_base; 184 187 page_offset = offset_in_page(base); 185 188 remaining = vec->iov_len; 186 - while (remaining && n < RPCRDMA_MAX_SEGS) { 187 - seg[n].mr_page = NULL; 188 - seg[n].mr_offset = base; 189 - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); 190 - remaining -= seg[n].mr_len; 191 - base += seg[n].mr_len; 192 - ++n; 189 + while (remaining) { 190 + seg->mr_page = NULL; 191 + seg->mr_offset = base; 192 + seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); 193 + remaining -= seg->mr_len; 194 + base += seg->mr_len; 195 + ++seg; 196 + ++(*n); 193 197 page_offset = 0; 194 198 } 195 - return n; 199 + return seg; 196 200 } 197 201 198 - /* 199 - * Chunk assembly from upper layer xdr_buf. 202 + /* Convert @xdrbuf into SGEs no larger than a page each. As they 203 + * are registered, these SGEs are then coalesced into RDMA segments 204 + * when the selected memreg mode supports it. 200 205 * 201 - * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk 202 - * elements. Segments are then coalesced when registered, if possible 203 - * within the selected memreg mode. 204 - * 205 - * Returns positive number of segments converted, or a negative errno. 206 + * Returns positive number of SGEs consumed, or a negative errno. 206 207 */ 207 208 208 209 static int ··· 211 210 unsigned int pos, enum rpcrdma_chunktype type, 212 211 struct rpcrdma_mr_seg *seg) 213 212 { 214 - int len, n, p, page_base; 213 + unsigned long page_base; 214 + unsigned int len, n; 215 215 struct page **ppages; 216 216 217 217 n = 0; 218 - if (pos == 0) { 219 - n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); 220 - if (n == RPCRDMA_MAX_SEGS) 221 - goto out_overflow; 222 - } 218 + if (pos == 0) 219 + seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); 223 220 224 221 len = xdrbuf->page_len; 225 222 ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); 226 223 page_base = offset_in_page(xdrbuf->page_base); 227 - p = 0; 228 - while (len && n < RPCRDMA_MAX_SEGS) { 229 - if (!ppages[p]) { 230 - /* alloc the pagelist for receiving buffer */ 231 - ppages[p] = alloc_page(GFP_ATOMIC); 232 - if (!ppages[p]) 224 + while (len) { 225 + if (unlikely(!*ppages)) { 226 + /* XXX: Certain upper layer operations do 227 + * not provide receive buffer pages. 
228 + */ 229 + *ppages = alloc_page(GFP_ATOMIC); 230 + if (!*ppages) 233 231 return -EAGAIN; 234 232 } 235 - seg[n].mr_page = ppages[p]; 236 - seg[n].mr_offset = (void *)(unsigned long) page_base; 237 - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); 238 - if (seg[n].mr_len > PAGE_SIZE) 239 - goto out_overflow; 240 - len -= seg[n].mr_len; 233 + seg->mr_page = *ppages; 234 + seg->mr_offset = (char *)page_base; 235 + seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); 236 + len -= seg->mr_len; 237 + ++ppages; 238 + ++seg; 241 239 ++n; 242 - ++p; 243 - page_base = 0; /* page offset only applies to first page */ 240 + page_base = 0; 244 241 } 245 - 246 - /* Message overflows the seg array */ 247 - if (len && n == RPCRDMA_MAX_SEGS) 248 - goto out_overflow; 249 242 250 243 /* When encoding a Read chunk, the tail iovec contains an 251 244 * XDR pad and may be omitted. 252 245 */ 253 246 if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) 254 - return n; 247 + goto out; 255 248 256 249 /* When encoding a Write chunk, some servers need to see an 257 250 * extra segment for non-XDR-aligned Write chunks. The upper ··· 253 258 * for this purpose. 254 259 */ 255 260 if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) 256 - return n; 261 + goto out; 257 262 258 - if (xdrbuf->tail[0].iov_len) { 259 - n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); 260 - if (n == RPCRDMA_MAX_SEGS) 261 - goto out_overflow; 262 - } 263 + if (xdrbuf->tail[0].iov_len) 264 + seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); 263 265 266 + out: 267 + if (unlikely(n > RPCRDMA_MAX_SEGS)) 268 + return -EIO; 264 269 return n; 265 - 266 - out_overflow: 267 - pr_err("rpcrdma: segment array overflow\n"); 268 - return -EIO; 269 270 } 270 271 271 - static inline __be32 * 272 + static inline int 273 + encode_item_present(struct xdr_stream *xdr) 274 + { 275 + __be32 *p; 276 + 277 + p = xdr_reserve_space(xdr, sizeof(*p)); 278 + if (unlikely(!p)) 279 + return -EMSGSIZE; 280 + 281 + *p = xdr_one; 282 + return 0; 283 + } 284 + 285 + static inline int 286 + encode_item_not_present(struct xdr_stream *xdr) 287 + { 288 + __be32 *p; 289 + 290 + p = xdr_reserve_space(xdr, sizeof(*p)); 291 + if (unlikely(!p)) 292 + return -EMSGSIZE; 293 + 294 + *p = xdr_zero; 295 + return 0; 296 + } 297 + 298 + static void 272 299 xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) 273 300 { 274 301 *iptr++ = cpu_to_be32(mw->mw_handle); 275 302 *iptr++ = cpu_to_be32(mw->mw_length); 276 - return xdr_encode_hyper(iptr, mw->mw_offset); 303 + xdr_encode_hyper(iptr, mw->mw_offset); 277 304 } 278 305 279 - /* XDR-encode the Read list. Supports encoding a list of read 306 + static int 307 + encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) 308 + { 309 + __be32 *p; 310 + 311 + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 312 + if (unlikely(!p)) 313 + return -EMSGSIZE; 314 + 315 + xdr_encode_rdma_segment(p, mw); 316 + return 0; 317 + } 318 + 319 + static int 320 + encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, 321 + u32 position) 322 + { 323 + __be32 *p; 324 + 325 + p = xdr_reserve_space(xdr, 6 * sizeof(*p)); 326 + if (unlikely(!p)) 327 + return -EMSGSIZE; 328 + 329 + *p++ = xdr_one; /* Item present */ 330 + *p++ = cpu_to_be32(position); 331 + xdr_encode_rdma_segment(p, mw); 332 + return 0; 333 + } 334 + 335 + /* Register and XDR encode the Read list. Supports encoding a list of read 280 336 * segments that belong to a single read chunk. 
281 337 * 282 338 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): ··· 336 290 * N elements, position P (same P for all chunks of same arg!): 337 291 * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 338 292 * 339 - * Returns a pointer to the XDR word in the RDMA header following 340 - * the end of the Read list, or an error pointer. 293 + * Returns zero on success, or a negative errno if a failure occurred. 294 + * @xdr is advanced to the next position in the stream. 295 + * 296 + * Only a single @pos value is currently supported. 341 297 */ 342 - static __be32 * 343 - rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, 344 - struct rpcrdma_req *req, struct rpc_rqst *rqst, 345 - __be32 *iptr, enum rpcrdma_chunktype rtype) 298 + static noinline int 299 + rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 300 + struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) 346 301 { 302 + struct xdr_stream *xdr = &req->rl_stream; 347 303 struct rpcrdma_mr_seg *seg; 348 304 struct rpcrdma_mw *mw; 349 305 unsigned int pos; 350 - int n, nsegs; 351 - 352 - if (rtype == rpcrdma_noch) { 353 - *iptr++ = xdr_zero; /* item not present */ 354 - return iptr; 355 - } 306 + int nsegs; 356 307 357 308 pos = rqst->rq_snd_buf.head[0].iov_len; 358 309 if (rtype == rpcrdma_areadch) ··· 358 315 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, 359 316 rtype, seg); 360 317 if (nsegs < 0) 361 - return ERR_PTR(nsegs); 318 + return nsegs; 362 319 363 320 do { 364 - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 365 - false, &mw); 366 - if (n < 0) 367 - return ERR_PTR(n); 321 + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 322 + false, &mw); 323 + if (IS_ERR(seg)) 324 + return PTR_ERR(seg); 368 325 rpcrdma_push_mw(mw, &req->rl_registered); 369 326 370 - *iptr++ = xdr_one; /* item present */ 371 - 372 - /* All read segments in this chunk 373 - * have the same "position". 374 - */ 375 - *iptr++ = cpu_to_be32(pos); 376 - iptr = xdr_encode_rdma_segment(iptr, mw); 327 + if (encode_read_segment(xdr, mw, pos) < 0) 328 + return -EMSGSIZE; 377 329 378 330 dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", 379 331 rqst->rq_task->tk_pid, __func__, pos, 380 332 mw->mw_length, (unsigned long long)mw->mw_offset, 381 - mw->mw_handle, n < nsegs ? "more" : "last"); 333 + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); 382 334 383 335 r_xprt->rx_stats.read_chunk_count++; 384 - seg += n; 385 - nsegs -= n; 336 + nsegs -= mw->mw_nents; 386 337 } while (nsegs); 387 338 388 - /* Finish Read list */ 389 - *iptr++ = xdr_zero; /* Next item not present */ 390 - return iptr; 339 + return 0; 391 340 } 392 341 393 - /* XDR-encode the Write list. Supports encoding a list containing 394 - * one array of plain segments that belong to a single write chunk. 342 + /* Register and XDR encode the Write list. Supports encoding a list 343 + * containing one array of plain segments that belong to a single 344 + * write chunk. 395 345 * 396 346 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 397 347 * ··· 392 356 * N elements: 393 357 * 1 - N - HLOO - HLOO - ... - HLOO - 0 394 358 * 395 - * Returns a pointer to the XDR word in the RDMA header following 396 - * the end of the Write list, or an error pointer. 359 + * Returns zero on success, or a negative errno if a failure occurred. 360 + * @xdr is advanced to the next position in the stream. 361 + * 362 + * Only a single Write chunk is currently supported. 
397 363 */ 398 - static __be32 * 364 + static noinline int 399 365 rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 400 - struct rpc_rqst *rqst, __be32 *iptr, 401 - enum rpcrdma_chunktype wtype) 366 + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) 402 367 { 368 + struct xdr_stream *xdr = &req->rl_stream; 403 369 struct rpcrdma_mr_seg *seg; 404 370 struct rpcrdma_mw *mw; 405 - int n, nsegs, nchunks; 371 + int nsegs, nchunks; 406 372 __be32 *segcount; 407 - 408 - if (wtype != rpcrdma_writech) { 409 - *iptr++ = xdr_zero; /* no Write list present */ 410 - return iptr; 411 - } 412 373 413 374 seg = req->rl_segments; 414 375 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 415 376 rqst->rq_rcv_buf.head[0].iov_len, 416 377 wtype, seg); 417 378 if (nsegs < 0) 418 - return ERR_PTR(nsegs); 379 + return nsegs; 419 380 420 - *iptr++ = xdr_one; /* Write list present */ 421 - segcount = iptr++; /* save location of segment count */ 381 + if (encode_item_present(xdr) < 0) 382 + return -EMSGSIZE; 383 + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 384 + if (unlikely(!segcount)) 385 + return -EMSGSIZE; 386 + /* Actual value encoded below */ 422 387 423 388 nchunks = 0; 424 389 do { 425 - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 426 - true, &mw); 427 - if (n < 0) 428 - return ERR_PTR(n); 390 + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 391 + true, &mw); 392 + if (IS_ERR(seg)) 393 + return PTR_ERR(seg); 429 394 rpcrdma_push_mw(mw, &req->rl_registered); 430 395 431 - iptr = xdr_encode_rdma_segment(iptr, mw); 396 + if (encode_rdma_segment(xdr, mw) < 0) 397 + return -EMSGSIZE; 432 398 433 399 dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n", 434 400 rqst->rq_task->tk_pid, __func__, 435 401 mw->mw_length, (unsigned long long)mw->mw_offset, 436 - mw->mw_handle, n < nsegs ? "more" : "last"); 402 + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); 437 403 438 404 r_xprt->rx_stats.write_chunk_count++; 439 405 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 440 406 nchunks++; 441 - seg += n; 442 - nsegs -= n; 407 + nsegs -= mw->mw_nents; 443 408 } while (nsegs); 444 409 445 410 /* Update count of segments in this Write chunk */ 446 411 *segcount = cpu_to_be32(nchunks); 447 412 448 - /* Finish Write list */ 449 - *iptr++ = xdr_zero; /* Next item not present */ 450 - return iptr; 413 + return 0; 451 414 } 452 415 453 - /* XDR-encode the Reply chunk. Supports encoding an array of plain 454 - * segments that belong to a single write (reply) chunk. 416 + /* Register and XDR encode the Reply chunk. Supports encoding an array 417 + * of plain segments that belong to a single write (reply) chunk. 455 418 * 456 419 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): 457 420 * ··· 458 423 * N elements: 459 424 * 1 - N - HLOO - HLOO - ... - HLOO 460 425 * 461 - * Returns a pointer to the XDR word in the RDMA header following 462 - * the end of the Reply chunk, or an error pointer. 426 + * Returns zero on success, or a negative errno if a failure occurred. 427 + * @xdr is advanced to the next position in the stream. 
463 428 */ 464 - static __be32 * 465 - rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, 466 - struct rpcrdma_req *req, struct rpc_rqst *rqst, 467 - __be32 *iptr, enum rpcrdma_chunktype wtype) 429 + static noinline int 430 + rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, 431 + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) 468 432 { 433 + struct xdr_stream *xdr = &req->rl_stream; 469 434 struct rpcrdma_mr_seg *seg; 470 435 struct rpcrdma_mw *mw; 471 - int n, nsegs, nchunks; 436 + int nsegs, nchunks; 472 437 __be32 *segcount; 473 - 474 - if (wtype != rpcrdma_replych) { 475 - *iptr++ = xdr_zero; /* no Reply chunk present */ 476 - return iptr; 477 - } 478 438 479 439 seg = req->rl_segments; 480 440 nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); 481 441 if (nsegs < 0) 482 - return ERR_PTR(nsegs); 442 + return nsegs; 483 443 484 - *iptr++ = xdr_one; /* Reply chunk present */ 485 - segcount = iptr++; /* save location of segment count */ 444 + if (encode_item_present(xdr) < 0) 445 + return -EMSGSIZE; 446 + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); 447 + if (unlikely(!segcount)) 448 + return -EMSGSIZE; 449 + /* Actual value encoded below */ 486 450 487 451 nchunks = 0; 488 452 do { 489 - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 490 - true, &mw); 491 - if (n < 0) 492 - return ERR_PTR(n); 453 + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, 454 + true, &mw); 455 + if (IS_ERR(seg)) 456 + return PTR_ERR(seg); 493 457 rpcrdma_push_mw(mw, &req->rl_registered); 494 458 495 - iptr = xdr_encode_rdma_segment(iptr, mw); 459 + if (encode_rdma_segment(xdr, mw) < 0) 460 + return -EMSGSIZE; 496 461 497 462 dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", 498 463 rqst->rq_task->tk_pid, __func__, 499 464 mw->mw_length, (unsigned long long)mw->mw_offset, 500 - mw->mw_handle, n < nsegs ? "more" : "last"); 465 + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); 501 466 502 467 r_xprt->rx_stats.reply_chunk_count++; 503 468 r_xprt->rx_stats.total_rdma_request += seg->mr_len; 504 469 nchunks++; 505 - seg += n; 506 - nsegs -= n; 470 + nsegs -= mw->mw_nents; 507 471 } while (nsegs); 508 472 509 473 /* Update count of segments in the Reply chunk */ 510 474 *segcount = cpu_to_be32(nchunks); 511 475 512 - return iptr; 476 + return 0; 513 477 } 514 478 515 479 /* Prepare the RPC-over-RDMA header SGE. ··· 685 651 req->rl_mapped_sges = 0; 686 652 } 687 653 688 - /* 689 - * Marshal a request: the primary job of this routine is to choose 690 - * the transfer modes. See comments below. 654 + /** 655 + * rpcrdma_marshal_req - Marshal and send one RPC request 656 + * @r_xprt: controlling transport 657 + * @rqst: RPC request to be marshaled 691 658 * 692 - * Returns zero on success, otherwise a negative errno. 659 + * For the RPC in "rqst", this function: 660 + * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) 661 + * - Registers Read, Write, and Reply chunks 662 + * - Constructs the transport header 663 + * - Posts a Send WR to send the transport header and request 664 + * 665 + * Returns: 666 + * %0 if the RPC was sent successfully, 667 + * %-ENOTCONN if the connection was lost, 668 + * %-EAGAIN if not enough pages are available for on-demand reply buffer, 669 + * %-ENOBUFS if no MRs are available to register chunks, 670 + * %-EMSGSIZE if the transport header is too small, 671 + * %-EIO if a permanent problem occurred while marshaling. 
693 672 */ 694 - 695 673 int 696 - rpcrdma_marshal_req(struct rpc_rqst *rqst) 674 + rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) 697 675 { 698 - struct rpc_xprt *xprt = rqst->rq_xprt; 699 - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); 700 676 struct rpcrdma_req *req = rpcr_to_rdmar(rqst); 677 + struct xdr_stream *xdr = &req->rl_stream; 701 678 enum rpcrdma_chunktype rtype, wtype; 702 - struct rpcrdma_msg *headerp; 703 679 bool ddp_allowed; 704 - ssize_t hdrlen; 705 - size_t rpclen; 706 - __be32 *iptr; 680 + __be32 *p; 681 + int ret; 707 682 708 683 #if defined(CONFIG_SUNRPC_BACKCHANNEL) 709 684 if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) 710 685 return rpcrdma_bc_marshal_reply(rqst); 711 686 #endif 712 687 713 - headerp = rdmab_to_msg(req->rl_rdmabuf); 714 - /* don't byte-swap XID, it's already done in request */ 715 - headerp->rm_xid = rqst->rq_xid; 716 - headerp->rm_vers = rpcrdma_version; 717 - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 718 - headerp->rm_type = rdma_msg; 688 + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); 689 + xdr_init_encode(xdr, &req->rl_hdrbuf, 690 + req->rl_rdmabuf->rg_base); 691 + 692 + /* Fixed header fields */ 693 + ret = -EMSGSIZE; 694 + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); 695 + if (!p) 696 + goto out_err; 697 + *p++ = rqst->rq_xid; 698 + *p++ = rpcrdma_version; 699 + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); 719 700 720 701 /* When the ULP employs a GSS flavor that guarantees integrity 721 702 * or privacy, direct data placement of individual data items ··· 770 721 * by themselves are larger than the inline threshold. 771 722 */ 772 723 if (rpcrdma_args_inline(r_xprt, rqst)) { 724 + *p++ = rdma_msg; 773 725 rtype = rpcrdma_noch; 774 - rpclen = rqst->rq_snd_buf.len; 775 726 } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { 727 + *p++ = rdma_msg; 776 728 rtype = rpcrdma_readch; 777 - rpclen = rqst->rq_snd_buf.head[0].iov_len + 778 - rqst->rq_snd_buf.tail[0].iov_len; 779 729 } else { 780 730 r_xprt->rx_stats.nomsg_call_count++; 781 - headerp->rm_type = htonl(RDMA_NOMSG); 731 + *p++ = rdma_nomsg; 782 732 rtype = rpcrdma_areadch; 783 - rpclen = 0; 784 733 } 785 734 786 735 req->rl_xid = rqst->rq_xid; ··· 806 759 * send a Call message with a Position Zero Read chunk and a 807 760 * regular Read chunk at the same time. 
808 761 */ 809 - iptr = headerp->rm_body.rm_chunks; 810 - iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); 811 - if (IS_ERR(iptr)) 762 + if (rtype != rpcrdma_noch) { 763 + ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); 764 + if (ret) 765 + goto out_err; 766 + } 767 + ret = encode_item_not_present(xdr); 768 + if (ret) 812 769 goto out_err; 813 - iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); 814 - if (IS_ERR(iptr)) 815 - goto out_err; 816 - iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); 817 - if (IS_ERR(iptr)) 818 - goto out_err; 819 - hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; 820 770 821 - dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", 771 + if (wtype == rpcrdma_writech) { 772 + ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); 773 + if (ret) 774 + goto out_err; 775 + } 776 + ret = encode_item_not_present(xdr); 777 + if (ret) 778 + goto out_err; 779 + 780 + if (wtype != rpcrdma_replych) 781 + ret = encode_item_not_present(xdr); 782 + else 783 + ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); 784 + if (ret) 785 + goto out_err; 786 + 787 + dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n", 822 788 rqst->rq_task->tk_pid, __func__, 823 789 transfertypes[rtype], transfertypes[wtype], 824 - hdrlen, rpclen); 790 + xdr_stream_pos(xdr)); 825 791 826 - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, 792 + if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, 793 + xdr_stream_pos(xdr), 827 794 &rqst->rq_snd_buf, rtype)) { 828 - iptr = ERR_PTR(-EIO); 795 + ret = -EIO; 829 796 goto out_err; 830 797 } 831 798 return 0; 832 799 833 800 out_err: 834 - if (PTR_ERR(iptr) != -ENOBUFS) { 835 - pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", 836 - PTR_ERR(iptr)); 801 + if (ret != -ENOBUFS) { 802 + pr_err("rpcrdma: header marshaling failed (%d)\n", ret); 837 803 r_xprt->rx_stats.failed_marshal_count++; 838 804 } 839 - return PTR_ERR(iptr); 840 - } 841 - 842 - /* 843 - * Chase down a received write or reply chunklist to get length 844 - * RDMA'd by server. See map at rpcrdma_create_chunks()! 
:-) 845 - */ 846 - static int 847 - rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) 848 - { 849 - unsigned int i, total_len; 850 - struct rpcrdma_write_chunk *cur_wchunk; 851 - char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); 852 - 853 - i = be32_to_cpu(**iptrp); 854 - cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); 855 - total_len = 0; 856 - while (i--) { 857 - struct rpcrdma_segment *seg = &cur_wchunk->wc_target; 858 - ifdebug(FACILITY) { 859 - u64 off; 860 - xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); 861 - dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n", 862 - __func__, 863 - be32_to_cpu(seg->rs_length), 864 - (unsigned long long)off, 865 - be32_to_cpu(seg->rs_handle)); 866 - } 867 - total_len += be32_to_cpu(seg->rs_length); 868 - ++cur_wchunk; 869 - } 870 - /* check and adjust for properly terminated write chunk */ 871 - if (wrchunk) { 872 - __be32 *w = (__be32 *) cur_wchunk; 873 - if (*w++ != xdr_zero) 874 - return -1; 875 - cur_wchunk = (struct rpcrdma_write_chunk *) w; 876 - } 877 - if ((char *)cur_wchunk > base + rep->rr_len) 878 - return -1; 879 - 880 - *iptrp = (__be32 *) cur_wchunk; 881 - return total_len; 805 + return ret; 882 806 } 883 807 884 808 /** ··· 967 949 } 968 950 } 969 951 970 - #if defined(CONFIG_SUNRPC_BACKCHANNEL) 971 952 /* By convention, backchannel calls arrive via rdma_msg type 972 953 * messages, and never populate the chunk lists. This makes 973 954 * the RPC/RDMA header small and fixed in size, so it is 974 955 * straightforward to check the RPC header's direction field. 975 956 */ 976 957 static bool 977 - rpcrdma_is_bcall(struct rpcrdma_msg *headerp) 958 + rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 959 + __be32 xid, __be32 proc) 960 + #if defined(CONFIG_SUNRPC_BACKCHANNEL) 978 961 { 979 - __be32 *p = (__be32 *)headerp; 962 + struct xdr_stream *xdr = &rep->rr_stream; 963 + __be32 *p; 980 964 981 - if (headerp->rm_type != rdma_msg) 982 - return false; 983 - if (headerp->rm_body.rm_chunks[0] != xdr_zero) 984 - return false; 985 - if (headerp->rm_body.rm_chunks[1] != xdr_zero) 986 - return false; 987 - if (headerp->rm_body.rm_chunks[2] != xdr_zero) 965 + if (proc != rdma_msg) 988 966 return false; 989 967 990 - /* sanity */ 991 - if (p[7] != headerp->rm_xid) 968 + /* Peek at stream contents without advancing. */ 969 + p = xdr_inline_decode(xdr, 0); 970 + 971 + /* Chunk lists */ 972 + if (*p++ != xdr_zero) 992 973 return false; 993 - /* call direction */ 994 - if (p[8] != cpu_to_be32(RPC_CALL)) 974 + if (*p++ != xdr_zero) 975 + return false; 976 + if (*p++ != xdr_zero) 995 977 return false; 996 978 979 + /* RPC header */ 980 + if (*p++ != xid) 981 + return false; 982 + if (*p != cpu_to_be32(RPC_CALL)) 983 + return false; 984 + 985 + /* Now that we are sure this is a backchannel call, 986 + * advance to the RPC header. 
987 + */ 988 + p = xdr_inline_decode(xdr, 3 * sizeof(*p)); 989 + if (unlikely(!p)) 990 + goto out_short; 991 + 992 + rpcrdma_bc_receive_call(r_xprt, rep); 993 + return true; 994 + 995 + out_short: 996 + pr_warn("RPC/RDMA short backward direction call\n"); 997 + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) 998 + xprt_disconnect_done(&r_xprt->rx_xprt); 997 999 return true; 998 1000 } 1001 + #else /* CONFIG_SUNRPC_BACKCHANNEL */ 1002 + { 1003 + return false; 1004 + } 999 1005 #endif /* CONFIG_SUNRPC_BACKCHANNEL */ 1006 + 1007 + static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) 1008 + { 1009 + __be32 *p; 1010 + 1011 + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); 1012 + if (unlikely(!p)) 1013 + return -EIO; 1014 + 1015 + ifdebug(FACILITY) { 1016 + u64 offset; 1017 + u32 handle; 1018 + 1019 + handle = be32_to_cpup(p++); 1020 + *length = be32_to_cpup(p++); 1021 + xdr_decode_hyper(p, &offset); 1022 + dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n", 1023 + __func__, *length, (unsigned long long)offset, 1024 + handle); 1025 + } else { 1026 + *length = be32_to_cpup(p + 1); 1027 + } 1028 + 1029 + return 0; 1030 + } 1031 + 1032 + static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) 1033 + { 1034 + u32 segcount, seglength; 1035 + __be32 *p; 1036 + 1037 + p = xdr_inline_decode(xdr, sizeof(*p)); 1038 + if (unlikely(!p)) 1039 + return -EIO; 1040 + 1041 + *length = 0; 1042 + segcount = be32_to_cpup(p); 1043 + while (segcount--) { 1044 + if (decode_rdma_segment(xdr, &seglength)) 1045 + return -EIO; 1046 + *length += seglength; 1047 + } 1048 + 1049 + dprintk("RPC: %s: segcount=%u, %u bytes\n", 1050 + __func__, be32_to_cpup(p), *length); 1051 + return 0; 1052 + } 1053 + 1054 + /* In RPC-over-RDMA Version One replies, a Read list is never 1055 + * expected. This decoder is a stub that returns an error if 1056 + * a Read list is present. 
1057 + */ 1058 + static int decode_read_list(struct xdr_stream *xdr) 1059 + { 1060 + __be32 *p; 1061 + 1062 + p = xdr_inline_decode(xdr, sizeof(*p)); 1063 + if (unlikely(!p)) 1064 + return -EIO; 1065 + if (unlikely(*p != xdr_zero)) 1066 + return -EIO; 1067 + return 0; 1068 + } 1069 + 1070 + /* Supports only one Write chunk in the Write list 1071 + */ 1072 + static int decode_write_list(struct xdr_stream *xdr, u32 *length) 1073 + { 1074 + u32 chunklen; 1075 + bool first; 1076 + __be32 *p; 1077 + 1078 + *length = 0; 1079 + first = true; 1080 + do { 1081 + p = xdr_inline_decode(xdr, sizeof(*p)); 1082 + if (unlikely(!p)) 1083 + return -EIO; 1084 + if (*p == xdr_zero) 1085 + break; 1086 + if (!first) 1087 + return -EIO; 1088 + 1089 + if (decode_write_chunk(xdr, &chunklen)) 1090 + return -EIO; 1091 + *length += chunklen; 1092 + first = false; 1093 + } while (true); 1094 + return 0; 1095 + } 1096 + 1097 + static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) 1098 + { 1099 + __be32 *p; 1100 + 1101 + p = xdr_inline_decode(xdr, sizeof(*p)); 1102 + if (unlikely(!p)) 1103 + return -EIO; 1104 + 1105 + *length = 0; 1106 + if (*p != xdr_zero) 1107 + if (decode_write_chunk(xdr, length)) 1108 + return -EIO; 1109 + return 0; 1110 + } 1111 + 1112 + static int 1113 + rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1114 + struct rpc_rqst *rqst) 1115 + { 1116 + struct xdr_stream *xdr = &rep->rr_stream; 1117 + u32 writelist, replychunk, rpclen; 1118 + char *base; 1119 + 1120 + /* Decode the chunk lists */ 1121 + if (decode_read_list(xdr)) 1122 + return -EIO; 1123 + if (decode_write_list(xdr, &writelist)) 1124 + return -EIO; 1125 + if (decode_reply_chunk(xdr, &replychunk)) 1126 + return -EIO; 1127 + 1128 + /* RDMA_MSG sanity checks */ 1129 + if (unlikely(replychunk)) 1130 + return -EIO; 1131 + 1132 + /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ 1133 + base = (char *)xdr_inline_decode(xdr, 0); 1134 + rpclen = xdr_stream_remaining(xdr); 1135 + r_xprt->rx_stats.fixup_copy_count += 1136 + rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); 1137 + 1138 + r_xprt->rx_stats.total_rdma_reply += writelist; 1139 + return rpclen + xdr_align_size(writelist); 1140 + } 1141 + 1142 + static noinline int 1143 + rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) 1144 + { 1145 + struct xdr_stream *xdr = &rep->rr_stream; 1146 + u32 writelist, replychunk; 1147 + 1148 + /* Decode the chunk lists */ 1149 + if (decode_read_list(xdr)) 1150 + return -EIO; 1151 + if (decode_write_list(xdr, &writelist)) 1152 + return -EIO; 1153 + if (decode_reply_chunk(xdr, &replychunk)) 1154 + return -EIO; 1155 + 1156 + /* RDMA_NOMSG sanity checks */ 1157 + if (unlikely(writelist)) 1158 + return -EIO; 1159 + if (unlikely(!replychunk)) 1160 + return -EIO; 1161 + 1162 + /* Reply chunk buffer already is the reply vector */ 1163 + r_xprt->rx_stats.total_rdma_reply += replychunk; 1164 + return replychunk; 1165 + } 1166 + 1167 + static noinline int 1168 + rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, 1169 + struct rpc_rqst *rqst) 1170 + { 1171 + struct xdr_stream *xdr = &rep->rr_stream; 1172 + __be32 *p; 1173 + 1174 + p = xdr_inline_decode(xdr, sizeof(*p)); 1175 + if (unlikely(!p)) 1176 + return -EIO; 1177 + 1178 + switch (*p) { 1179 + case err_vers: 1180 + p = xdr_inline_decode(xdr, 2 * sizeof(*p)); 1181 + if (!p) 1182 + break; 1183 + dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", 1184 + rqst->rq_task->tk_pid, __func__, 1185 + 
be32_to_cpup(p), be32_to_cpu(*(p + 1))); 1186 + break; 1187 + case err_chunk: 1188 + dprintk("RPC: %5u: %s: server reports header decoding error\n", 1189 + rqst->rq_task->tk_pid, __func__); 1190 + break; 1191 + default: 1192 + dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", 1193 + rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); 1194 + } 1195 + 1196 + r_xprt->rx_stats.bad_reply_count++; 1197 + return -EREMOTEIO; 1198 + } 1000 1199 1001 1200 /* Process received RPC/RDMA messages. 1002 1201 * ··· 1228 993 struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; 1229 994 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1230 995 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 1231 - struct rpcrdma_msg *headerp; 996 + struct xdr_stream *xdr = &rep->rr_stream; 1232 997 struct rpcrdma_req *req; 1233 998 struct rpc_rqst *rqst; 1234 - __be32 *iptr; 1235 - int rdmalen, status, rmerr; 999 + __be32 *p, xid, vers, proc; 1236 1000 unsigned long cwnd; 1237 1001 struct list_head mws; 1002 + int status; 1238 1003 1239 1004 dprintk("RPC: %s: incoming rep %p\n", __func__, rep); 1240 1005 1241 - if (rep->rr_len == RPCRDMA_BAD_LEN) 1006 + if (rep->rr_hdrbuf.head[0].iov_len == 0) 1242 1007 goto out_badstatus; 1243 - if (rep->rr_len < RPCRDMA_HDRLEN_ERR) 1244 - goto out_shortreply; 1245 1008 1246 - headerp = rdmab_to_msg(rep->rr_rdmabuf); 1247 - #if defined(CONFIG_SUNRPC_BACKCHANNEL) 1248 - if (rpcrdma_is_bcall(headerp)) 1249 - goto out_bcall; 1250 - #endif 1009 + xdr_init_decode(xdr, &rep->rr_hdrbuf, 1010 + rep->rr_hdrbuf.head[0].iov_base); 1011 + 1012 + /* Fixed transport header fields */ 1013 + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); 1014 + if (unlikely(!p)) 1015 + goto out_shortreply; 1016 + xid = *p++; 1017 + vers = *p++; 1018 + p++; /* credits */ 1019 + proc = *p++; 1020 + 1021 + if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) 1022 + return; 1251 1023 1252 1024 /* Match incoming rpcrdma_rep to an rpcrdma_req to 1253 1025 * get context for handling any incoming chunks. 1254 1026 */ 1255 1027 spin_lock(&buf->rb_lock); 1256 - req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, 1257 - headerp->rm_xid); 1028 + req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, xid); 1258 1029 if (!req) 1259 1030 goto out_nomatch; 1260 1031 if (req->rl_reply) ··· 1276 1035 spin_unlock(&buf->rb_lock); 1277 1036 1278 1037 dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", 1279 - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); 1038 + __func__, rep, req, be32_to_cpu(xid)); 1280 1039 1281 1040 /* Invalidate and unmap the data payloads before waking the 1282 1041 * waiting application. This guarantees the memory regions ··· 1293 1052 * the rep, rqst, and rq_task pointers remain stable. 1294 1053 */ 1295 1054 spin_lock(&xprt->recv_lock); 1296 - rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); 1055 + rqst = xprt_lookup_rqst(xprt, xid); 1297 1056 if (!rqst) 1298 1057 goto out_norqst; 1299 1058 xprt->reestablish_timeout = 0; 1300 - if (headerp->rm_vers != rpcrdma_version) 1059 + if (vers != rpcrdma_version) 1301 1060 goto out_badversion; 1302 1061 1303 - /* check for expected message types */ 1304 - /* The order of some of these tests is important. 
*/ 1305 - switch (headerp->rm_type) { 1062 + switch (proc) { 1306 1063 case rdma_msg: 1307 - /* never expect read chunks */ 1308 - /* never expect reply chunks (two ways to check) */ 1309 - if (headerp->rm_body.rm_chunks[0] != xdr_zero || 1310 - (headerp->rm_body.rm_chunks[1] == xdr_zero && 1311 - headerp->rm_body.rm_chunks[2] != xdr_zero)) 1312 - goto badheader; 1313 - if (headerp->rm_body.rm_chunks[1] != xdr_zero) { 1314 - /* count any expected write chunks in read reply */ 1315 - /* start at write chunk array count */ 1316 - iptr = &headerp->rm_body.rm_chunks[2]; 1317 - rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); 1318 - /* check for validity, and no reply chunk after */ 1319 - if (rdmalen < 0 || *iptr++ != xdr_zero) 1320 - goto badheader; 1321 - rep->rr_len -= 1322 - ((unsigned char *)iptr - (unsigned char *)headerp); 1323 - status = rep->rr_len + rdmalen; 1324 - r_xprt->rx_stats.total_rdma_reply += rdmalen; 1325 - /* special case - last chunk may omit padding */ 1326 - if (rdmalen &= 3) { 1327 - rdmalen = 4 - rdmalen; 1328 - status += rdmalen; 1329 - } 1330 - } else { 1331 - /* else ordinary inline */ 1332 - rdmalen = 0; 1333 - iptr = (__be32 *)((unsigned char *)headerp + 1334 - RPCRDMA_HDRLEN_MIN); 1335 - rep->rr_len -= RPCRDMA_HDRLEN_MIN; 1336 - status = rep->rr_len; 1337 - } 1338 - 1339 - r_xprt->rx_stats.fixup_copy_count += 1340 - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, 1341 - rdmalen); 1064 + status = rpcrdma_decode_msg(r_xprt, rep, rqst); 1342 1065 break; 1343 - 1344 1066 case rdma_nomsg: 1345 - /* never expect read or write chunks, always reply chunks */ 1346 - if (headerp->rm_body.rm_chunks[0] != xdr_zero || 1347 - headerp->rm_body.rm_chunks[1] != xdr_zero || 1348 - headerp->rm_body.rm_chunks[2] != xdr_one) 1349 - goto badheader; 1350 - iptr = (__be32 *)((unsigned char *)headerp + 1351 - RPCRDMA_HDRLEN_MIN); 1352 - rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); 1353 - if (rdmalen < 0) 1354 - goto badheader; 1355 - r_xprt->rx_stats.total_rdma_reply += rdmalen; 1356 - /* Reply chunk buffer already is the reply vector - no fixup. */ 1357 - status = rdmalen; 1067 + status = rpcrdma_decode_nomsg(r_xprt, rep); 1358 1068 break; 1359 - 1360 1069 case rdma_error: 1361 - goto out_rdmaerr; 1362 - 1363 - badheader: 1364 - default: 1365 - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", 1366 - rqst->rq_task->tk_pid, __func__, 1367 - be32_to_cpu(headerp->rm_type)); 1368 - status = -EIO; 1369 - r_xprt->rx_stats.bad_reply_count++; 1070 + status = rpcrdma_decode_error(r_xprt, rep, rqst); 1370 1071 break; 1072 + default: 1073 + status = -EIO; 1371 1074 } 1075 + if (status < 0) 1076 + goto out_badheader; 1372 1077 1373 1078 out: 1374 1079 cwnd = xprt->cwnd; ··· 1336 1149 } 1337 1150 return; 1338 1151 1339 - #if defined(CONFIG_SUNRPC_BACKCHANNEL) 1340 - out_bcall: 1341 - rpcrdma_bc_receive_call(r_xprt, rep); 1342 - return; 1343 - #endif 1344 - 1345 1152 /* If the incoming reply terminated a pending RPC, the next 1346 1153 * RPC call will post a replacement receive buffer as it is 1347 1154 * being marshaled. 
1348 1155 */ 1349 1156 out_badversion: 1350 1157 dprintk("RPC: %s: invalid version %d\n", 1351 - __func__, be32_to_cpu(headerp->rm_vers)); 1158 + __func__, be32_to_cpu(vers)); 1352 1159 status = -EIO; 1353 1160 r_xprt->rx_stats.bad_reply_count++; 1354 1161 goto out; 1355 1162 1356 - out_rdmaerr: 1357 - rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err); 1358 - switch (rmerr) { 1359 - case ERR_VERS: 1360 - pr_err("%s: server reports header version error (%u-%u)\n", 1361 - __func__, 1362 - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low), 1363 - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high)); 1364 - break; 1365 - case ERR_CHUNK: 1366 - pr_err("%s: server reports header decoding error\n", 1367 - __func__); 1368 - break; 1369 - default: 1370 - pr_err("%s: server reports unknown error %d\n", 1371 - __func__, rmerr); 1372 - } 1373 - status = -EREMOTEIO; 1163 + out_badheader: 1164 + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", 1165 + rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc)); 1374 1166 r_xprt->rx_stats.bad_reply_count++; 1167 + status = -EIO; 1375 1168 goto out; 1376 1169 1377 1170 /* The req was still available, but by the time the recv_lock ··· 1371 1204 1372 1205 out_nomatch: 1373 1206 spin_unlock(&buf->rb_lock); 1374 - dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n", 1375 - __func__, be32_to_cpu(headerp->rm_xid), 1376 - rep->rr_len); 1207 + dprintk("RPC: %s: no match for incoming xid 0x%08x\n", 1208 + __func__, be32_to_cpu(xid)); 1377 1209 goto repost; 1378 1210 1379 1211 out_duplicate: 1380 1212 spin_unlock(&buf->rb_lock); 1381 1213 dprintk("RPC: %s: " 1382 1214 "duplicate reply %p to RPC request %p: xid 0x%08x\n", 1383 - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); 1215 + __func__, rep, req, be32_to_cpu(xid)); 1384 1216 1385 1217 /* If no pending RPC transaction was matched, post a replacement 1386 1218 * receive buffer before returning.
+1 -1
net/sunrpc/xprtrdma/svc_rdma_backchannel.c
··· 269 269 module_put(THIS_MODULE); 270 270 } 271 271 272 - static struct rpc_xprt_ops xprt_rdma_bc_procs = { 272 + static const struct rpc_xprt_ops xprt_rdma_bc_procs = { 273 273 .reserve_xprt = xprt_reserve_xprt_cong, 274 274 .release_xprt = xprt_release_xprt_cong, 275 275 .alloc_slot = xprt_alloc_slot,
+4 -3
net/sunrpc/xprtrdma/transport.c
··· 149 149 150 150 #endif 151 151 152 - static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ 152 + static const struct rpc_xprt_ops xprt_rdma_procs; 153 153 154 154 static void 155 155 xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) ··· 559 559 560 560 r_xprt->rx_stats.hardway_register_count += size; 561 561 req->rl_rdmabuf = rb; 562 + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); 562 563 return true; 563 564 } 564 565 ··· 731 730 if (unlikely(!list_empty(&req->rl_registered))) 732 731 r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); 733 732 734 - rc = rpcrdma_marshal_req(rqst); 733 + rc = rpcrdma_marshal_req(r_xprt, rqst); 735 734 if (rc < 0) 736 735 goto failed_marshal; 737 736 ··· 812 811 * Plumbing for rpc transport switch and kernel module 813 812 */ 814 813 815 - static struct rpc_xprt_ops xprt_rdma_procs = { 814 + static const struct rpc_xprt_ops xprt_rdma_procs = { 816 815 .reserve_xprt = xprt_reserve_xprt_cong, 817 816 .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ 818 817 .alloc_slot = xprt_alloc_slot,
+9 -12
net/sunrpc/xprtrdma/verbs.c
··· 139 139 static void 140 140 rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) 141 141 { 142 - struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf); 143 142 struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf; 143 + __be32 *p = rep->rr_rdmabuf->rg_base; 144 144 u32 credits; 145 145 146 - if (rep->rr_len < RPCRDMA_HDRLEN_ERR) 147 - return; 148 - 149 - credits = be32_to_cpu(rmsgp->rm_credit); 146 + credits = be32_to_cpup(p + 2); 150 147 if (credits == 0) 151 148 credits = 1; /* don't deadlock */ 152 149 else if (credits > buffer->rb_max_requests) ··· 170 173 goto out_fail; 171 174 172 175 /* status == SUCCESS means all fields in wc are trustworthy */ 173 - if (wc->opcode != IB_WC_RECV) 174 - return; 175 - 176 176 dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", 177 177 __func__, rep, wc->byte_len); 178 178 179 - rep->rr_len = wc->byte_len; 179 + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); 180 180 rep->rr_wc_flags = wc->wc_flags; 181 181 rep->rr_inv_rkey = wc->ex.invalidate_rkey; 182 182 183 183 ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), 184 184 rdmab_addr(rep->rr_rdmabuf), 185 - rep->rr_len, DMA_FROM_DEVICE); 185 + wc->byte_len, DMA_FROM_DEVICE); 186 186 187 - rpcrdma_update_granted_credits(rep); 187 + if (wc->byte_len >= RPCRDMA_HDRLEN_ERR) 188 + rpcrdma_update_granted_credits(rep); 188 189 189 190 out_schedule: 190 191 queue_work(rpcrdma_receive_wq, &rep->rr_work); ··· 193 198 pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", 194 199 ib_wc_status_msg(wc->status), 195 200 wc->status, wc->vendor_err); 196 - rep->rr_len = RPCRDMA_BAD_LEN; 201 + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0); 197 202 goto out_schedule; 198 203 } 199 204 ··· 969 974 rc = PTR_ERR(rep->rr_rdmabuf); 970 975 goto out_free; 971 976 } 977 + xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, 978 + rdmab_length(rep->rr_rdmabuf)); 972 979 973 980 rep->rr_cqe.done = rpcrdma_wc_receive; 974 981 rep->rr_rxprt = r_xprt;
+22 -11
net/sunrpc/xprtrdma/xprt_rdma.h
··· 218 218 219 219 struct rpcrdma_rep { 220 220 struct ib_cqe rr_cqe; 221 - unsigned int rr_len; 222 221 int rr_wc_flags; 223 222 u32 rr_inv_rkey; 223 + struct rpcrdma_regbuf *rr_rdmabuf; 224 224 struct rpcrdma_xprt *rr_rxprt; 225 225 struct work_struct rr_work; 226 + struct xdr_buf rr_hdrbuf; 227 + struct xdr_stream rr_stream; 226 228 struct list_head rr_list; 227 229 struct ib_recv_wr rr_recv_wr; 228 - struct rpcrdma_regbuf *rr_rdmabuf; 229 230 }; 230 - 231 - #define RPCRDMA_BAD_LEN (~0U) 232 231 233 232 /* 234 233 * struct rpcrdma_mw - external memory region metadata ··· 345 346 unsigned int rl_connect_cookie; 346 347 struct rpcrdma_buffer *rl_buffer; 347 348 struct rpcrdma_rep *rl_reply; 349 + struct xdr_stream rl_stream; 350 + struct xdr_buf rl_hdrbuf; 348 351 struct ib_send_wr rl_send_wr; 349 352 struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; 350 353 struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ ··· 441 440 * Statistics for RPCRDMA 442 441 */ 443 442 struct rpcrdma_stats { 443 + /* accessed when sending a call */ 444 444 unsigned long read_chunk_count; 445 445 unsigned long write_chunk_count; 446 446 unsigned long reply_chunk_count; 447 - 448 447 unsigned long long total_rdma_request; 449 - unsigned long long total_rdma_reply; 450 448 449 + /* rarely accessed error counters */ 451 450 unsigned long long pullup_copy_count; 452 - unsigned long long fixup_copy_count; 453 451 unsigned long hardway_register_count; 454 452 unsigned long failed_marshal_count; 455 453 unsigned long bad_reply_count; 456 - unsigned long nomsg_call_count; 457 - unsigned long bcall_count; 458 454 unsigned long mrs_recovered; 459 455 unsigned long mrs_orphaned; 460 456 unsigned long mrs_allocated; 457 + 458 + /* accessed when receiving a reply */ 459 + unsigned long long total_rdma_reply; 460 + unsigned long long fixup_copy_count; 461 461 unsigned long local_inv_needed; 462 + unsigned long nomsg_call_count; 463 + unsigned long bcall_count; 462 464 }; 463 465 464 466 /* ··· 469 465 */ 470 466 struct rpcrdma_xprt; 471 467 struct rpcrdma_memreg_ops { 472 - int (*ro_map)(struct rpcrdma_xprt *, 468 + struct rpcrdma_mr_seg * 469 + (*ro_map)(struct rpcrdma_xprt *, 473 470 struct rpcrdma_mr_seg *, int, bool, 474 471 struct rpcrdma_mw **); 475 472 void (*ro_unmap_sync)(struct rpcrdma_xprt *, ··· 643 638 bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, 644 639 u32, struct xdr_buf *, enum rpcrdma_chunktype); 645 640 void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); 646 - int rpcrdma_marshal_req(struct rpc_rqst *); 641 + int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); 647 642 void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); 648 643 void rpcrdma_reply_handler(struct work_struct *work); 644 + 645 + static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) 646 + { 647 + xdr->head[0].iov_len = len; 648 + xdr->len = len; 649 + } 649 650 650 651 /* RPC/RDMA module init - xprtrdma/transport.c 651 652 */
+4 -4
net/sunrpc/xprtsock.c
··· 2728 2728 module_put(THIS_MODULE); 2729 2729 } 2730 2730 2731 - static struct rpc_xprt_ops xs_local_ops = { 2731 + static const struct rpc_xprt_ops xs_local_ops = { 2732 2732 .reserve_xprt = xprt_reserve_xprt, 2733 2733 .release_xprt = xs_tcp_release_xprt, 2734 2734 .alloc_slot = xprt_alloc_slot, ··· 2746 2746 .disable_swap = xs_disable_swap, 2747 2747 }; 2748 2748 2749 - static struct rpc_xprt_ops xs_udp_ops = { 2749 + static const struct rpc_xprt_ops xs_udp_ops = { 2750 2750 .set_buffer_size = xs_udp_set_buffer_size, 2751 2751 .reserve_xprt = xprt_reserve_xprt_cong, 2752 2752 .release_xprt = xprt_release_xprt_cong, ··· 2768 2768 .inject_disconnect = xs_inject_disconnect, 2769 2769 }; 2770 2770 2771 - static struct rpc_xprt_ops xs_tcp_ops = { 2771 + static const struct rpc_xprt_ops xs_tcp_ops = { 2772 2772 .reserve_xprt = xprt_reserve_xprt, 2773 2773 .release_xprt = xs_tcp_release_xprt, 2774 2774 .alloc_slot = xprt_lock_and_alloc_slot, ··· 2799 2799 * The rpc_xprt_ops for the server backchannel 2800 2800 */ 2801 2801 2802 - static struct rpc_xprt_ops bc_tcp_ops = { 2802 + static const struct rpc_xprt_ops bc_tcp_ops = { 2803 2803 .reserve_xprt = xprt_reserve_xprt, 2804 2804 .release_xprt = xprt_release_xprt, 2805 2805 .alloc_slot = xprt_alloc_slot,