Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

svcrdma: Use struct xdr_stream to decode ingress transport headers

The logic that checks incoming network headers has to be scrupulous.

De-duplicate: replace open-coded buffer overflow checks with the use
of xdr_stream helpers that are used almost everywhere else XDR
decoding is done.

One minor change to the sanity checks: instead of checking the
length of individual segments, cap the length of the whole chunk
to be sure it can fit in the set of pages available in rq_pages.
This should be a better test of whether the server can handle the
chunks in each request.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

+131 -88
+2 -1
include/linux/sunrpc/rpc_rdma.h
··· 58 58 enum { 59 59 rpcrdma_fixed_maxsz = 4, 60 60 rpcrdma_segment_maxsz = 4, 61 - rpcrdma_readchunk_maxsz = 2 + rpcrdma_segment_maxsz, 61 + rpcrdma_readseg_maxsz = 1 + rpcrdma_segment_maxsz, 62 + rpcrdma_readchunk_maxsz = 1 + rpcrdma_readseg_maxsz, 62 63 }; 63 64 64 65 /*
+1
include/linux/sunrpc/svc_rdma.h
··· 132 132 struct ib_sge rc_recv_sge; 133 133 void *rc_recv_buf; 134 134 struct xdr_buf rc_arg; 135 + struct xdr_stream rc_stream; 135 136 bool rc_temp; 136 137 u32 rc_byte_len; 137 138 unsigned int rc_page_count;
+4 -3
include/trace/events/rpcrdma.h
··· 1469 1469 ); 1470 1470 1471 1471 #define DEFINE_SEGMENT_EVENT(name) \ 1472 - DEFINE_EVENT(svcrdma_segment_event, svcrdma_encode_##name,\ 1472 + DEFINE_EVENT(svcrdma_segment_event, svcrdma_##name,\ 1473 1473 TP_PROTO( \ 1474 1474 u32 handle, \ 1475 1475 u32 length, \ ··· 1477 1477 ), \ 1478 1478 TP_ARGS(handle, length, offset)) 1479 1479 1480 - DEFINE_SEGMENT_EVENT(rseg); 1481 - DEFINE_SEGMENT_EVENT(wseg); 1480 + DEFINE_SEGMENT_EVENT(decode_wseg); 1481 + DEFINE_SEGMENT_EVENT(encode_rseg); 1482 + DEFINE_SEGMENT_EVENT(encode_wseg); 1482 1483 1483 1484 DECLARE_EVENT_CLASS(svcrdma_chunk_event, 1484 1485 TP_PROTO(
+124 -84
net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
··· 358 358 arg->len = ctxt->rc_byte_len; 359 359 } 360 360 361 - /* This accommodates the largest possible Write chunk, 362 - * in one segment. 361 + /* This accommodates the largest possible Write chunk. 363 362 */ 364 - #define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) 363 + #define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) 365 364 366 365 /* This accommodates the largest possible Position-Zero 367 - * Read chunk or Reply chunk, in one segment. 366 + * Read chunk or Reply chunk. 368 367 */ 369 - #define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) 368 + #define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) 370 369 371 370 /* Sanity check the Read list. 372 371 * ··· 373 374 * - This implementation supports only one Read chunk. 374 375 * 375 376 * Sanity checks: 376 - * - Read list does not overflow buffer. 377 + * - Read list does not overflow Receive buffer. 377 378 * - Segment size limited by largest NFS data payload. 378 379 * 379 380 * The segment count is limited to how many segments can ··· 381 382 * buffer. That's about 40 Read segments for a 1KB inline 382 383 * threshold. 383 384 * 384 - * Returns pointer to the following Write list. 385 + * Return values: 386 + * %true: Read list is valid. @rctxt's xdr_stream is updated 387 + * to point to the first byte past the Read list. 388 + * %false: Read list is corrupt. @rctxt's xdr_stream is left 389 + * in an unknown state. 
385 390 */ 386 - static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) 391 + static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) 387 392 { 388 - u32 position; 393 + u32 position, len; 389 394 bool first; 395 + __be32 *p; 390 396 397 + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); 398 + if (!p) 399 + return false; 400 + 401 + len = 0; 391 402 first = true; 392 - while (*p++ != xdr_zero) { 393 - if (first) { 394 - position = be32_to_cpup(p++); 395 - first = false; 396 - } else if (be32_to_cpup(p++) != position) { 397 - return NULL; 398 - } 399 - p++; /* handle */ 400 - if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) 401 - return NULL; 402 - p += 2; /* offset */ 403 + while (*p != xdr_zero) { 404 + p = xdr_inline_decode(&rctxt->rc_stream, 405 + rpcrdma_readseg_maxsz * sizeof(*p)); 406 + if (!p) 407 + return false; 403 408 404 - if (p > end) 405 - return NULL; 409 + if (first) { 410 + position = be32_to_cpup(p); 411 + first = false; 412 + } else if (be32_to_cpup(p) != position) { 413 + return false; 414 + } 415 + p += 2; 416 + len += be32_to_cpup(p); 417 + 418 + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); 419 + if (!p) 420 + return false; 406 421 } 407 - return p; 422 + return len <= MAX_BYTES_SPECIAL_CHUNK; 408 423 } 409 424 410 425 /* The segment count is limited to how many segments can ··· 426 413 * buffer. That's about 60 Write segments for a 1KB inline 427 414 * threshold. 
428 415 */ 429 - static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, 430 - u32 maxlen) 416 + static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) 431 417 { 432 - u32 i, segcount; 418 + u32 i, segcount, total; 419 + __be32 *p; 433 420 434 - segcount = be32_to_cpup(p++); 421 + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); 422 + if (!p) 423 + return false; 424 + segcount = be32_to_cpup(p); 425 + 426 + total = 0; 435 427 for (i = 0; i < segcount; i++) { 436 - p++; /* handle */ 437 - if (be32_to_cpup(p++) > maxlen) 438 - return NULL; 439 - p += 2; /* offset */ 428 + u32 handle, length; 429 + u64 offset; 440 430 441 - if (p > end) 442 - return NULL; 431 + p = xdr_inline_decode(&rctxt->rc_stream, 432 + rpcrdma_segment_maxsz * sizeof(*p)); 433 + if (!p) 434 + return false; 435 + 436 + handle = be32_to_cpup(p++); 437 + length = be32_to_cpup(p++); 438 + xdr_decode_hyper(p, &offset); 439 + trace_svcrdma_decode_wseg(handle, length, offset); 440 + 441 + total += length; 443 442 } 444 - 445 - return p; 443 + return total <= maxlen; 446 444 } 447 445 448 446 /* Sanity check the Write list. 449 447 * 450 448 * Implementation limits: 451 - * - This implementation supports only one Write chunk. 449 + * - This implementation currently supports only one Write chunk. 452 450 * 453 451 * Sanity checks: 454 - * - Write list does not overflow buffer. 455 - * - Segment size limited by largest NFS data payload. 452 + * - Write list does not overflow Receive buffer. 453 + * - Chunk size limited by largest NFS data payload. 456 454 * 457 - * Returns pointer to the following Reply chunk. 455 + * Return values: 456 + * %true: Write list is valid. @rctxt's xdr_stream is updated 457 + * to point to the first byte past the Write list. 458 + * %false: Write list is corrupt. @rctxt's xdr_stream is left 459 + * in an unknown state. 
458 460 */ 459 - static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) 461 + static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) 460 462 { 461 - u32 chcount; 463 + u32 chcount = 0; 464 + __be32 *p; 462 465 463 - chcount = 0; 464 - while (*p++ != xdr_zero) { 465 - p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); 466 + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); 467 + if (!p) 468 + return false; 469 + while (*p != xdr_zero) { 470 + if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK)) 471 + return false; 472 + ++chcount; 473 + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); 466 474 if (!p) 467 - return NULL; 468 - if (chcount++ > 1) 469 - return NULL; 475 + return false; 470 476 } 471 - return p; 477 + return chcount < 2; 472 478 } 473 479 474 480 /* Sanity check the Reply chunk. 475 481 * 476 482 * Sanity checks: 477 - * - Reply chunk does not overflow buffer. 478 - * - Segment size limited by largest NFS data payload. 483 + * - Reply chunk does not overflow Receive buffer. 484 + * - Chunk size limited by largest NFS data payload. 479 485 * 480 - * Returns pointer to the following RPC header. 486 + * Return values: 487 + * %true: Reply chunk is valid. @rctxt's xdr_stream is updated 488 + * to point to the first byte past the Reply chunk. 489 + * %false: Reply chunk is corrupt. @rctxt's xdr_stream is left 490 + * in an unknown state. 
481 491 */ 482 - static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) 492 + static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) 483 493 { 484 - if (*p++ != xdr_zero) { 485 - p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); 486 - if (!p) 487 - return NULL; 488 - } 489 - return p; 494 + __be32 *p; 495 + 496 + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); 497 + if (!p) 498 + return false; 499 + if (*p != xdr_zero) 500 + if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK)) 501 + return false; 502 + return true; 490 503 } 491 504 492 505 /* RPC-over-RDMA Version One private extension: Remote Invalidation. ··· 577 538 ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey); 578 539 } 579 540 580 - /* On entry, xdr->head[0].iov_base points to first byte in the 581 - * RPC-over-RDMA header. 541 + /** 542 + * svc_rdma_xdr_decode_req - Decode the transport header 543 + * @rq_arg: xdr_buf containing ingress RPC/RDMA message 544 + * @rctxt: state of decoding 545 + * 546 + * On entry, xdr->head[0].iov_base points to first byte of the 547 + * RPC-over-RDMA transport header. 582 548 * 583 549 * On successful exit, head[0] points to first byte past the 584 550 * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. 551 + * 585 552 * The length of the RPC-over-RDMA header is returned. 586 553 * 587 554 * Assumptions: 588 555 * - The transport header is entirely contained in the head iovec. 
589 556 */ 590 - static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) 557 + static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, 558 + struct svc_rdma_recv_ctxt *rctxt) 591 559 { 592 - __be32 *p, *end, *rdma_argp; 560 + __be32 *p, *rdma_argp; 593 561 unsigned int hdr_len; 594 562 595 - /* Verify that there's enough bytes for header + something */ 596 - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) 597 - goto out_short; 598 - 599 563 rdma_argp = rq_arg->head[0].iov_base; 600 - if (*(rdma_argp + 1) != rpcrdma_version) 601 - goto out_version; 564 + xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL); 602 565 603 - switch (*(rdma_argp + 3)) { 566 + p = xdr_inline_decode(&rctxt->rc_stream, 567 + rpcrdma_fixed_maxsz * sizeof(*p)); 568 + if (unlikely(!p)) 569 + goto out_short; 570 + p++; 571 + if (*p != rpcrdma_version) 572 + goto out_version; 573 + p += 2; 574 + switch (*p) { 604 575 case rdma_msg: 605 576 break; 606 577 case rdma_nomsg: 607 578 break; 608 - 609 579 case rdma_done: 610 580 goto out_drop; 611 - 612 581 case rdma_error: 613 582 goto out_drop; 614 - 615 583 default: 616 584 goto out_proc; 617 585 } 618 586 619 - end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); 620 - p = xdr_check_read_list(rdma_argp + 4, end); 621 - if (!p) 587 + if (!xdr_check_read_list(rctxt)) 622 588 goto out_inval; 623 - p = xdr_check_write_list(p, end); 624 - if (!p) 589 + if (!xdr_check_write_list(rctxt)) 625 590 goto out_inval; 626 - p = xdr_check_reply_chunk(p, end); 627 - if (!p) 628 - goto out_inval; 629 - if (p > end) 591 + if (!xdr_check_reply_chunk(rctxt)) 630 592 goto out_inval; 631 593 632 - rq_arg->head[0].iov_base = p; 633 - hdr_len = (unsigned long)p - (unsigned long)rdma_argp; 594 + rq_arg->head[0].iov_base = rctxt->rc_stream.p; 595 + hdr_len = xdr_stream_pos(&rctxt->rc_stream); 634 596 rq_arg->head[0].iov_len -= hdr_len; 635 597 rq_arg->len -= hdr_len; 636 598 trace_svcrdma_decode_rqst(rdma_argp, hdr_len); ··· 826 786 rqstp->rq_next_page = 
rqstp->rq_respages; 827 787 828 788 p = (__be32 *)rqstp->rq_arg.head[0].iov_base; 829 - ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); 789 + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); 830 790 if (ret < 0) 831 791 goto out_err; 832 792 if (ret == 0)