rdma: ONCRPC RDMA protocol marshalling

This logic parses the ONCRPC RDMA protocol headers that
precede the actual RPC header. It is placed in a separate
file to keep all protocol-aware code in a single place.
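
For orientation, the transport header that this file parses looks roughly
as sketched below. This is an illustration only, summarizing the fields the
code below touches; the authoritative structure definitions live in
<linux/sunrpc/rpc_rdma.h>.

/*
 * RPCRDMA transport header, as seen on the wire (all fields are
 * 32-bit XDR words in network byte order):
 *
 *   rm_xid    - mirrors the RPC XID
 *   rm_vers   - protocol version (RPCRDMA_VERSION)
 *   rm_credit - credits requested/granted
 *   rm_type   - RDMA_MSG, RDMA_NOMSG, RDMA_MSGP, RDMA_DONE or RDMA_ERROR
 *   rm_body   - read chunk list, write chunk list and reply chunk list;
 *               each chunk segment carries an rs_handle (RKEY), an
 *               rs_length and a 64-bit rs_offset
 */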

Signed-off-by: Tom Tucker <tom@opengridcomputing.com>
Acked-by: Neil Brown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>

Authored by Tom Tucker and committed by J. Bruce Fields (ef1eac0a, c06b540a)

 net/sunrpc/xprtrdma/svc_rdma_marshal.c | 412 +++++++++++++++++++++++++++++++
 1 file changed, 412 insertions(+)

diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
new file mode 100644
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *
+ *      Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following
+ *      disclaimer in the documentation and/or other materials provided
+ *      with the distribution.
+ *
+ *      Neither the name of the Network Appliance, Inc. nor the names of
+ *      its contributors may be used to endorse or promote products
+ *      derived from this software without specific prior written
+ *      permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker <tom@opengridcomputing.com>
+ */
+
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/debug.h>
+#include <asm/unaligned.h>
+#include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sunrpc/svc_rdma.h>
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/*
+ * Decodes a read chunk list. The expected format is as follows:
+ *    discrim  : xdr_one
+ *    position : u32 offset into XDR stream
+ *    handle   : u32 RKEY
+ *    . . .
+ *  end-of-list: xdr_zero
+ */
+static u32 *decode_read_list(u32 *va, u32 *vaend)
+{
+	struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va;
+
+	while (ch->rc_discrim != xdr_zero) {
+		u64 ch_offset;
+
+		if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) >
+		    (unsigned long)vaend) {
+			dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch);
+			return NULL;
+		}
+
+		/* Byte-swap the chunk entry in place */
+		ch->rc_discrim = ntohl(ch->rc_discrim);
+		ch->rc_position = ntohl(ch->rc_position);
+		ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle);
+		ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length);
+		va = (u32 *)&ch->rc_target.rs_offset;
+		xdr_decode_hyper(va, &ch_offset);
+		put_unaligned(ch_offset, (u64 *)va);
+		ch++;
+	}
+	/* The terminator is a single xdr_zero word, so the next datum
+	 * begins at &ch->rc_position */
+	return (u32 *)&ch->rc_position;
+}
+
+/*
+ * Determine the number of chunks and total bytes in a chunk list. The
+ * chunk list has already been verified to fit within the RPCRDMA header.
+ */
+void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch,
+			       int *ch_count, int *byte_count)
+{
+	/* compute the number of bytes represented by read chunks */
+	*byte_count = 0;
+	*ch_count = 0;
+	for (; ch->rc_discrim != 0; ch++) {
+		*byte_count = *byte_count + ch->rc_target.rs_length;
+		*ch_count = *ch_count + 1;
+	}
+}
+
+/*
+ * Decodes a write chunk list. The expected format is as follows:
+ *    discrim  : xdr_one
+ *    nchunks  : <count>
+ *       handle   : u32 RKEY              ---+
+ *       length   : u32 <len of segment>     |  repeated
+ *       offset   : u64 remote VA            |  <count> times
+ *       . . .                            ---+
+ */
+static u32 *decode_write_list(u32 *va, u32 *vaend)
+{
+	int ch_no;
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)va;
+
+	/* Check for no write-array */
+	if (ary->wc_discrim == xdr_zero)
+		return (u32 *)&ary->wc_nchunks;
+
+	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+		return NULL;
+	}
+	ary->wc_discrim = ntohl(ary->wc_discrim);
+	ary->wc_nchunks = ntohl(ary->wc_nchunks);
+	if (((unsigned long)&ary->wc_array[0] +
+	     (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+			ary, ary->wc_nchunks, vaend);
+		return NULL;
+	}
+	for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
+		u64 ch_offset;
+
+		ary->wc_array[ch_no].wc_target.rs_handle =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
+		ary->wc_array[ch_no].wc_target.rs_length =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_length);
+		va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
+		xdr_decode_hyper(va, &ch_offset);
+		put_unaligned(ch_offset, (u64 *)va);
+	}
+
+	/*
+	 * rs_length is the 2nd 4B field in wc_target and taking its
+	 * address skips the list terminator
+	 */
+	return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length;
+}
+
+static u32 *decode_reply_array(u32 *va, u32 *vaend)
+{
+	int ch_no;
+	struct rpcrdma_write_array *ary =
+		(struct rpcrdma_write_array *)va;
+
+	/* Check for no reply-array */
+	if (ary->wc_discrim == xdr_zero)
+		return (u32 *)&ary->wc_nchunks;
+
+	if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend);
+		return NULL;
+	}
+	ary->wc_discrim = ntohl(ary->wc_discrim);
+	ary->wc_nchunks = ntohl(ary->wc_nchunks);
+	if (((unsigned long)&ary->wc_array[0] +
+	     (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) >
+	    (unsigned long)vaend) {
+		dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n",
+			ary, ary->wc_nchunks, vaend);
+		return NULL;
+	}
+	for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) {
+		u64 ch_offset;
+
+		ary->wc_array[ch_no].wc_target.rs_handle =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_handle);
+		ary->wc_array[ch_no].wc_target.rs_length =
+			ntohl(ary->wc_array[ch_no].wc_target.rs_length);
+		va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset;
+		xdr_decode_hyper(va, &ch_offset);
+		put_unaligned(ch_offset, (u64 *)va);
+	}
+
+	return (u32 *)&ary->wc_array[ch_no];
+}
+
+int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
+			    struct svc_rqst *rqstp)
+{
+	struct rpcrdma_msg *rmsgp = NULL;
+	u32 *va;
+	u32 *vaend;
+	u32 hdr_len;
+
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+	/* Verify that there are enough bytes for the header + something */
+	if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
+		dprintk("svcrdma: header too short = %d\n",
+			rqstp->rq_arg.len);
+		return -EINVAL;
+	}
+
+	/* Decode the header */
+	rmsgp->rm_xid = ntohl(rmsgp->rm_xid);
+	rmsgp->rm_vers = ntohl(rmsgp->rm_vers);
+	rmsgp->rm_credit = ntohl(rmsgp->rm_credit);
+	rmsgp->rm_type = ntohl(rmsgp->rm_type);
+
+	if (rmsgp->rm_vers != RPCRDMA_VERSION)
+		return -ENOSYS;
+
+	/* Pull in the extra for the padded case and bump our pointer */
+	if (rmsgp->rm_type == RDMA_MSGP) {
+		int hdrlen;
+
+		rmsgp->rm_body.rm_padded.rm_align =
+			ntohl(rmsgp->rm_body.rm_padded.rm_align);
+		rmsgp->rm_body.rm_padded.rm_thresh =
+			ntohl(rmsgp->rm_body.rm_padded.rm_thresh);
+
+		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+		rqstp->rq_arg.head[0].iov_base = va;
+		hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
+		rqstp->rq_arg.head[0].iov_len -= hdrlen;
+		if (hdrlen > rqstp->rq_arg.len)
+			return -EINVAL;
+		return hdrlen;
+	}
+
+	/* The chunk area may contain a read chunk list, a write chunk
+	 * list and a reply chunk list; each list may be empty.
+	 */
+	va = &rmsgp->rm_body.rm_chunks[0];
+	vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
+	va = decode_read_list(va, vaend);
+	if (!va)
+		return -EINVAL;
+	va = decode_write_list(va, vaend);
+	if (!va)
+		return -EINVAL;
+	va = decode_reply_array(va, vaend);
+	if (!va)
+		return -EINVAL;
+
+	rqstp->rq_arg.head[0].iov_base = va;
+	hdr_len = (unsigned long)va - (unsigned long)rmsgp;
+	rqstp->rq_arg.head[0].iov_len -= hdr_len;
+
+	*rdma_req = rmsgp;
+	return hdr_len;
+}
+
+int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp)
+{
+	struct rpcrdma_msg *rmsgp = NULL;
+	struct rpcrdma_read_chunk *ch;
+	struct rpcrdma_write_array *ary;
+	u32 *va;
+	u32 hdrlen;
+
+	dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n",
+		rqstp);
+	rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+
+	/* Pull in the extra for the padded case and bump our pointer */
+	if (rmsgp->rm_type == RDMA_MSGP) {
+		va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
+		rqstp->rq_arg.head[0].iov_base = va;
+		hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
+		rqstp->rq_arg.head[0].iov_len -= hdrlen;
+		return hdrlen;
+	}
+
+	/*
+	 * Skip all chunks to find the RPC msg. These were previously
+	 * processed when the request was first received.
+	 */
+	va = &rmsgp->rm_body.rm_chunks[0];
+
+	/* Skip read-list */
+	for (ch = (struct rpcrdma_read_chunk *)va;
+	     ch->rc_discrim != xdr_zero; ch++)
+		;
+	va = (u32 *)&ch->rc_position;
+
+	/* Skip write-list */
+	ary = (struct rpcrdma_write_array *)va;
+	if (ary->wc_discrim == xdr_zero)
+		va = (u32 *)&ary->wc_nchunks;
+	else
+		/*
+		 * rs_length is the 2nd 4B field in wc_target and taking its
+		 * address skips the list terminator
+		 */
+		va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length;
+
+	/* Skip reply-array */
+	ary = (struct rpcrdma_write_array *)va;
+	if (ary->wc_discrim == xdr_zero)
+		va = (u32 *)&ary->wc_nchunks;
+	else
+		va = (u32 *)&ary->wc_array[ary->wc_nchunks];
+
+	rqstp->rq_arg.head[0].iov_base = va;
+	hdrlen = (unsigned long)va - (unsigned long)rmsgp;
+	rqstp->rq_arg.head[0].iov_len -= hdrlen;
+
+	return hdrlen;
+}
+
+int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
+			      struct rpcrdma_msg *rmsgp,
+			      enum rpcrdma_errcode err, u32 *va)
+{
+	u32 *startp = va;
+
+	*va++ = htonl(rmsgp->rm_xid);
+	*va++ = htonl(rmsgp->rm_vers);
+	*va++ = htonl(xprt->sc_max_requests);
+	*va++ = htonl(RDMA_ERROR);
+	*va++ = htonl(err);
+	if (err == ERR_VERS) {
+		/* Report the range of versions we support */
+		*va++ = htonl(RPCRDMA_VERSION);
+		*va++ = htonl(RPCRDMA_VERSION);
+	}
+
+	return (int)((unsigned long)va - (unsigned long)startp);
+}
+
+int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp)
+{
+	struct rpcrdma_write_array *wr_ary;
+
+	/* There is no read-list in a reply */
+
+	/* skip write list */
+	wr_ary = (struct rpcrdma_write_array *)
+		&rmsgp->rm_body.rm_chunks[1];
+	if (wr_ary->wc_discrim)
+		wr_ary = (struct rpcrdma_write_array *)
+			&wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)].
+			wc_target.rs_length;
+	else
+		wr_ary = (struct rpcrdma_write_array *)
+			&wr_ary->wc_nchunks;
+
+	/* skip reply array */
+	if (wr_ary->wc_discrim)
+		wr_ary = (struct rpcrdma_write_array *)
+			&wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)];
+	else
+		wr_ary = (struct rpcrdma_write_array *)
+			&wr_ary->wc_nchunks;
+
+	return (unsigned long) wr_ary - (unsigned long) rmsgp;
+}
+
+void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
+{
+	struct rpcrdma_write_array *ary;
+
+	/* no read-list */
+	rmsgp->rm_body.rm_chunks[0] = xdr_zero;
+
+	/* write-array discrim */
+	ary = (struct rpcrdma_write_array *)
+		&rmsgp->rm_body.rm_chunks[1];
+	ary->wc_discrim = xdr_one;
+	ary->wc_nchunks = htonl(chunks);
+
+	/* write-list terminator */
+	ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
+
+	/* reply-array discriminator */
+	ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
+}
+
+void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
+				     int chunks)
+{
+	ary->wc_discrim = xdr_one;
+	ary->wc_nchunks = htonl(chunks);
+}
+
+void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
+				     int chunk_no,
+				     u32 rs_handle, u64 rs_offset,
+				     u32 write_len)
+{
+	struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
+
+	seg->rs_handle = htonl(rs_handle);
+	seg->rs_length = htonl(write_len);
+	xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset);
+}
+
+void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt,
+				      struct rpcrdma_msg *rdma_argp,
+				      struct rpcrdma_msg *rdma_resp,
+				      enum rpcrdma_proc rdma_type)
+{
+	rdma_resp->rm_xid = htonl(rdma_argp->rm_xid);
+	rdma_resp->rm_vers = htonl(rdma_argp->rm_vers);
+	rdma_resp->rm_credit = htonl(xprt->sc_max_requests);
+	rdma_resp->rm_type = htonl(rdma_type);
+
+	/* Encode <nul> chunk lists */
+	rdma_resp->rm_body.rm_chunks[0] = xdr_zero;
+	rdma_resp->rm_body.rm_chunks[1] = xdr_zero;
+	rdma_resp->rm_body.rm_chunks[2] = xdr_zero;
+}
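
As a usage note for reviewers: below is a minimal sketch, not part of the
patch, of how the server receive path is expected to drive the decoder. The
wrapper function name is hypothetical; in this patch series the actual
caller of svc_rdma_xdr_decode_req() is svc_rdma_recvfrom().

/* Hypothetical caller, for illustration only */
static int example_decode_transport_header(struct svc_rqst *rqstp)
{
	struct rpcrdma_msg *rmsgp;
	int hdr_len;

	/* Parses and byte-swaps the RPCRDMA header in place; on success
	 * rq_arg.head[0] has been advanced past the transport header and
	 * the header length in bytes is returned. */
	hdr_len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
	if (hdr_len < 0)
		return hdr_len;	/* -EINVAL: bad header, -ENOSYS: bad version */

	/* rmsgp now points at the decoded (host-endian) header; the chunk
	 * lists, if any, start at rmsgp->rm_body.rm_chunks[0]. */
	return hdr_len;
}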