Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mctp: Implement message fragmentation & reassembly

This change implements MCTP fragmentation (based on route & device MTU),
and corresponding reassembly.

The MCTP specification only allows for fragmentation on the originating
message endpoint, and reassembly on the destination endpoint -
intermediate nodes do not need to reassemble/refragment. Consequently,
we only fragment in the local transmit path, and reassemble
locally-bound packets. Messages are required to be in-order, so we
simply cancel reassembly on out-of-order or missing packets.

In the fragmentation path, we just break up the message into MTU-sized
fragments; the skb structure is a simple copy for now, which we can later
improve with a shared data implementation.

For reassembly, we keep track of incoming message fragments using the
existing tag infrastructure, allocating a key on the (src,dest,tag)
tuple, and reassemble matching fragments into a skb->frag_list.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Jeremy Kerr and committed by
David S. Miller
4a992bbd 833ef3b9

+363 -46
+22 -3
include/net/mctp.h
··· 84 84 * updates to either list are performed under the netns_mctp->keys 85 85 * lock. 86 86 * 87 - * - there is a single destruction path for a mctp_sk_key - through socket 88 - * unhash (see mctp_sk_unhash). This performs the list removal under 89 - * keys_lock. 87 + * - a key may have a sk_buff attached as part of an in-progress message 88 + * reassembly (->reasm_head). The reassembly context is protected by 89 + * reasm_lock, which may be acquired with the keys lock (above) held, if 90 + * necessary. Consequently, keys lock *cannot* be acquired with the 91 + * reasm_lock held. 92 + * 93 + * - there are two destruction paths for a mctp_sk_key: 94 + * 95 + * - through socket unhash (see mctp_sk_unhash). This performs the list 96 + * removal under keys_lock. 97 + * 98 + * - where a key is established to receive a reply message: after receiving 99 + * the (complete) reply, or during reassembly errors. Here, we clean up 100 + * the reassembly context (marking reasm_dead, to prevent another from 101 + * starting), and remove the socket from the netns & socket lists. 90 102 */ 91 103 struct mctp_sk_key { 92 104 mctp_eid_t peer_addr; ··· 113 101 114 102 /* per-socket list */ 115 103 struct hlist_node sklist; 104 + 105 + /* incoming fragment reassembly context */ 106 + spinlock_t reasm_lock; 107 + struct sk_buff *reasm_head; 108 + struct sk_buff **reasm_tailp; 109 + bool reasm_dead; 110 + u8 last_seq; 116 111 117 112 struct rcu_head rcu; 118 113 };
+8
net/mctp/af_mctp.c
··· 263 263 hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { 264 264 hlist_del_rcu(&key->sklist); 265 265 hlist_del_rcu(&key->hlist); 266 + 267 + spin_lock(&key->reasm_lock); 268 + if (key->reasm_head) 269 + kfree_skb(key->reasm_head); 270 + key->reasm_head = NULL; 271 + key->reasm_dead = true; 272 + spin_unlock(&key->reasm_lock); 273 + 266 274 kfree_rcu(key, rcu); 267 275 } 268 276 spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
+333 -43
net/mctp/route.c
··· 23 23 #include <net/netlink.h> 24 24 #include <net/sock.h> 25 25 26 + static const unsigned int mctp_message_maxlen = 64 * 1024; 27 + 26 28 /* route output callbacks */ 27 29 static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) 28 30 { ··· 107 105 return ret; 108 106 } 109 107 108 + static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, 109 + mctp_eid_t local, mctp_eid_t peer, 110 + u8 tag, gfp_t gfp) 111 + { 112 + struct mctp_sk_key *key; 113 + 114 + key = kzalloc(sizeof(*key), gfp); 115 + if (!key) 116 + return NULL; 117 + 118 + key->peer_addr = peer; 119 + key->local_addr = local; 120 + key->tag = tag; 121 + key->sk = &msk->sk; 122 + spin_lock_init(&key->reasm_lock); 123 + 124 + return key; 125 + } 126 + 127 + static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) 128 + { 129 + struct net *net = sock_net(&msk->sk); 130 + struct mctp_sk_key *tmp; 131 + unsigned long flags; 132 + int rc = 0; 133 + 134 + spin_lock_irqsave(&net->mctp.keys_lock, flags); 135 + 136 + hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { 137 + if (mctp_key_match(tmp, key->local_addr, key->peer_addr, 138 + key->tag)) { 139 + rc = -EEXIST; 140 + break; 141 + } 142 + } 143 + 144 + if (!rc) { 145 + hlist_add_head(&key->hlist, &net->mctp.keys); 146 + hlist_add_head(&key->sklist, &msk->keys); 147 + } 148 + 149 + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); 150 + 151 + return rc; 152 + } 153 + 154 + /* Must be called with key->reasm_lock, which it will release. Will schedule 155 + * the key for an RCU free. 
156 + */ 157 + static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, 158 + unsigned long flags) 159 + __releases(&key->reasm_lock) 160 + { 161 + struct sk_buff *skb; 162 + 163 + skb = key->reasm_head; 164 + key->reasm_head = NULL; 165 + key->reasm_dead = true; 166 + spin_unlock_irqrestore(&key->reasm_lock, flags); 167 + 168 + spin_lock_irqsave(&net->mctp.keys_lock, flags); 169 + hlist_del_rcu(&key->hlist); 170 + hlist_del_rcu(&key->sklist); 171 + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); 172 + kfree_rcu(key, rcu); 173 + 174 + if (skb) 175 + kfree_skb(skb); 176 + } 177 + 178 + static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) 179 + { 180 + struct mctp_hdr *hdr = mctp_hdr(skb); 181 + u8 exp_seq, this_seq; 182 + 183 + this_seq = (hdr->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT) 184 + & MCTP_HDR_SEQ_MASK; 185 + 186 + if (!key->reasm_head) { 187 + key->reasm_head = skb; 188 + key->reasm_tailp = &(skb_shinfo(skb)->frag_list); 189 + key->last_seq = this_seq; 190 + return 0; 191 + } 192 + 193 + exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK; 194 + 195 + if (this_seq != exp_seq) 196 + return -EINVAL; 197 + 198 + if (key->reasm_head->len + skb->len > mctp_message_maxlen) 199 + return -EINVAL; 200 + 201 + skb->next = NULL; 202 + skb->sk = NULL; 203 + *key->reasm_tailp = skb; 204 + key->reasm_tailp = &skb->next; 205 + 206 + key->last_seq = this_seq; 207 + 208 + key->reasm_head->data_len += skb->len; 209 + key->reasm_head->len += skb->len; 210 + key->reasm_head->truesize += skb->truesize; 211 + 212 + return 0; 213 + } 214 + 110 215 static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) 111 216 { 112 217 struct net *net = dev_net(skb->dev); 113 218 struct mctp_sk_key *key; 114 219 struct mctp_sock *msk; 115 220 struct mctp_hdr *mh; 221 + unsigned long f; 222 + u8 tag, flags; 223 + int rc; 116 224 117 225 msk = NULL; 226 + rc = -EINVAL; 118 227 119 228 /* we may be receiving a locally-routed packet; drop 
source sk 120 229 * accounting ··· 234 121 235 122 /* ensure we have enough data for a header and a type */ 236 123 if (skb->len < sizeof(struct mctp_hdr) + 1) 237 - goto drop; 124 + goto out; 238 125 239 126 /* grab header, advance data ptr */ 240 127 mh = mctp_hdr(skb); 241 128 skb_pull(skb, sizeof(struct mctp_hdr)); 242 129 243 130 if (mh->ver != 1) 244 - goto drop; 131 + goto out; 245 132 246 - /* TODO: reassembly */ 247 - if ((mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM)) 248 - != (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM)) 249 - goto drop; 133 + flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM); 134 + tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); 250 135 251 136 rcu_read_lock(); 252 - /* 1. lookup socket matching (src,dest,tag) */ 137 + 138 + /* lookup socket / reasm context, exactly matching (src,dest,tag) */ 253 139 key = mctp_lookup_key(net, skb, mh->src); 254 140 255 - /* 2. lookup socket macthing (BCAST,dest,tag) */ 256 - if (!key) 257 - key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY); 141 + if (flags & MCTP_HDR_FLAG_SOM) { 142 + if (key) { 143 + msk = container_of(key->sk, struct mctp_sock, sk); 144 + } else { 145 + /* first response to a broadcast? do a more general 146 + * key lookup to find the socket, but don't use this 147 + * key for reassembly - we'll create a more specific 148 + * one for future packets if required (ie, !EOM). 149 + */ 150 + key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY); 151 + if (key) { 152 + msk = container_of(key->sk, 153 + struct mctp_sock, sk); 154 + key = NULL; 155 + } 156 + } 258 157 259 - /* 3. SOM? -> lookup bound socket, conditionally (!EOM) create 260 - * mapping for future (1)/(2). 
261 - */ 262 - if (key) 263 - msk = container_of(key->sk, struct mctp_sock, sk); 264 - else if (!msk && (mh->flags_seq_tag & MCTP_HDR_FLAG_SOM)) 265 - msk = mctp_lookup_bind(net, skb); 158 + if (!key && !msk && (tag & MCTP_HDR_FLAG_TO)) 159 + msk = mctp_lookup_bind(net, skb); 266 160 267 - if (!msk) 268 - goto unlock_drop; 161 + if (!msk) { 162 + rc = -ENOENT; 163 + goto out_unlock; 164 + } 269 165 270 - sock_queue_rcv_skb(&msk->sk, skb); 166 + /* single-packet message? deliver to socket, clean up any 167 + * pending key. 168 + */ 169 + if (flags & MCTP_HDR_FLAG_EOM) { 170 + sock_queue_rcv_skb(&msk->sk, skb); 171 + if (key) { 172 + spin_lock_irqsave(&key->reasm_lock, f); 173 + /* we've hit a pending reassembly; not much we 174 + * can do but drop it 175 + */ 176 + __mctp_key_unlock_drop(key, net, f); 177 + } 178 + rc = 0; 179 + goto out_unlock; 180 + } 271 181 182 + /* broadcast response or a bind() - create a key for further 183 + * packets for this message 184 + */ 185 + if (!key) { 186 + key = mctp_key_alloc(msk, mh->dest, mh->src, 187 + tag, GFP_ATOMIC); 188 + if (!key) { 189 + rc = -ENOMEM; 190 + goto out_unlock; 191 + } 192 + 193 + /* we can queue without the reasm lock here, as the 194 + * key isn't observable yet 195 + */ 196 + mctp_frag_queue(key, skb); 197 + 198 + /* if the key_add fails, we've raced with another 199 + * SOM packet with the same src, dest and tag. There's 200 + * no way to distinguish future packets, so all we 201 + * can do is drop; we'll free the skb on exit from 202 + * this function. 203 + */ 204 + rc = mctp_key_add(key, msk); 205 + if (rc) 206 + kfree(key); 207 + 208 + } else { 209 + /* existing key: start reassembly */ 210 + spin_lock_irqsave(&key->reasm_lock, f); 211 + 212 + if (key->reasm_head || key->reasm_dead) { 213 + /* duplicate start? 
drop everything */ 214 + __mctp_key_unlock_drop(key, net, f); 215 + rc = -EEXIST; 216 + } else { 217 + rc = mctp_frag_queue(key, skb); 218 + spin_unlock_irqrestore(&key->reasm_lock, f); 219 + } 220 + } 221 + 222 + } else if (key) { 223 + /* this packet continues a previous message; reassemble 224 + * using the message-specific key 225 + */ 226 + 227 + spin_lock_irqsave(&key->reasm_lock, f); 228 + 229 + /* we need to be continuing an existing reassembly... */ 230 + if (!key->reasm_head) 231 + rc = -EINVAL; 232 + else 233 + rc = mctp_frag_queue(key, skb); 234 + 235 + /* end of message? deliver to socket, and we're done with 236 + * the reassembly/response key 237 + */ 238 + if (!rc && flags & MCTP_HDR_FLAG_EOM) { 239 + sock_queue_rcv_skb(key->sk, key->reasm_head); 240 + key->reasm_head = NULL; 241 + __mctp_key_unlock_drop(key, net, f); 242 + } else { 243 + spin_unlock_irqrestore(&key->reasm_lock, f); 244 + } 245 + 246 + } else { 247 + /* not a start, no matching key */ 248 + rc = -ENOENT; 249 + } 250 + 251 + out_unlock: 272 252 rcu_read_unlock(); 253 + out: 254 + if (rc) 255 + kfree_skb(skb); 256 + return rc; 257 + } 273 258 274 - return 0; 275 - 276 - unlock_drop: 277 - rcu_read_unlock(); 278 - drop: 279 - kfree_skb(skb); 280 - return 0; 259 + static unsigned int mctp_route_mtu(struct mctp_route *rt) 260 + { 261 + return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu); 281 262 } 282 263 283 264 static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) ··· 435 228 436 229 lockdep_assert_held(&mns->keys_lock); 437 230 438 - key->sk = &msk->sk; 439 - 440 231 /* we hold the net->key_lock here, allowing updates to both 441 232 * then net and sk 442 233 */ ··· 456 251 u8 tagbits; 457 252 458 253 /* be optimistic, alloc now */ 459 - key = kzalloc(sizeof(*key), GFP_KERNEL); 254 + key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL); 460 255 if (!key) 461 256 return -ENOMEM; 462 - key->local_addr = saddr; 463 - key->peer_addr = daddr; 464 257 465 258 /* 8 
possible tag values */ 466 259 tagbits = 0xff; ··· 543 340 return rc; 544 341 } 545 342 343 + static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, 344 + unsigned int mtu, u8 tag) 345 + { 346 + const unsigned int hlen = sizeof(struct mctp_hdr); 347 + struct mctp_hdr *hdr, *hdr2; 348 + unsigned int pos, size; 349 + struct sk_buff *skb2; 350 + int rc; 351 + u8 seq; 352 + 353 + hdr = mctp_hdr(skb); 354 + seq = 0; 355 + rc = 0; 356 + 357 + if (mtu < hlen + 1) { 358 + kfree_skb(skb); 359 + return -EMSGSIZE; 360 + } 361 + 362 + /* we've got the header */ 363 + skb_pull(skb, hlen); 364 + 365 + for (pos = 0; pos < skb->len;) { 366 + /* size of message payload */ 367 + size = min(mtu - hlen, skb->len - pos); 368 + 369 + skb2 = alloc_skb(MCTP_HEADER_MAXLEN + hlen + size, GFP_KERNEL); 370 + if (!skb2) { 371 + rc = -ENOMEM; 372 + break; 373 + } 374 + 375 + /* generic skb copy */ 376 + skb2->protocol = skb->protocol; 377 + skb2->priority = skb->priority; 378 + skb2->dev = skb->dev; 379 + memcpy(skb2->cb, skb->cb, sizeof(skb2->cb)); 380 + 381 + if (skb->sk) 382 + skb_set_owner_w(skb2, skb->sk); 383 + 384 + /* establish packet */ 385 + skb_reserve(skb2, MCTP_HEADER_MAXLEN); 386 + skb_reset_network_header(skb2); 387 + skb_put(skb2, hlen + size); 388 + skb2->transport_header = skb2->network_header + hlen; 389 + 390 + /* copy header fields, calculate SOM/EOM flags & seq */ 391 + hdr2 = mctp_hdr(skb2); 392 + hdr2->ver = hdr->ver; 393 + hdr2->dest = hdr->dest; 394 + hdr2->src = hdr->src; 395 + hdr2->flags_seq_tag = tag & 396 + (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); 397 + 398 + if (pos == 0) 399 + hdr2->flags_seq_tag |= MCTP_HDR_FLAG_SOM; 400 + 401 + if (pos + size == skb->len) 402 + hdr2->flags_seq_tag |= MCTP_HDR_FLAG_EOM; 403 + 404 + hdr2->flags_seq_tag |= seq << MCTP_HDR_SEQ_SHIFT; 405 + 406 + /* copy message payload */ 407 + skb_copy_bits(skb, pos, skb_transport_header(skb2), size); 408 + 409 + /* do route, but don't drop the rt reference */ 410 + rc = 
rt->output(rt, skb2); 411 + if (rc) 412 + break; 413 + 414 + seq = (seq + 1) & MCTP_HDR_SEQ_MASK; 415 + pos += size; 416 + } 417 + 418 + mctp_route_release(rt); 419 + consume_skb(skb); 420 + return rc; 421 + } 422 + 546 423 int mctp_local_output(struct sock *sk, struct mctp_route *rt, 547 424 struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag) 548 425 { ··· 630 347 struct mctp_skb_cb *cb = mctp_cb(skb); 631 348 struct mctp_hdr *hdr; 632 349 unsigned long flags; 350 + unsigned int mtu; 633 351 mctp_eid_t saddr; 634 352 int rc; 635 353 u8 tag; ··· 660 376 tag = req_tag; 661 377 } 662 378 663 - /* TODO: we have the route MTU here; packetise */ 664 379 380 + skb->protocol = htons(ETH_P_MCTP); 381 + skb->priority = 0; 665 382 skb_reset_transport_header(skb); 666 383 skb_push(skb, sizeof(struct mctp_hdr)); 667 384 skb_reset_network_header(skb); 668 - hdr = mctp_hdr(skb); 669 - hdr->ver = 1; 670 - hdr->dest = daddr; 671 - hdr->src = saddr; 672 - hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | /* TODO */ 673 - tag; 674 - 675 385 skb->dev = rt->dev->dev; 676 - skb->protocol = htons(ETH_P_MCTP); 677 - skb->priority = 0; 678 386 679 387 /* cb->net will have been set on initial ingress */ 680 388 cb->src = saddr; 681 389 682 - return mctp_do_route(rt, skb); 390 + /* set up common header fields */ 391 + hdr = mctp_hdr(skb); 392 + hdr->ver = 1; 393 + hdr->dest = daddr; 394 + hdr->src = saddr; 395 + 396 + mtu = mctp_route_mtu(rt); 397 + 398 + if (skb->len + sizeof(struct mctp_hdr) <= mtu) { 399 + hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | 400 + tag; 401 + return mctp_do_route(rt, skb); 402 + } else { 403 + return mctp_do_fragment_route(rt, skb, mtu, tag); 404 + } 683 405 } 684 406 685 407 /* route management */