Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rds: Enable RDS IPv6 support

This patch enables RDS to use IPv6 addresses. For RDS/TCP, the
listener is now an IPv6 endpoint which accepts both IPv4 and IPv6
connection requests. RDS/RDMA/IB uses a private data (struct
rds_ib_connect_private) exchange between endpoints at RDS connection
establishment time to support RDMA. This private data exchange uses a
32-bit integer to represent an IP address. This needs to be changed in
order to support IPv6. A new private data struct
rds6_ib_connect_private is introduced to handle this. To ensure
backward compatibility, an IPv6 capable RDS stack uses another RDMA
listener port (RDS_CM_PORT) to accept IPv6 connections, while it
continues to use the original RDS_PORT for IPv4 RDS connections. When
it needs to communicate with an IPv6 peer, it uses RDS_CM_PORT to
send the connection setup request.

v5: Fixed syntax problem (David Miller).

v4: Changed port history comments in rds.h (Sowmini Varadhan).

v3: Added support to set up IPv4 connection using mapped address
(David Miller).
Added support to set up connections between link-local and
non-link-local addresses.
Various review comments from Santosh Shilimkar and Sowmini Varadhan.

v2: Fixed bound and peer address scope mismatch issue.
Added back rds_connect() IPv6 changes.

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Ka-Cheong Poon and committed by
David S. Miller
1e2b44e7 eee2fa6a

+459 -114
+77 -14
net/rds/af_rds.c
··· 142 142 uaddr_len = sizeof(*sin6); 143 143 } 144 144 } else { 145 - /* If socket is not yet bound, set the return address family 146 - * to be AF_UNSPEC (value 0) and the address size to be that 147 - * of an IPv4 address. 145 + /* If socket is not yet bound and the socket is connected, 146 + * set the return address family to be the same as the 147 + * connected address, but with 0 address value. If it is not 148 + * connected, set the family to be AF_UNSPEC (value 0) and 149 + * the address size to be that of an IPv4 address. 148 150 */ 149 151 if (ipv6_addr_any(&rs->rs_bound_addr)) { 150 - sin = (struct sockaddr_in *)uaddr; 151 - memset(sin, 0, sizeof(*sin)); 152 - sin->sin_family = AF_UNSPEC; 153 - return sizeof(*sin); 152 + if (ipv6_addr_any(&rs->rs_conn_addr)) { 153 + sin = (struct sockaddr_in *)uaddr; 154 + memset(sin, 0, sizeof(*sin)); 155 + sin->sin_family = AF_UNSPEC; 156 + return sizeof(*sin); 157 + } 158 + 159 + if (ipv6_addr_type(&rs->rs_conn_addr) & 160 + IPV6_ADDR_MAPPED) { 161 + sin = (struct sockaddr_in *)uaddr; 162 + memset(sin, 0, sizeof(*sin)); 163 + sin->sin_family = AF_INET; 164 + return sizeof(*sin); 165 + } 166 + 167 + sin6 = (struct sockaddr_in6 *)uaddr; 168 + memset(sin6, 0, sizeof(*sin6)); 169 + sin6->sin6_family = AF_INET6; 170 + return sizeof(*sin6); 154 171 } 155 172 if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { 156 173 sin = (struct sockaddr_in *)uaddr; ··· 501 484 { 502 485 struct sock *sk = sock->sk; 503 486 struct sockaddr_in *sin; 487 + struct sockaddr_in6 *sin6; 504 488 struct rds_sock *rs = rds_sk_to_rs(sk); 489 + int addr_type; 505 490 int ret = 0; 506 491 507 492 lock_sock(sk); 508 493 509 - switch (addr_len) { 510 - case sizeof(struct sockaddr_in): 494 + switch (uaddr->sa_family) { 495 + case AF_INET: 511 496 sin = (struct sockaddr_in *)uaddr; 512 - if (sin->sin_family != AF_INET) { 513 - ret = -EAFNOSUPPORT; 497 + if (addr_len < sizeof(struct sockaddr_in)) { 498 + ret = -EINVAL; 514 499 break; 515 500 } 516 501 if 
(sin->sin_addr.s_addr == htonl(INADDR_ANY)) { ··· 528 509 rs->rs_conn_port = sin->sin_port; 529 510 break; 530 511 531 - case sizeof(struct sockaddr_in6): 532 - ret = -EPROTONOSUPPORT; 512 + case AF_INET6: 513 + sin6 = (struct sockaddr_in6 *)uaddr; 514 + if (addr_len < sizeof(struct sockaddr_in6)) { 515 + ret = -EINVAL; 516 + break; 517 + } 518 + addr_type = ipv6_addr_type(&sin6->sin6_addr); 519 + if (!(addr_type & IPV6_ADDR_UNICAST)) { 520 + __be32 addr4; 521 + 522 + if (!(addr_type & IPV6_ADDR_MAPPED)) { 523 + ret = -EPROTOTYPE; 524 + break; 525 + } 526 + 527 + /* It is a mapped address. Need to do some sanity 528 + * checks. 529 + */ 530 + addr4 = sin6->sin6_addr.s6_addr32[3]; 531 + if (addr4 == htonl(INADDR_ANY) || 532 + addr4 == htonl(INADDR_BROADCAST) || 533 + IN_MULTICAST(ntohl(addr4))) { 534 + ret = -EPROTOTYPE; 535 + break; 536 + } 537 + } 538 + 539 + if (addr_type & IPV6_ADDR_LINKLOCAL) { 540 + /* If socket is arleady bound to a link local address, 541 + * the peer address must be on the same link. 542 + */ 543 + if (sin6->sin6_scope_id == 0 || 544 + (!ipv6_addr_any(&rs->rs_bound_addr) && 545 + rs->rs_bound_scope_id && 546 + sin6->sin6_scope_id != rs->rs_bound_scope_id)) { 547 + ret = -EINVAL; 548 + break; 549 + } 550 + /* Remember the connected address scope ID. It will 551 + * be checked against the binding local address when 552 + * the socket is bound. 553 + */ 554 + rs->rs_bound_scope_id = sin6->sin6_scope_id; 555 + } 556 + rs->rs_conn_addr = sin6->sin6_addr; 557 + rs->rs_conn_port = sin6->sin6_port; 533 558 break; 534 559 535 560 default: 536 - ret = -EINVAL; 561 + ret = -EAFNOSUPPORT; 537 562 break; 538 563 } 539 564
+50 -9
net/rds/bind.c
··· 127 127 if (!rhashtable_insert_fast(&bind_hash_table, 128 128 &rs->rs_bound_node, ht_parms)) { 129 129 *port = rs->rs_bound_port; 130 + rs->rs_bound_scope_id = scope_id; 130 131 ret = 0; 131 - rdsdebug("rs %p binding to %pI4:%d\n", 132 - rs, &addr, (int)ntohs(*port)); 132 + rdsdebug("rs %p binding to %pI6c:%d\n", 133 + rs, addr, (int)ntohs(*port)); 133 134 break; 134 135 } else { 135 136 rs->rs_bound_addr = in6addr_any; ··· 165 164 struct in6_addr v6addr, *binding_addr; 166 165 struct rds_transport *trans; 167 166 __u32 scope_id = 0; 167 + int addr_type; 168 168 int ret = 0; 169 169 __be16 port; 170 170 171 - /* We only allow an RDS socket to be bound to an IPv4 address. IPv6 172 - * address support will be added later. 171 + /* We allow an RDS socket to be bound to either IPv4 or IPv6 172 + * address. 173 173 */ 174 - if (addr_len == sizeof(struct sockaddr_in)) { 174 + if (uaddr->sa_family == AF_INET) { 175 175 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 176 176 177 - if (sin->sin_family != AF_INET || 178 - sin->sin_addr.s_addr == htonl(INADDR_ANY)) 177 + if (addr_len < sizeof(struct sockaddr_in) || 178 + sin->sin_addr.s_addr == htonl(INADDR_ANY) || 179 + sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || 180 + IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 179 181 return -EINVAL; 180 182 ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); 181 183 binding_addr = &v6addr; 182 184 port = sin->sin_port; 183 - } else if (addr_len == sizeof(struct sockaddr_in6)) { 184 - return -EPROTONOSUPPORT; 185 + } else if (uaddr->sa_family == AF_INET6) { 186 + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; 187 + 188 + if (addr_len < sizeof(struct sockaddr_in6)) 189 + return -EINVAL; 190 + addr_type = ipv6_addr_type(&sin6->sin6_addr); 191 + if (!(addr_type & IPV6_ADDR_UNICAST)) { 192 + __be32 addr4; 193 + 194 + if (!(addr_type & IPV6_ADDR_MAPPED)) 195 + return -EINVAL; 196 + 197 + /* It is a mapped address. Need to do some sanity 198 + * checks. 
199 + */ 200 + addr4 = sin6->sin6_addr.s6_addr32[3]; 201 + if (addr4 == htonl(INADDR_ANY) || 202 + addr4 == htonl(INADDR_BROADCAST) || 203 + IN_MULTICAST(ntohl(addr4))) 204 + return -EINVAL; 205 + } 206 + /* The scope ID must be specified for link local address. */ 207 + if (addr_type & IPV6_ADDR_LINKLOCAL) { 208 + if (sin6->sin6_scope_id == 0) 209 + return -EINVAL; 210 + scope_id = sin6->sin6_scope_id; 211 + } 212 + binding_addr = &sin6->sin6_addr; 213 + port = sin6->sin6_port; 185 214 } else { 186 215 return -EINVAL; 187 216 } ··· 219 188 220 189 /* RDS socket does not allow re-binding. */ 221 190 if (!ipv6_addr_any(&rs->rs_bound_addr)) { 191 + ret = -EINVAL; 192 + goto out; 193 + } 194 + /* Socket is connected. The binding address should have the same 195 + * scope ID as the connected address, except the case when one is 196 + * non-link local address (scope_id is 0). 197 + */ 198 + if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id && 199 + rs->rs_bound_scope_id && 200 + scope_id != rs->rs_bound_scope_id) { 222 201 ret = -EINVAL; 223 202 goto out; 224 203 }
+39 -15
net/rds/connection.c
··· 1 1 /* 2 - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 2 + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 36 36 #include <linux/export.h> 37 37 #include <net/ipv6.h> 38 38 #include <net/inet6_hashtables.h> 39 + #include <net/addrconf.h> 39 40 40 41 #include "rds.h" 41 42 #include "loop.h" ··· 201 200 conn->c_isv6 = !ipv6_addr_v4mapped(laddr); 202 201 conn->c_faddr = *faddr; 203 202 conn->c_dev_if = dev_if; 203 + /* If the local address is link local, set c_bound_if to be the 204 + * index used for this connection. Otherwise, set it to 0 as 205 + * the socket is not bound to an interface. c_bound_if is used 206 + * to look up a socket when a packet is received 207 + */ 208 + if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL) 209 + conn->c_bound_if = dev_if; 210 + else 211 + conn->c_bound_if = 0; 204 212 205 213 rds_conn_net_set(conn, net); 206 214 ··· 496 486 } 497 487 EXPORT_SYMBOL_GPL(rds_conn_destroy); 498 488 499 - static void rds_conn_message_info(struct socket *sock, unsigned int len, 500 - struct rds_info_iterator *iter, 501 - struct rds_info_lengths *lens, 502 - int want_send) 489 + static void __rds_inc_msg_cp(struct rds_incoming *inc, 490 + struct rds_info_iterator *iter, 491 + void *saddr, void *daddr, int flip) 492 + { 493 + rds_inc_info_copy(inc, iter, *(__be32 *)saddr, 494 + *(__be32 *)daddr, flip); 495 + } 496 + 497 + static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, 498 + struct rds_info_iterator *iter, 499 + struct rds_info_lengths *lens, 500 + int want_send) 503 501 { 504 502 struct hlist_head *head; 505 503 struct list_head *list; ··· 542 524 543 525 /* XXX too lazy to maintain counts.. 
*/ 544 526 list_for_each_entry(rm, list, m_conn_item) { 545 - __be32 laddr; 546 - __be32 faddr; 547 - 548 527 total++; 549 - laddr = conn->c_laddr.s6_addr32[3]; 550 - faddr = conn->c_faddr.s6_addr32[3]; 551 528 if (total <= len) 552 - rds_inc_info_copy(&rm->m_inc, 553 - iter, 554 - laddr, 555 - faddr, 556 - 0); 529 + __rds_inc_msg_cp(&rm->m_inc, 530 + iter, 531 + &conn->c_laddr, 532 + &conn->c_faddr, 533 + 0); 557 534 } 558 535 559 536 spin_unlock_irqrestore(&cp->cp_lock, flags); ··· 559 546 560 547 lens->nr = total; 561 548 lens->each = sizeof(struct rds_info_message); 549 + } 550 + 551 + static void rds_conn_message_info(struct socket *sock, unsigned int len, 552 + struct rds_info_iterator *iter, 553 + struct rds_info_lengths *lens, 554 + int want_send) 555 + { 556 + rds_conn_message_info_cmn(sock, len, iter, lens, want_send); 562 557 } 563 558 564 559 static void rds_conn_message_info_send(struct socket *sock, unsigned int len, ··· 675 654 { 676 655 struct rds_info_connection *cinfo = buffer; 677 656 struct rds_connection *conn = cp->cp_conn; 657 + 658 + if (conn->c_isv6) 659 + return 0; 678 660 679 661 cinfo->next_tx_seq = cp->cp_next_tx_seq; 680 662 cinfo->next_rx_seq = cp->cp_next_rx_seq;
+47 -8
net/rds/ib.c
··· 1 1 /* 2 - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 2 + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 39 39 #include <linux/delay.h> 40 40 #include <linux/slab.h> 41 41 #include <linux/module.h> 42 + #include <net/addrconf.h> 42 43 43 44 #include "rds_single_path.h" 44 45 #include "rds.h" ··· 296 295 /* We will only ever look at IB transports */ 297 296 if (conn->c_trans != &rds_ib_transport) 298 297 return 0; 298 + if (conn->c_isv6) 299 + return 0; 299 300 300 301 iinfo->src_addr = conn->c_laddr.s6_addr32[3]; 301 302 iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; ··· 333 330 sizeof(struct rds_info_rdma_connection)); 334 331 } 335 332 336 - 337 333 /* 338 334 * Early RDS/IB was built to only bind to an address if there is an IPoIB 339 335 * device with that address set. ··· 348 346 { 349 347 int ret; 350 348 struct rdma_cm_id *cm_id; 349 + struct sockaddr_in6 sin6; 351 350 struct sockaddr_in sin; 351 + struct sockaddr *sa; 352 + bool isv4; 352 353 354 + isv4 = ipv6_addr_v4mapped(addr); 353 355 /* Create a CMA ID and try to bind it. This catches both 354 356 * IB and iWARP capable NICs. 355 357 */ ··· 362 356 if (IS_ERR(cm_id)) 363 357 return PTR_ERR(cm_id); 364 358 365 - memset(&sin, 0, sizeof(sin)); 366 - sin.sin_family = AF_INET; 367 - sin.sin_addr.s_addr = addr->s6_addr32[3]; 359 + if (isv4) { 360 + memset(&sin, 0, sizeof(sin)); 361 + sin.sin_family = AF_INET; 362 + sin.sin_addr.s_addr = addr->s6_addr32[3]; 363 + sa = (struct sockaddr *)&sin; 364 + } else { 365 + memset(&sin6, 0, sizeof(sin6)); 366 + sin6.sin6_family = AF_INET6; 367 + sin6.sin6_addr = *addr; 368 + sin6.sin6_scope_id = scope_id; 369 + sa = (struct sockaddr *)&sin6; 370 + 371 + /* XXX Do a special IPv6 link local address check here. 
The 372 + * reason is that rdma_bind_addr() always succeeds with IPv6 373 + * link local address regardless it is indeed configured in a 374 + * system. 375 + */ 376 + if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) { 377 + struct net_device *dev; 378 + 379 + if (scope_id == 0) 380 + return -EADDRNOTAVAIL; 381 + 382 + /* Use init_net for now as RDS is not network 383 + * name space aware. 384 + */ 385 + dev = dev_get_by_index(&init_net, scope_id); 386 + if (!dev) 387 + return -EADDRNOTAVAIL; 388 + if (!ipv6_chk_addr(&init_net, addr, dev, 1)) { 389 + dev_put(dev); 390 + return -EADDRNOTAVAIL; 391 + } 392 + dev_put(dev); 393 + } 394 + } 368 395 369 396 /* rdma_bind_addr will only succeed for IB & iWARP devices */ 370 - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 397 + ret = rdma_bind_addr(cm_id, sa); 371 398 /* due to this, we will claim to support iWARP devices unless we 372 399 check node_type. */ 373 400 if (ret || !cm_id->device || 374 401 cm_id->device->node_type != RDMA_NODE_IB_CA) 375 402 ret = -EADDRNOTAVAIL; 376 403 377 - rdsdebug("addr %pI6c ret %d node type %d\n", 378 - addr, ret, 404 + rdsdebug("addr %pI6c%%%u ret %d node type %d\n", 405 + addr, scope_id, ret, 379 406 cm_id->device ? cm_id->device->node_type : -1); 380 407 381 408 rdma_destroy_id(cm_id);
+15 -5
net/rds/ib_cm.c
··· 678 678 return version; 679 679 } 680 680 681 - /* Given an IPv6 address, find the IB net_device which hosts that address and 681 + /* Given an IPv6 address, find the net_device which hosts that address and 682 682 * return its index. This is used by the rds_ib_cm_handle_connect() code to 683 683 * find the interface index of where an incoming request comes from when 684 684 * the request is using a link local address. ··· 695 695 696 696 rcu_read_lock(); 697 697 for_each_netdev_rcu(net, dev) { 698 - if (dev->type == ARPHRD_INFINIBAND && 699 - ipv6_chk_addr(net, addr, dev, 0)) { 698 + if (ipv6_chk_addr(net, addr, dev, 1)) { 700 699 idx = dev->ifindex; 701 700 break; 702 701 } ··· 735 736 dp_cmn = &dp->ricp_v6.dp_cmn; 736 737 saddr6 = &dp->ricp_v6.dp_saddr; 737 738 daddr6 = &dp->ricp_v6.dp_daddr; 738 - /* If the local address is link local, need to find the 739 + /* If either address is link local, need to find the 739 740 * interface index in order to create a proper RDS 740 741 * connection. 741 742 */ 742 743 if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) { 743 744 /* Using init_net for now .. */ 745 + ifindex = __rds_find_ifindex(&init_net, daddr6); 746 + /* No index found... Need to bail out. */ 747 + if (ifindex == 0) { 748 + err = -EOPNOTSUPP; 749 + goto out; 750 + } 751 + } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) { 752 + /* Use our address to find the correct index. */ 744 753 ifindex = __rds_find_ifindex(&init_net, daddr6); 745 754 /* No index found... Need to bail out. */ 746 755 if (ifindex == 0) { ··· 893 886 894 887 /* XXX I wonder what affect the port space has */ 895 888 /* delegate cm event handler to rdma_transport */ 896 - handler = rds_rdma_cm_event_handler; 889 + if (conn->c_isv6) 890 + handler = rds6_rdma_cm_event_handler; 891 + else 892 + handler = rds_rdma_cm_event_handler; 897 893 ic->i_cm_id = rdma_create_id(&init_net, handler, conn, 898 894 RDMA_PS_TCP, IB_QPT_RC); 899 895 if (IS_ERR(ic->i_cm_id)) {
+29 -1
net/rds/rdma_transport.c
··· 37 37 #include "rdma_transport.h" 38 38 #include "ib.h" 39 39 40 + /* Global IPv4 and IPv6 RDS RDMA listener cm_id */ 40 41 static struct rdma_cm_id *rds_rdma_listen_id; 42 + static struct rdma_cm_id *rds6_rdma_listen_id; 41 43 42 44 static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, 43 45 struct rdma_cm_event *event, ··· 155 153 return rds_rdma_cm_event_handler_cmn(cm_id, event, false); 156 154 } 157 155 156 + int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 157 + struct rdma_cm_event *event) 158 + { 159 + return rds_rdma_cm_event_handler_cmn(cm_id, event, true); 160 + } 161 + 158 162 static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, 159 163 struct sockaddr *sa, 160 164 struct rdma_cm_id **ret_cm_id) ··· 214 206 static int rds_rdma_listen_init(void) 215 207 { 216 208 int ret; 209 + struct sockaddr_in6 sin6; 217 210 struct sockaddr_in sin; 218 211 219 212 sin.sin_family = PF_INET; ··· 223 214 ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler, 224 215 (struct sockaddr *)&sin, 225 216 &rds_rdma_listen_id); 226 - return ret; 217 + if (ret != 0) 218 + return ret; 219 + 220 + sin6.sin6_family = PF_INET6; 221 + sin6.sin6_addr = in6addr_any; 222 + sin6.sin6_port = htons(RDS_CM_PORT); 223 + sin6.sin6_scope_id = 0; 224 + sin6.sin6_flowinfo = 0; 225 + ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler, 226 + (struct sockaddr *)&sin6, 227 + &rds6_rdma_listen_id); 228 + /* Keep going even when IPv6 is not enabled in the system. */ 229 + if (ret != 0) 230 + rdsdebug("Cannot set up IPv6 RDMA listener\n"); 231 + return 0; 227 232 } 228 233 229 234 static void rds_rdma_listen_stop(void) ··· 246 223 rdsdebug("cm %p\n", rds_rdma_listen_id); 247 224 rdma_destroy_id(rds_rdma_listen_id); 248 225 rds_rdma_listen_id = NULL; 226 + } 227 + if (rds6_rdma_listen_id) { 228 + rdsdebug("cm %p\n", rds6_rdma_listen_id); 229 + rdma_destroy_id(rds6_rdma_listen_id); 230 + rds6_rdma_listen_id = NULL; 249 231 } 250 232 } 251 233
+5
net/rds/rdma_transport.h
··· 6 6 #include <rdma/rdma_cm.h> 7 7 #include "rds.h" 8 8 9 + /* RDMA_CM also uses 16385 as the listener port. */ 10 + #define RDS_CM_PORT 16385 11 + 9 12 #define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000 10 13 11 14 int rds_rdma_conn_connect(struct rds_connection *conn); 12 15 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 13 16 struct rdma_cm_event *event); 17 + int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 18 + struct rdma_cm_event *event); 14 19 15 20 /* from ib.c */ 16 21 extern struct rds_transport rds_ib_transport;
+14 -8
net/rds/rds.h
··· 24 24 #define RDS_PROTOCOL_MINOR(v) ((v) & 255) 25 25 #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) 26 26 27 - /* 28 - * XXX randomly chosen, but at least seems to be unused: 29 - * # 18464-18768 Unassigned 30 - * We should do better. We want a reserved port to discourage unpriv'ed 31 - * userspace from listening. 27 + /* The following ports, 16385, 18634, 18635, are registered with IANA as 28 + * the ports to be used for RDS over TCP and UDP. Currently, only RDS over 29 + * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value 30 + * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After 31 + * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept 32 + * to ensure compatibility with older RDS modules. Those ports are defined 33 + * in each transport's header file. 32 34 */ 33 35 #define RDS_PORT 18634 34 - #define RDS_CM_PORT 16385 35 36 36 37 #ifdef ATOMIC64_INIT 37 38 #define KERNEL_HAS_ATOMIC64 ··· 141 140 struct hlist_node c_hash_node; 142 141 struct in6_addr c_laddr; 143 142 struct in6_addr c_faddr; 144 - int c_dev_if; /* c_laddrs's interface index */ 143 + int c_dev_if; /* ifindex used for this conn */ 144 + int c_bound_if; /* ifindex of c_laddr */ 145 145 unsigned int c_loopback:1, 146 146 c_isv6:1, 147 147 c_ping_triggered:1, ··· 738 736 void rds_cong_exit(void); 739 737 struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); 740 738 741 - /* conn.c */ 739 + /* connection.c */ 742 740 extern u32 rds_gen_num; 743 741 int rds_conn_init(void); 744 742 void rds_conn_exit(void); ··· 876 874 void rds_inc_info_copy(struct rds_incoming *inc, 877 875 struct rds_info_iterator *iter, 878 876 __be32 saddr, __be32 daddr, int flip); 877 + void rds6_inc_info_copy(struct rds_incoming *inc, 878 + struct rds_info_iterator *iter, 879 + struct in6_addr *saddr, struct in6_addr *daddr, 880 + int flip); 879 881 880 882 /* send.c */ 881 883 int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t 
payload_len);
+1 -1
net/rds/recv.c
··· 364 364 goto out; 365 365 } 366 366 367 - rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_dev_if); 367 + rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if); 368 368 if (!rs) { 369 369 rds_stats_inc(s_recv_drop_no_sock); 370 370 goto out;
+54 -7
net/rds/send.c
··· 1091 1091 ret = -EINVAL; 1092 1092 goto out; 1093 1093 } 1094 - switch (namelen) { 1095 - case sizeof(*usin): 1096 - if (usin->sin_family != AF_INET || 1097 - usin->sin_addr.s_addr == htonl(INADDR_ANY) || 1094 + switch (usin->sin_family) { 1095 + case AF_INET: 1096 + if (usin->sin_addr.s_addr == htonl(INADDR_ANY) || 1098 1097 usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || 1099 1098 IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) { 1100 1099 ret = -EINVAL; ··· 1103 1104 dport = usin->sin_port; 1104 1105 break; 1105 1106 1106 - case sizeof(*sin6): { 1107 - ret = -EPROTONOSUPPORT; 1108 - goto out; 1107 + case AF_INET6: { 1108 + int addr_type; 1109 + 1110 + if (namelen < sizeof(*sin6)) { 1111 + ret = -EINVAL; 1112 + goto out; 1113 + } 1114 + addr_type = ipv6_addr_type(&sin6->sin6_addr); 1115 + if (!(addr_type & IPV6_ADDR_UNICAST)) { 1116 + __be32 addr4; 1117 + 1118 + if (!(addr_type & IPV6_ADDR_MAPPED)) { 1119 + ret = -EINVAL; 1120 + goto out; 1121 + } 1122 + 1123 + /* It is a mapped address. Need to do some 1124 + * sanity checks. 1125 + */ 1126 + addr4 = sin6->sin6_addr.s6_addr32[3]; 1127 + if (addr4 == htonl(INADDR_ANY) || 1128 + addr4 == htonl(INADDR_BROADCAST) || 1129 + IN_MULTICAST(ntohl(addr4))) { 1130 + return -EINVAL; 1131 + goto out; 1132 + } 1133 + } 1134 + if (addr_type & IPV6_ADDR_LINKLOCAL) { 1135 + if (sin6->sin6_scope_id == 0) { 1136 + ret = -EINVAL; 1137 + goto out; 1138 + } 1139 + scope_id = sin6->sin6_scope_id; 1140 + } 1141 + 1142 + daddr = sin6->sin6_addr; 1143 + dport = sin6->sin6_port; 1144 + break; 1109 1145 } 1110 1146 1111 1147 default: ··· 1171 1137 release_sock(sk); 1172 1138 ret = -EOPNOTSUPP; 1173 1139 goto out; 1140 + } 1141 + /* If the socket is already bound to a link local address, 1142 + * it can only send to peers on the same link. But allow 1143 + * communicating beween link local and non-link local address. 
1144 + */ 1145 + if (scope_id != rs->rs_bound_scope_id) { 1146 + if (!scope_id) { 1147 + scope_id = rs->rs_bound_scope_id; 1148 + } else if (rs->rs_bound_scope_id) { 1149 + release_sock(sk); 1150 + ret = -EINVAL; 1151 + goto out; 1152 + } 1174 1153 } 1175 1154 } 1176 1155 release_sock(sk);
+34 -20
net/rds/tcp.c
··· 1 1 /* 2 - * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 2 + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 46 46 /* only for info exporting */ 47 47 static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); 48 48 static LIST_HEAD(rds_tcp_tc_list); 49 + 50 + /* rds_tcp_tc_count counts only IPv4 connections. 51 + * rds6_tcp_tc_count counts both IPv4 and IPv6 connections. 52 + */ 49 53 static unsigned int rds_tcp_tc_count; 54 + static unsigned int rds6_tcp_tc_count; 50 55 51 56 /* Track rds_tcp_connection structs so they can be cleaned up */ 52 57 static DEFINE_SPINLOCK(rds_tcp_conn_lock); ··· 118 113 /* done under the callback_lock to serialize with write_space */ 119 114 spin_lock(&rds_tcp_tc_list_lock); 120 115 list_del_init(&tc->t_list_item); 121 - rds_tcp_tc_count--; 116 + rds6_tcp_tc_count--; 117 + if (!tc->t_cpath->cp_conn->c_isv6) 118 + rds_tcp_tc_count--; 122 119 spin_unlock(&rds_tcp_tc_list_lock); 123 120 124 121 tc->t_sock = NULL; ··· 207 200 /* done under the callback_lock to serialize with write_space */ 208 201 spin_lock(&rds_tcp_tc_list_lock); 209 202 list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); 210 - rds_tcp_tc_count++; 203 + rds6_tcp_tc_count++; 204 + if (!tc->t_cpath->cp_conn->c_isv6) 205 + rds_tcp_tc_count++; 211 206 spin_unlock(&rds_tcp_tc_list_lock); 212 207 213 208 /* accepted sockets need our listen data ready undone */ ··· 230 221 write_unlock_bh(&sock->sk->sk_callback_lock); 231 222 } 232 223 224 + /* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4 225 + * connections for backward compatibility. 
226 + */ 233 227 static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, 234 228 struct rds_info_iterator *iter, 235 229 struct rds_info_lengths *lens) ··· 240 228 struct rds_info_tcp_socket tsinfo; 241 229 struct rds_tcp_connection *tc; 242 230 unsigned long flags; 243 - struct sockaddr_in sin; 244 - struct socket *sock; 245 231 246 232 spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); 247 233 ··· 247 237 goto out; 248 238 249 239 list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { 240 + struct inet_sock *inet = inet_sk(tc->t_sock->sk); 250 241 251 - sock = tc->t_sock; 252 - if (sock) { 253 - sock->ops->getname(sock, (struct sockaddr *)&sin, 0); 254 - tsinfo.local_addr = sin.sin_addr.s_addr; 255 - tsinfo.local_port = sin.sin_port; 256 - sock->ops->getname(sock, (struct sockaddr *)&sin, 1); 257 - tsinfo.peer_addr = sin.sin_addr.s_addr; 258 - tsinfo.peer_port = sin.sin_port; 259 - } 242 + if (tc->t_cpath->cp_conn->c_isv6) 243 + continue; 244 + 245 + tsinfo.local_addr = inet->inet_saddr; 246 + tsinfo.local_port = inet->inet_sport; 247 + tsinfo.peer_addr = inet->inet_daddr; 248 + tsinfo.peer_port = inet->inet_dport; 260 249 261 250 tsinfo.hdr_rem = tc->t_tinc_hdr_rem; 262 251 tsinfo.data_rem = tc->t_tinc_data_rem; ··· 503 494 err = -ENOMEM; 504 495 goto fail; 505 496 } 506 - rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); 497 + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true); 507 498 if (!rtn->rds_tcp_listen_sock) { 508 - pr_warn("could not set up listen sock\n"); 509 - unregister_net_sysctl_table(rtn->rds_tcp_sysctl); 510 - rtn->rds_tcp_sysctl = NULL; 511 - err = -EAFNOSUPPORT; 512 - goto fail; 499 + pr_warn("could not set up IPv6 listen sock\n"); 500 + 501 + /* Try IPv4 as some systems disable IPv6 */ 502 + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false); 503 + if (!rtn->rds_tcp_listen_sock) { 504 + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); 505 + rtn->rds_tcp_sysctl = NULL; 506 + err = -EAFNOSUPPORT; 507 + goto 
fail; 508 + } 513 509 } 514 510 INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); 515 511 return 0;
+1 -1
net/rds/tcp.h
··· 67 67 void rds_tcp_state_change(struct sock *sk); 68 68 69 69 /* tcp_listen.c */ 70 - struct socket *rds_tcp_listen_init(struct net *); 70 + struct socket *rds_tcp_listen_init(struct net *net, bool isv6); 71 71 void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); 72 72 void rds_tcp_listen_data_ready(struct sock *sk); 73 73 int rds_tcp_accept_one(struct socket *sock);
+42 -12
net/rds/tcp_connect.c
··· 89 89 int rds_tcp_conn_path_connect(struct rds_conn_path *cp) 90 90 { 91 91 struct socket *sock = NULL; 92 + struct sockaddr_in6 sin6; 92 93 struct sockaddr_in sin; 93 94 struct sockaddr *addr; 94 95 int addrlen; 96 + bool isv6; 95 97 int ret; 96 98 struct rds_connection *conn = cp->cp_conn; 97 99 struct rds_tcp_connection *tc = cp->cp_transport_data; ··· 110 108 mutex_unlock(&tc->t_conn_path_lock); 111 109 return 0; 112 110 } 113 - ret = sock_create_kern(rds_conn_net(conn), PF_INET, 114 - SOCK_STREAM, IPPROTO_TCP, &sock); 111 + if (ipv6_addr_v4mapped(&conn->c_laddr)) { 112 + ret = sock_create_kern(rds_conn_net(conn), PF_INET, 113 + SOCK_STREAM, IPPROTO_TCP, &sock); 114 + isv6 = false; 115 + } else { 116 + ret = sock_create_kern(rds_conn_net(conn), PF_INET6, 117 + SOCK_STREAM, IPPROTO_TCP, &sock); 118 + isv6 = true; 119 + } 120 + 115 121 if (ret < 0) 116 122 goto out; 117 123 118 124 rds_tcp_tune(sock); 119 125 120 - sin.sin_family = AF_INET; 121 - sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; 122 - sin.sin_port = 0; 123 - addr = (struct sockaddr *)&sin; 124 - addrlen = sizeof(sin); 126 + if (isv6) { 127 + sin6.sin6_family = AF_INET6; 128 + sin6.sin6_addr = conn->c_laddr; 129 + sin6.sin6_port = 0; 130 + sin6.sin6_flowinfo = 0; 131 + sin6.sin6_scope_id = conn->c_dev_if; 132 + addr = (struct sockaddr *)&sin6; 133 + addrlen = sizeof(sin6); 134 + } else { 135 + sin.sin_family = AF_INET; 136 + sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; 137 + sin.sin_port = 0; 138 + addr = (struct sockaddr *)&sin; 139 + addrlen = sizeof(sin); 140 + } 125 141 126 142 ret = sock->ops->bind(sock, addr, addrlen); 127 143 if (ret) { ··· 148 128 goto out; 149 129 } 150 130 151 - sin.sin_family = AF_INET; 152 - sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; 153 - sin.sin_port = htons(RDS_TCP_PORT); 154 - addr = (struct sockaddr *)&sin; 155 - addrlen = sizeof(sin); 131 + if (isv6) { 132 + sin6.sin6_family = AF_INET6; 133 + sin6.sin6_addr = conn->c_faddr; 134 + sin6.sin6_port = 
htons(RDS_TCP_PORT); 135 + sin6.sin6_flowinfo = 0; 136 + sin6.sin6_scope_id = conn->c_dev_if; 137 + addr = (struct sockaddr *)&sin6; 138 + addrlen = sizeof(sin6); 139 + } else { 140 + sin.sin_family = AF_INET; 141 + sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; 142 + sin.sin_port = htons(RDS_TCP_PORT); 143 + addr = (struct sockaddr *)&sin; 144 + addrlen = sizeof(sin); 145 + } 156 146 157 147 /* 158 148 * once we call connect() we can start getting callbacks and they
+51 -13
net/rds/tcp_listen.c
··· 131 131 struct rds_tcp_connection *rs_tcp = NULL; 132 132 int conn_state; 133 133 struct rds_conn_path *cp; 134 + struct in6_addr *my_addr, *peer_addr; 135 + int dev_if; 134 136 135 137 if (!sock) /* module unload or netns delete in progress */ 136 138 return -ENETUNREACH; ··· 165 163 166 164 inet = inet_sk(new_sock->sk); 167 165 166 + my_addr = &new_sock->sk->sk_v6_rcv_saddr; 167 + peer_addr = &new_sock->sk->sk_v6_daddr; 168 168 rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n", 169 - &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport), 170 - &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport)); 169 + my_addr, ntohs(inet->inet_sport), 170 + peer_addr, ntohs(inet->inet_dport)); 171 171 172 + /* sk_bound_dev_if is not set if the peer address is not link local 173 + * address. In this case, it happens that mcast_oif is set. So 174 + * just use it. 175 + */ 176 + if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) && 177 + !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) { 178 + struct ipv6_pinfo *inet6; 179 + 180 + inet6 = inet6_sk(new_sock->sk); 181 + dev_if = inet6->mcast_oif; 182 + } else { 183 + dev_if = new_sock->sk->sk_bound_dev_if; 184 + } 172 185 conn = rds_conn_create(sock_net(sock->sk), 173 186 &new_sock->sk->sk_v6_rcv_saddr, 174 187 &new_sock->sk->sk_v6_daddr, 175 - &rds_tcp_transport, GFP_KERNEL, 176 - new_sock->sk->sk_bound_dev_if); 188 + &rds_tcp_transport, GFP_KERNEL, dev_if); 177 189 178 190 if (IS_ERR(conn)) { 179 191 ret = PTR_ERR(conn); ··· 272 256 ready(sk); 273 257 } 274 258 275 - struct socket *rds_tcp_listen_init(struct net *net) 259 + struct socket *rds_tcp_listen_init(struct net *net, bool isv6) 276 260 { 277 - struct sockaddr_in sin; 278 261 struct socket *sock = NULL; 262 + struct sockaddr_storage ss; 263 + struct sockaddr_in6 *sin6; 264 + struct sockaddr_in *sin; 265 + int addr_len; 279 266 int ret; 280 267 281 - ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 282 - if (ret < 0) 268 + ret = 
sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM, 269 + IPPROTO_TCP, &sock); 270 + if (ret < 0) { 271 + rdsdebug("could not create %s listener socket: %d\n", 272 + isv6 ? "IPv6" : "IPv4", ret); 283 273 goto out; 274 + } 284 275 285 276 sock->sk->sk_reuse = SK_CAN_REUSE; 286 277 rds_tcp_nonagle(sock); ··· 297 274 sock->sk->sk_data_ready = rds_tcp_listen_data_ready; 298 275 write_unlock_bh(&sock->sk->sk_callback_lock); 299 276 300 - sin.sin_family = PF_INET; 301 - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); 302 - sin.sin_port = (__force u16)htons(RDS_TCP_PORT); 277 + if (isv6) { 278 + sin6 = (struct sockaddr_in6 *)&ss; 279 + sin6->sin6_family = PF_INET6; 280 + sin6->sin6_addr = in6addr_any; 281 + sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); 282 + sin6->sin6_scope_id = 0; 283 + sin6->sin6_flowinfo = 0; 284 + addr_len = sizeof(*sin6); 285 + } else { 286 + sin = (struct sockaddr_in *)&ss; 287 + sin->sin_family = PF_INET; 288 + sin->sin_addr.s_addr = INADDR_ANY; 289 + sin->sin_port = (__force u16)htons(RDS_TCP_PORT); 290 + addr_len = sizeof(*sin); 291 + } 303 292 304 - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 305 - if (ret < 0) 293 + ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len); 294 + if (ret < 0) { 295 + rdsdebug("could not bind %s listener socket: %d\n", 296 + isv6 ? "IPv6" : "IPv4", ret); 306 297 goto out; 298 + } 307 299 308 300 ret = sock->ops->listen(sock, 64); 309 301 if (ret < 0)