Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rds: Changing IP address internal representation to struct in6_addr

This patch changes the internal representation of an IP address to use
struct in6_addr. An IPv4 address is stored as an IPv4-mapped IPv6 address.
All functions that take an IP address as an argument are also changed
to use struct in6_addr. However, the RDS socket layer is not modified:
it still does not accept an IPv6 address from an application, and the
RDS layer neither accepts nor initiates IPv6 connections.

v2: Fixed sparse warnings.

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Ka-Cheong Poon and committed by
David S. Miller
eee2fa6a a6c90dd3

+864 -370
+100 -40
net/rds/af_rds.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 35 35 #include <linux/kernel.h> 36 36 #include <linux/gfp.h> 37 37 #include <linux/in.h> 38 + #include <linux/ipv6.h> 38 39 #include <linux/poll.h> 39 40 #include <net/sock.h> 40 41 ··· 114 113 static int rds_getname(struct socket *sock, struct sockaddr *uaddr, 115 114 int peer) 116 115 { 117 - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 118 116 struct rds_sock *rs = rds_sk_to_rs(sock->sk); 119 - 120 - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 117 + struct sockaddr_in6 *sin6; 118 + struct sockaddr_in *sin; 119 + int uaddr_len; 121 120 122 121 /* racey, don't care */ 123 122 if (peer) { 124 - if (!rs->rs_conn_addr) 123 + if (ipv6_addr_any(&rs->rs_conn_addr)) 125 124 return -ENOTCONN; 126 125 127 - sin->sin_port = rs->rs_conn_port; 128 - sin->sin_addr.s_addr = rs->rs_conn_addr; 126 + if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { 127 + sin = (struct sockaddr_in *)uaddr; 128 + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 129 + sin->sin_family = AF_INET; 130 + sin->sin_port = rs->rs_conn_port; 131 + sin->sin_addr.s_addr = rs->rs_conn_addr_v4; 132 + uaddr_len = sizeof(*sin); 133 + } else { 134 + sin6 = (struct sockaddr_in6 *)uaddr; 135 + sin6->sin6_family = AF_INET6; 136 + sin6->sin6_port = rs->rs_conn_port; 137 + sin6->sin6_addr = rs->rs_conn_addr; 138 + sin6->sin6_flowinfo = 0; 139 + /* scope_id is the same as in the bound address. 
*/ 140 + sin6->sin6_scope_id = rs->rs_bound_scope_id; 141 + uaddr_len = sizeof(*sin6); 142 + } 129 143 } else { 130 - sin->sin_port = rs->rs_bound_port; 131 - sin->sin_addr.s_addr = rs->rs_bound_addr; 144 + /* If socket is not yet bound, set the return address family 145 + * to be AF_UNSPEC (value 0) and the address size to be that 146 + * of an IPv4 address. 147 + */ 148 + if (ipv6_addr_any(&rs->rs_bound_addr)) { 149 + sin = (struct sockaddr_in *)uaddr; 150 + memset(sin, 0, sizeof(*sin)); 151 + sin->sin_family = AF_UNSPEC; 152 + return sizeof(*sin); 153 + } 154 + if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { 155 + sin = (struct sockaddr_in *)uaddr; 156 + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 157 + sin->sin_family = AF_INET; 158 + sin->sin_port = rs->rs_bound_port; 159 + sin->sin_addr.s_addr = rs->rs_bound_addr_v4; 160 + uaddr_len = sizeof(*sin); 161 + } else { 162 + sin6 = (struct sockaddr_in6 *)uaddr; 163 + sin6->sin6_family = AF_INET6; 164 + sin6->sin6_port = rs->rs_bound_port; 165 + sin6->sin6_addr = rs->rs_bound_addr; 166 + sin6->sin6_flowinfo = 0; 167 + sin6->sin6_scope_id = rs->rs_bound_scope_id; 168 + uaddr_len = sizeof(*sin6); 169 + } 132 170 } 133 171 134 - sin->sin_family = AF_INET; 135 - 136 - return sizeof(*sin); 172 + return uaddr_len; 137 173 } 138 174 139 175 /* ··· 241 203 static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, 242 204 int len) 243 205 { 206 + struct sockaddr_in6 sin6; 244 207 struct sockaddr_in sin; 245 208 int ret = 0; 246 209 247 210 /* racing with another thread binding seems ok here */ 248 - if (rs->rs_bound_addr == 0) { 211 + if (ipv6_addr_any(&rs->rs_bound_addr)) { 249 212 ret = -ENOTCONN; /* XXX not a great errno */ 250 213 goto out; 251 214 } ··· 254 215 if (len < sizeof(struct sockaddr_in)) { 255 216 ret = -EINVAL; 256 217 goto out; 218 + } else if (len < sizeof(struct sockaddr_in6)) { 219 + /* Assume IPv4 */ 220 + if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) { 221 + ret = 
-EFAULT; 222 + goto out; 223 + } 224 + ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); 225 + sin6.sin6_port = sin.sin_port; 226 + } else { 227 + if (copy_from_user(&sin6, optval, 228 + sizeof(struct sockaddr_in6))) { 229 + ret = -EFAULT; 230 + goto out; 231 + } 257 232 } 258 233 259 - if (copy_from_user(&sin, optval, sizeof(sin))) { 260 - ret = -EFAULT; 261 - goto out; 262 - } 263 - 264 - rds_send_drop_to(rs, &sin); 234 + rds_send_drop_to(rs, &sin6); 265 235 out: 266 236 return ret; 267 237 } ··· 483 435 int addr_len, int flags) 484 436 { 485 437 struct sock *sk = sock->sk; 486 - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 438 + struct sockaddr_in *sin; 487 439 struct rds_sock *rs = rds_sk_to_rs(sk); 488 440 int ret = 0; 489 441 490 442 lock_sock(sk); 491 443 492 - if (addr_len != sizeof(struct sockaddr_in)) { 444 + switch (addr_len) { 445 + case sizeof(struct sockaddr_in): 446 + sin = (struct sockaddr_in *)uaddr; 447 + if (sin->sin_family != AF_INET) { 448 + ret = -EAFNOSUPPORT; 449 + break; 450 + } 451 + if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 452 + ret = -EDESTADDRREQ; 453 + break; 454 + } 455 + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) || 456 + sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) { 457 + ret = -EINVAL; 458 + break; 459 + } 460 + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); 461 + rs->rs_conn_port = sin->sin_port; 462 + break; 463 + 464 + case sizeof(struct sockaddr_in6): 465 + ret = -EPROTONOSUPPORT; 466 + break; 467 + 468 + default: 493 469 ret = -EINVAL; 494 - goto out; 470 + break; 495 471 } 496 472 497 - if (sin->sin_family != AF_INET) { 498 - ret = -EAFNOSUPPORT; 499 - goto out; 500 - } 501 - 502 - if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 503 - ret = -EDESTADDRREQ; 504 - goto out; 505 - } 506 - 507 - rs->rs_conn_addr = sin->sin_addr.s_addr; 508 - rs->rs_conn_port = sin->sin_port; 509 - 510 - out: 511 473 release_sock(sk); 512 474 return ret; 513 475 } ··· 636 578 
list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { 637 579 total++; 638 580 if (total <= len) 639 - rds_inc_info_copy(inc, iter, inc->i_saddr, 640 - rs->rs_bound_addr, 1); 581 + rds_inc_info_copy(inc, iter, 582 + inc->i_saddr.s6_addr32[3], 583 + rs->rs_bound_addr_v4, 584 + 1); 641 585 } 642 586 643 587 read_unlock(&rs->rs_recv_lock); ··· 668 608 list_for_each_entry(rs, &rds_sock_list, rs_item) { 669 609 sinfo.sndbuf = rds_sk_sndbuf(rs); 670 610 sinfo.rcvbuf = rds_sk_rcvbuf(rs); 671 - sinfo.bound_addr = rs->rs_bound_addr; 672 - sinfo.connected_addr = rs->rs_conn_addr; 611 + sinfo.bound_addr = rs->rs_bound_addr_v4; 612 + sinfo.connected_addr = rs->rs_conn_addr_v4; 673 613 sinfo.bound_port = rs->rs_bound_port; 674 614 sinfo.connected_port = rs->rs_conn_port; 675 615 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
+63 -28
net/rds/bind.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 33 33 #include <linux/kernel.h> 34 34 #include <net/sock.h> 35 35 #include <linux/in.h> 36 + #include <linux/ipv6.h> 36 37 #include <linux/if_arp.h> 37 38 #include <linux/jhash.h> 38 39 #include <linux/ratelimit.h> ··· 43 42 44 43 static const struct rhashtable_params ht_parms = { 45 44 .nelem_hint = 768, 46 - .key_len = sizeof(u64), 45 + .key_len = RDS_BOUND_KEY_LEN, 47 46 .key_offset = offsetof(struct rds_sock, rs_bound_key), 48 47 .head_offset = offsetof(struct rds_sock, rs_bound_node), 49 48 .max_size = 16384, 50 49 .min_size = 1024, 51 50 }; 51 + 52 + /* Create a key for the bind hash table manipulation. Port is in network byte 53 + * order. 54 + */ 55 + static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr, 56 + __be16 port, __u32 scope_id) 57 + { 58 + memcpy(key, addr, sizeof(*addr)); 59 + key += sizeof(*addr); 60 + memcpy(key, &port, sizeof(port)); 61 + key += sizeof(port); 62 + memcpy(key, &scope_id, sizeof(scope_id)); 63 + } 52 64 53 65 /* 54 66 * Return the rds_sock bound at the given local address. ··· 69 55 * The rx path can race with rds_release. We notice if rds_release() has 70 56 * marked this socket and don't return a rs ref to the rx path. 
71 57 */ 72 - struct rds_sock *rds_find_bound(__be32 addr, __be16 port) 58 + struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, 59 + __u32 scope_id) 73 60 { 74 - u64 key = ((u64)addr << 32) | port; 61 + u8 key[RDS_BOUND_KEY_LEN]; 75 62 struct rds_sock *rs; 76 63 77 - rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms); 64 + __rds_create_bind_key(key, addr, port, scope_id); 65 + rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms); 78 66 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) 79 67 rds_sock_addref(rs); 80 68 else 81 69 rs = NULL; 82 70 83 - rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, 84 - ntohs(port)); 71 + rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, 72 + ntohs(port)); 85 73 86 74 return rs; 87 75 } 88 76 89 77 /* returns -ve errno or +ve port */ 90 - static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) 78 + static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, 79 + __be16 *port, __u32 scope_id) 91 80 { 92 81 int ret = -EADDRINUSE; 93 82 u16 rover, last; 94 - u64 key; 83 + u8 key[RDS_BOUND_KEY_LEN]; 95 84 96 85 if (*port != 0) { 97 86 rover = be16_to_cpu(*port); ··· 112 95 113 96 if (rover == RDS_FLAG_PROBE_PORT) 114 97 continue; 115 - key = ((u64)addr << 32) | cpu_to_be16(rover); 116 - if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms)) 98 + __rds_create_bind_key(key, addr, cpu_to_be16(rover), 99 + scope_id); 100 + if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms)) 117 101 continue; 118 102 119 - rs->rs_bound_key = key; 120 - rs->rs_bound_addr = addr; 103 + memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key)); 104 + rs->rs_bound_addr = *addr; 121 105 net_get_random_once(&rs->rs_hash_initval, 122 106 sizeof(rs->rs_hash_initval)); 123 107 rs->rs_bound_port = cpu_to_be16(rover); ··· 132 114 rs, &addr, (int)ntohs(*port)); 133 115 break; 134 116 } else { 135 - rs->rs_bound_addr = 0; 117 + rs->rs_bound_addr = in6addr_any; 136 118 
rds_sock_put(rs); 137 119 ret = -ENOMEM; 138 120 break; ··· 145 127 void rds_remove_bound(struct rds_sock *rs) 146 128 { 147 129 148 - if (!rs->rs_bound_addr) 130 + if (ipv6_addr_any(&rs->rs_bound_addr)) 149 131 return; 150 132 151 - rdsdebug("rs %p unbinding from %pI4:%d\n", 133 + rdsdebug("rs %p unbinding from %pI6c:%d\n", 152 134 rs, &rs->rs_bound_addr, 153 135 ntohs(rs->rs_bound_port)); 154 136 155 137 rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); 156 138 rds_sock_put(rs); 157 - rs->rs_bound_addr = 0; 139 + rs->rs_bound_addr = in6addr_any; 158 140 } 159 141 160 142 int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) 161 143 { 162 144 struct sock *sk = sock->sk; 163 - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 164 145 struct rds_sock *rs = rds_sk_to_rs(sk); 146 + struct in6_addr v6addr, *binding_addr; 165 147 struct rds_transport *trans; 148 + __u32 scope_id = 0; 166 149 int ret = 0; 150 + __be16 port; 167 151 152 + /* We only allow an RDS socket to be bound to an IPv4 address. IPv6 153 + * address support will be added later. 154 + */ 155 + if (addr_len == sizeof(struct sockaddr_in)) { 156 + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; 157 + 158 + if (sin->sin_family != AF_INET || 159 + sin->sin_addr.s_addr == htonl(INADDR_ANY)) 160 + return -EINVAL; 161 + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); 162 + binding_addr = &v6addr; 163 + port = sin->sin_port; 164 + } else if (addr_len == sizeof(struct sockaddr_in6)) { 165 + return -EPROTONOSUPPORT; 166 + } else { 167 + return -EINVAL; 168 + } 168 169 lock_sock(sk); 169 170 170 - if (addr_len != sizeof(struct sockaddr_in) || 171 - sin->sin_family != AF_INET || 172 - rs->rs_bound_addr || 173 - sin->sin_addr.s_addr == htonl(INADDR_ANY)) { 171 + /* RDS socket does not allow re-binding. 
*/ 172 + if (!ipv6_addr_any(&rs->rs_bound_addr)) { 174 173 ret = -EINVAL; 175 174 goto out; 176 175 } 177 176 178 - ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); 177 + ret = rds_add_bound(rs, binding_addr, &port, scope_id); 179 178 if (ret) 180 179 goto out; 181 180 182 181 if (rs->rs_transport) { /* previously bound */ 183 182 trans = rs->rs_transport; 184 183 if (trans->laddr_check(sock_net(sock->sk), 185 - sin->sin_addr.s_addr) != 0) { 184 + binding_addr, scope_id) != 0) { 186 185 ret = -ENOPROTOOPT; 187 186 rds_remove_bound(rs); 188 187 } else { ··· 207 172 } 208 173 goto out; 209 174 } 210 - trans = rds_trans_get_preferred(sock_net(sock->sk), 211 - sin->sin_addr.s_addr); 175 + trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr, 176 + scope_id); 212 177 if (!trans) { 213 178 ret = -EADDRNOTAVAIL; 214 179 rds_remove_bound(rs); 215 - pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n", 216 - __func__, &sin->sin_addr.s_addr); 180 + pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n", 181 + __func__, binding_addr); 217 182 goto out; 218 183 } 219 184
+13 -10
net/rds/cong.c
··· 1 1 /* 2 - * Copyright (c) 2007 Oracle. All rights reserved. 2 + * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 101 101 static DEFINE_SPINLOCK(rds_cong_lock); 102 102 static struct rb_root rds_cong_tree = RB_ROOT; 103 103 104 - static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, 104 + static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr, 105 105 struct rds_cong_map *insert) 106 106 { 107 107 struct rb_node **p = &rds_cong_tree.rb_node; ··· 109 109 struct rds_cong_map *map; 110 110 111 111 while (*p) { 112 + int diff; 113 + 112 114 parent = *p; 113 115 map = rb_entry(parent, struct rds_cong_map, m_rb_node); 114 116 115 - if (addr < map->m_addr) 117 + diff = rds_addr_cmp(addr, &map->m_addr); 118 + if (diff < 0) 116 119 p = &(*p)->rb_left; 117 - else if (addr > map->m_addr) 120 + else if (diff > 0) 118 121 p = &(*p)->rb_right; 119 122 else 120 123 return map; ··· 135 132 * these bitmaps in the process getting pointers to them. The bitmaps are only 136 133 * ever freed as the module is removed after all connections have been freed. 
137 134 */ 138 - static struct rds_cong_map *rds_cong_from_addr(__be32 addr) 135 + static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr) 139 136 { 140 137 struct rds_cong_map *map; 141 138 struct rds_cong_map *ret = NULL; ··· 147 144 if (!map) 148 145 return NULL; 149 146 150 - map->m_addr = addr; 147 + map->m_addr = *addr; 151 148 init_waitqueue_head(&map->m_waitq); 152 149 INIT_LIST_HEAD(&map->m_conn_list); 153 150 ··· 174 171 kfree(map); 175 172 } 176 173 177 - rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); 174 + rdsdebug("map %p for addr %pI6c\n", ret, addr); 178 175 179 176 return ret; 180 177 } ··· 205 202 206 203 int rds_cong_get_maps(struct rds_connection *conn) 207 204 { 208 - conn->c_lcong = rds_cong_from_addr(conn->c_laddr); 209 - conn->c_fcong = rds_cong_from_addr(conn->c_faddr); 205 + conn->c_lcong = rds_cong_from_addr(&conn->c_laddr); 206 + conn->c_fcong = rds_cong_from_addr(&conn->c_faddr); 210 207 211 208 if (!(conn->c_lcong && conn->c_fcong)) 212 209 return -ENOMEM; ··· 356 353 357 354 /* update congestion map for now-closed port */ 358 355 spin_lock_irqsave(&rds_cong_lock, flags); 359 - map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); 356 + map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL); 360 357 spin_unlock_irqrestore(&rds_cong_lock, flags); 361 358 362 359 if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
+81 -51
net/rds/connection.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 34 34 #include <linux/list.h> 35 35 #include <linux/slab.h> 36 36 #include <linux/export.h> 37 - #include <net/inet_hashtables.h> 37 + #include <net/ipv6.h> 38 + #include <net/inet6_hashtables.h> 38 39 39 40 #include "rds.h" 40 41 #include "loop.h" ··· 50 49 static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; 51 50 static struct kmem_cache *rds_conn_slab; 52 51 53 - static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) 52 + static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, 53 + const struct in6_addr *faddr) 54 54 { 55 + static u32 rds6_hash_secret __read_mostly; 55 56 static u32 rds_hash_secret __read_mostly; 56 57 57 - unsigned long hash; 58 + u32 lhash, fhash, hash; 58 59 59 60 net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); 61 + net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); 60 62 61 - /* Pass NULL, don't need struct net for hash */ 62 - hash = __inet_ehashfn(be32_to_cpu(laddr), 0, 63 - be32_to_cpu(faddr), 0, 64 - rds_hash_secret); 63 + lhash = (__force u32)laddr->s6_addr32[3]; 64 + fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret); 65 + hash = __inet6_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); 66 + 65 67 return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; 66 68 } 67 69 ··· 76 72 /* rcu read lock must be held or the connection spinlock */ 77 73 static struct rds_connection *rds_conn_lookup(struct net *net, 78 74 struct hlist_head *head, 79 - __be32 laddr, __be32 faddr, 80 - struct rds_transport *trans) 75 + const struct in6_addr *laddr, 76 + const struct in6_addr *faddr, 77 + struct rds_transport *trans, 78 + int dev_if) 81 79 { 82 80 struct rds_connection *conn, *ret = 
NULL; 83 81 84 82 hlist_for_each_entry_rcu(conn, head, c_hash_node) { 85 - if (conn->c_faddr == faddr && conn->c_laddr == laddr && 86 - conn->c_trans == trans && net == rds_conn_net(conn)) { 83 + if (ipv6_addr_equal(&conn->c_faddr, faddr) && 84 + ipv6_addr_equal(&conn->c_laddr, laddr) && 85 + conn->c_trans == trans && 86 + net == rds_conn_net(conn) && 87 + conn->c_dev_if == dev_if) { 87 88 ret = conn; 88 89 break; 89 90 } 90 91 } 91 - rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, 92 - &laddr, &faddr); 92 + rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret, 93 + laddr, faddr); 93 94 return ret; 94 95 } 95 96 ··· 108 99 { 109 100 struct rds_connection *conn = cp->cp_conn; 110 101 111 - rdsdebug("connection %pI4 to %pI4 reset\n", 112 - &conn->c_laddr, &conn->c_faddr); 102 + rdsdebug("connection %pI6c to %pI6c reset\n", 103 + &conn->c_laddr, &conn->c_faddr); 113 104 114 105 rds_stats_inc(s_conn_reset); 115 106 rds_send_path_reset(cp); ··· 151 142 * are torn down as the module is removed, if ever. 152 143 */ 153 144 static struct rds_connection *__rds_conn_create(struct net *net, 154 - __be32 laddr, __be32 faddr, 155 - struct rds_transport *trans, gfp_t gfp, 156 - int is_outgoing) 145 + const struct in6_addr *laddr, 146 + const struct in6_addr *faddr, 147 + struct rds_transport *trans, 148 + gfp_t gfp, 149 + int is_outgoing, 150 + int dev_if) 157 151 { 158 152 struct rds_connection *conn, *parent = NULL; 159 153 struct hlist_head *head = rds_conn_bucket(laddr, faddr); ··· 166 154 int npaths = (trans->t_mp_capable ? 
RDS_MPATH_WORKERS : 1); 167 155 168 156 rcu_read_lock(); 169 - conn = rds_conn_lookup(net, head, laddr, faddr, trans); 170 - if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && 171 - laddr == faddr && !is_outgoing) { 157 + conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if); 158 + if (conn && 159 + conn->c_loopback && 160 + conn->c_trans != &rds_loop_transport && 161 + ipv6_addr_equal(laddr, faddr) && 162 + !is_outgoing) { 172 163 /* This is a looped back IB connection, and we're 173 164 * called by the code handling the incoming connect. 174 165 * We need a second connection object into which we ··· 196 181 } 197 182 198 183 INIT_HLIST_NODE(&conn->c_hash_node); 199 - conn->c_laddr = laddr; 200 - conn->c_faddr = faddr; 184 + conn->c_laddr = *laddr; 185 + conn->c_isv6 = !ipv6_addr_v4mapped(laddr); 186 + conn->c_faddr = *faddr; 187 + conn->c_dev_if = dev_if; 201 188 202 189 rds_conn_net_set(conn, net); 203 190 ··· 216 199 * can bind to the destination address then we'd rather the messages 217 200 * flow through loopback rather than either transport. 218 201 */ 219 - loop_trans = rds_trans_get_preferred(net, faddr); 202 + loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if); 220 203 if (loop_trans) { 221 204 rds_trans_put(loop_trans); 222 205 conn->c_loopback = 1; ··· 250 233 goto out; 251 234 } 252 235 253 - rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", 254 - conn, &laddr, &faddr, 255 - strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name : 256 - "[unknown]", is_outgoing ? "(outgoing)" : ""); 236 + rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n", 237 + conn, laddr, faddr, 238 + strnlen(trans->t_name, sizeof(trans->t_name)) ? 239 + trans->t_name : "[unknown]", is_outgoing ? 
"(outgoing)" : ""); 257 240 258 241 /* 259 242 * Since we ran without holding the conn lock, someone could ··· 279 262 /* Creating normal conn */ 280 263 struct rds_connection *found; 281 264 282 - found = rds_conn_lookup(net, head, laddr, faddr, trans); 265 + found = rds_conn_lookup(net, head, laddr, faddr, trans, 266 + dev_if); 283 267 if (found) { 284 268 struct rds_conn_path *cp; 285 269 int i; ··· 313 295 } 314 296 315 297 struct rds_connection *rds_conn_create(struct net *net, 316 - __be32 laddr, __be32 faddr, 317 - struct rds_transport *trans, gfp_t gfp) 298 + const struct in6_addr *laddr, 299 + const struct in6_addr *faddr, 300 + struct rds_transport *trans, gfp_t gfp, 301 + int dev_if) 318 302 { 319 - return __rds_conn_create(net, laddr, faddr, trans, gfp, 0); 303 + return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if); 320 304 } 321 305 EXPORT_SYMBOL_GPL(rds_conn_create); 322 306 323 307 struct rds_connection *rds_conn_create_outgoing(struct net *net, 324 - __be32 laddr, __be32 faddr, 325 - struct rds_transport *trans, gfp_t gfp) 308 + const struct in6_addr *laddr, 309 + const struct in6_addr *faddr, 310 + struct rds_transport *trans, 311 + gfp_t gfp, int dev_if) 326 312 { 327 - return __rds_conn_create(net, laddr, faddr, trans, gfp, 1); 313 + return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if); 328 314 } 329 315 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); 330 316 ··· 524 502 525 503 /* XXX too lazy to maintain counts.. 
*/ 526 504 list_for_each_entry(rm, list, m_conn_item) { 505 + __be32 laddr; 506 + __be32 faddr; 507 + 527 508 total++; 509 + laddr = conn->c_laddr.s6_addr32[3]; 510 + faddr = conn->c_faddr.s6_addr32[3]; 528 511 if (total <= len) 529 512 rds_inc_info_copy(&rm->m_inc, 530 513 iter, 531 - conn->c_laddr, 532 - conn->c_faddr, 514 + laddr, 515 + faddr, 533 516 0); 534 517 } 535 518 ··· 611 584 struct hlist_head *head; 612 585 struct rds_connection *conn; 613 586 size_t i; 614 - int j; 615 587 616 588 rcu_read_lock(); 617 589 ··· 621 595 i++, head++) { 622 596 hlist_for_each_entry_rcu(conn, head, c_hash_node) { 623 597 struct rds_conn_path *cp; 624 - int npaths; 625 598 626 - npaths = (conn->c_trans->t_mp_capable ? 627 - RDS_MPATH_WORKERS : 1); 628 - for (j = 0; j < npaths; j++) { 629 - cp = &conn->c_path[j]; 599 + /* XXX We only copy the information from the first 600 + * path for now. The problem is that if there are 601 + * more than one underlying paths, we cannot report 602 + * information of all of them using the existing 603 + * API. For example, there is only one next_tx_seq, 604 + * which path's next_tx_seq should we report? It is 605 + * a bug in the design of MPRDS. 606 + */ 607 + cp = conn->c_path; 630 608 631 - /* XXX no cp_lock usage.. */ 632 - if (!visitor(cp, buffer)) 633 - continue; 634 - } 609 + /* XXX no cp_lock usage.. 
*/ 610 + if (!visitor(cp, buffer)) 611 + continue; 635 612 636 613 /* We copy as much as we can fit in the buffer, 637 614 * but we count all items so that the caller ··· 653 624 static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) 654 625 { 655 626 struct rds_info_connection *cinfo = buffer; 627 + struct rds_connection *conn = cp->cp_conn; 656 628 657 629 cinfo->next_tx_seq = cp->cp_next_tx_seq; 658 630 cinfo->next_rx_seq = cp->cp_next_rx_seq; 659 - cinfo->laddr = cp->cp_conn->c_laddr; 660 - cinfo->faddr = cp->cp_conn->c_faddr; 661 - strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name, 631 + cinfo->laddr = conn->c_laddr.s6_addr32[3]; 632 + cinfo->faddr = conn->c_faddr.s6_addr32[3]; 633 + strncpy(cinfo->transport, conn->c_trans->t_name, 662 634 sizeof(cinfo->transport)); 663 635 cinfo->flags = 0; 664 636
+9 -8
net/rds/ib.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 296 296 if (conn->c_trans != &rds_ib_transport) 297 297 return 0; 298 298 299 - iinfo->src_addr = conn->c_laddr; 300 - iinfo->dst_addr = conn->c_faddr; 299 + iinfo->src_addr = conn->c_laddr.s6_addr32[3]; 300 + iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; 301 301 302 302 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); 303 303 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); ··· 341 341 * allowed to influence which paths have priority. We could call userspace 342 342 * asserting this policy "routing". 343 343 */ 344 - static int rds_ib_laddr_check(struct net *net, __be32 addr) 344 + static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, 345 + __u32 scope_id) 345 346 { 346 347 int ret; 347 348 struct rdma_cm_id *cm_id; ··· 358 357 359 358 memset(&sin, 0, sizeof(sin)); 360 359 sin.sin_family = AF_INET; 361 - sin.sin_addr.s_addr = addr; 360 + sin.sin_addr.s_addr = addr->s6_addr32[3]; 362 361 363 362 /* rdma_bind_addr will only succeed for IB & iWARP devices */ 364 363 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); ··· 368 367 cm_id->device->node_type != RDMA_NODE_IB_CA) 369 368 ret = -EADDRNOTAVAIL; 370 369 371 - rdsdebug("addr %pI4 ret %d node type %d\n", 372 - &addr, ret, 373 - cm_id->device ? cm_id->device->node_type : -1); 370 + rdsdebug("addr %pI6c ret %d node type %d\n", 371 + addr, ret, 372 + cm_id->device ? cm_id->device->node_type : -1); 374 373 375 374 rdma_destroy_id(cm_id); 376 375
+40 -11
net/rds/ib.h
··· 57 57 struct list_head *ready; 58 58 }; 59 59 60 + /* This is the common structure for the IB private data exchange in setting up 61 + * an RDS connection. The exchange is different for IPv4 and IPv6 connections. 62 + * The reason is that the address size is different and the addresses 63 + * exchanged are in the beginning of the structure. Hence it is not possible 64 + * for interoperability if same structure is used. 65 + */ 66 + struct rds_ib_conn_priv_cmn { 67 + u8 ricpc_protocol_major; 68 + u8 ricpc_protocol_minor; 69 + __be16 ricpc_protocol_minor_mask; /* bitmask */ 70 + __be32 ricpc_reserved1; 71 + __be64 ricpc_ack_seq; 72 + __be32 ricpc_credit; /* non-zero enables flow ctl */ 73 + }; 74 + 60 75 struct rds_ib_connect_private { 61 76 /* Add new fields at the end, and don't permute existing fields. */ 62 - __be32 dp_saddr; 63 - __be32 dp_daddr; 64 - u8 dp_protocol_major; 65 - u8 dp_protocol_minor; 66 - __be16 dp_protocol_minor_mask; /* bitmask */ 67 - __be32 dp_reserved1; 68 - __be64 dp_ack_seq; 69 - __be32 dp_credit; /* non-zero enables flow ctl */ 77 + __be32 dp_saddr; 78 + __be32 dp_daddr; 79 + struct rds_ib_conn_priv_cmn dp_cmn; 80 + }; 81 + 82 + struct rds6_ib_connect_private { 83 + /* Add new fields at the end, and don't permute existing fields. 
*/ 84 + struct in6_addr dp_saddr; 85 + struct in6_addr dp_daddr; 86 + struct rds_ib_conn_priv_cmn dp_cmn; 87 + }; 88 + 89 + #define dp_protocol_major dp_cmn.ricpc_protocol_major 90 + #define dp_protocol_minor dp_cmn.ricpc_protocol_minor 91 + #define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask 92 + #define dp_ack_seq dp_cmn.ricpc_ack_seq 93 + #define dp_credit dp_cmn.ricpc_credit 94 + 95 + union rds_ib_conn_priv { 96 + struct rds_ib_connect_private ricp_v4; 97 + struct rds6_ib_connect_private ricp_v6; 70 98 }; 71 99 72 100 struct rds_ib_send_work { ··· 379 351 __printf(2, 3) 380 352 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); 381 353 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 382 - struct rdma_cm_event *event); 383 - int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); 354 + struct rdma_cm_event *event, bool isv6); 355 + int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6); 384 356 void rds_ib_cm_connect_complete(struct rds_connection *conn, 385 357 struct rdma_cm_event *event); 386 358 ··· 389 361 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) 390 362 391 363 /* ib_rdma.c */ 392 - int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); 364 + int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, 365 + struct in6_addr *ipaddr); 393 366 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 394 367 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); 395 368 void rds_ib_destroy_nodev_conns(void);
+221 -78
net/rds/ib_cm.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 35 35 #include <linux/slab.h> 36 36 #include <linux/vmalloc.h> 37 37 #include <linux/ratelimit.h> 38 + #include <net/addrconf.h> 38 39 39 40 #include "rds_single_path.h" 40 41 #include "rds.h" ··· 96 95 */ 97 96 void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) 98 97 { 99 - const struct rds_ib_connect_private *dp = NULL; 100 98 struct rds_ib_connection *ic = conn->c_transport_data; 99 + const union rds_ib_conn_priv *dp = NULL; 101 100 struct ib_qp_attr qp_attr; 101 + __be64 ack_seq = 0; 102 + __be32 credit = 0; 103 + u8 major = 0; 104 + u8 minor = 0; 102 105 int err; 103 106 104 - if (event->param.conn.private_data_len >= sizeof(*dp)) { 105 - dp = event->param.conn.private_data; 106 - 107 - /* make sure it isn't empty data */ 108 - if (dp->dp_protocol_major) { 109 - rds_ib_set_protocol(conn, 110 - RDS_PROTOCOL(dp->dp_protocol_major, 111 - dp->dp_protocol_minor)); 112 - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 107 + dp = event->param.conn.private_data; 108 + if (conn->c_isv6) { 109 + if (event->param.conn.private_data_len >= 110 + sizeof(struct rds6_ib_connect_private)) { 111 + major = dp->ricp_v6.dp_protocol_major; 112 + minor = dp->ricp_v6.dp_protocol_minor; 113 + credit = dp->ricp_v6.dp_credit; 114 + /* dp structure start is not guaranteed to be 8 bytes 115 + * aligned. Since dp_ack_seq is 64-bit extended load 116 + * operations can be used so go through get_unaligned 117 + * to avoid unaligned errors. 
118 + */ 119 + ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq); 113 120 } 121 + } else if (event->param.conn.private_data_len >= 122 + sizeof(struct rds_ib_connect_private)) { 123 + major = dp->ricp_v4.dp_protocol_major; 124 + minor = dp->ricp_v4.dp_protocol_minor; 125 + credit = dp->ricp_v4.dp_credit; 126 + ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq); 127 + } 128 + 129 + /* make sure it isn't empty data */ 130 + if (major) { 131 + rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor)); 132 + rds_ib_set_flow_control(conn, be32_to_cpu(credit)); 114 133 } 115 134 116 135 if (conn->c_version < RDS_PROTOCOL(3, 1)) { 117 - pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n", 136 + pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n", 118 137 &conn->c_laddr, &conn->c_faddr, 119 138 RDS_PROTOCOL_MAJOR(conn->c_version), 120 139 RDS_PROTOCOL_MINOR(conn->c_version)); ··· 142 121 rds_conn_destroy(conn); 143 122 return; 144 123 } else { 145 - pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n", 124 + pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n", 146 125 ic->i_active_side ? "Active" : "Passive", 147 126 &conn->c_laddr, &conn->c_faddr, 148 127 RDS_PROTOCOL_MAJOR(conn->c_version), ··· 171 150 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); 172 151 173 152 /* update ib_device with this local ipaddr */ 174 - err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); 153 + err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr); 175 154 if (err) 176 155 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", 177 156 err); ··· 179 158 /* If the peer gave us the last packet it saw, process this as if 180 159 * we had received a regular ACK. */ 181 160 if (dp) { 182 - /* dp structure start is not guaranteed to be 8 bytes aligned. 183 - * Since dp_ack_seq is 64-bit extended load operations can be 184 - * used so go through get_unaligned to avoid unaligned errors. 
185 - */ 186 - __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); 187 - 188 - if (dp_ack_seq) 189 - rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq), 161 + if (ack_seq) 162 + rds_send_drop_acked(conn, be64_to_cpu(ack_seq), 190 163 NULL); 191 164 } 192 165 ··· 188 173 } 189 174 190 175 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, 191 - struct rdma_conn_param *conn_param, 192 - struct rds_ib_connect_private *dp, 193 - u32 protocol_version, 194 - u32 max_responder_resources, 195 - u32 max_initiator_depth) 176 + struct rdma_conn_param *conn_param, 177 + union rds_ib_conn_priv *dp, 178 + u32 protocol_version, 179 + u32 max_responder_resources, 180 + u32 max_initiator_depth, 181 + bool isv6) 196 182 { 197 183 struct rds_ib_connection *ic = conn->c_transport_data; 198 184 struct rds_ib_device *rds_ibdev = ic->rds_ibdev; ··· 209 193 210 194 if (dp) { 211 195 memset(dp, 0, sizeof(*dp)); 212 - dp->dp_saddr = conn->c_laddr; 213 - dp->dp_daddr = conn->c_faddr; 214 - dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version); 215 - dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); 216 - dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); 217 - dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); 196 + if (isv6) { 197 + dp->ricp_v6.dp_saddr = conn->c_laddr; 198 + dp->ricp_v6.dp_daddr = conn->c_faddr; 199 + dp->ricp_v6.dp_protocol_major = 200 + RDS_PROTOCOL_MAJOR(protocol_version); 201 + dp->ricp_v6.dp_protocol_minor = 202 + RDS_PROTOCOL_MINOR(protocol_version); 203 + dp->ricp_v6.dp_protocol_minor_mask = 204 + cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); 205 + dp->ricp_v6.dp_ack_seq = 206 + cpu_to_be64(rds_ib_piggyb_ack(ic)); 207 + 208 + conn_param->private_data = &dp->ricp_v6; 209 + conn_param->private_data_len = sizeof(dp->ricp_v6); 210 + } else { 211 + dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3]; 212 + dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3]; 213 + dp->ricp_v4.dp_protocol_major = 214 + 
RDS_PROTOCOL_MAJOR(protocol_version); 215 + dp->ricp_v4.dp_protocol_minor = 216 + RDS_PROTOCOL_MINOR(protocol_version); 217 + dp->ricp_v4.dp_protocol_minor_mask = 218 + cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); 219 + dp->ricp_v4.dp_ack_seq = 220 + cpu_to_be64(rds_ib_piggyb_ack(ic)); 221 + 222 + conn_param->private_data = &dp->ricp_v4; 223 + conn_param->private_data_len = sizeof(dp->ricp_v4); 224 + } 218 225 219 226 /* Advertise flow control */ 220 227 if (ic->i_flowctl) { 221 228 unsigned int credits; 222 229 223 - credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); 224 - dp->dp_credit = cpu_to_be32(credits); 225 - atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); 230 + credits = IB_GET_POST_CREDITS 231 + (atomic_read(&ic->i_credits)); 232 + if (isv6) 233 + dp->ricp_v6.dp_credit = cpu_to_be32(credits); 234 + else 235 + dp->ricp_v4.dp_credit = cpu_to_be32(credits); 236 + atomic_sub(IB_SET_POST_CREDITS(credits), 237 + &ic->i_credits); 226 238 } 227 - 228 - conn_param->private_data = dp; 229 - conn_param->private_data_len = sizeof(*dp); 230 239 } 231 240 } 232 241 ··· 390 349 break; 391 350 default: 392 351 rdsdebug("Fatal QP Event %u (%s) " 393 - "- connection %pI4->%pI4, reconnecting\n", 352 + "- connection %pI6c->%pI6c, reconnecting\n", 394 353 event->event, ib_event_msg(event->event), 395 354 &conn->c_laddr, &conn->c_faddr); 396 355 rds_conn_drop(conn); ··· 621 580 return ret; 622 581 } 623 582 624 - static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) 583 + static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) 625 584 { 626 - const struct rds_ib_connect_private *dp = event->param.conn.private_data; 627 - u16 common; 585 + const union rds_ib_conn_priv *dp = event->param.conn.private_data; 586 + u8 data_len, major, minor; 628 587 u32 version = 0; 588 + __be16 mask; 589 + u16 common; 629 590 630 591 /* 631 592 * rdma_cm private data is odd - when there is any private data in the ··· 646 603 return 0; 647 604 } 648 
605 606 + if (isv6) { 607 + data_len = sizeof(struct rds6_ib_connect_private); 608 + major = dp->ricp_v6.dp_protocol_major; 609 + minor = dp->ricp_v6.dp_protocol_minor; 610 + mask = dp->ricp_v6.dp_protocol_minor_mask; 611 + } else { 612 + data_len = sizeof(struct rds_ib_connect_private); 613 + major = dp->ricp_v4.dp_protocol_major; 614 + minor = dp->ricp_v4.dp_protocol_minor; 615 + mask = dp->ricp_v4.dp_protocol_minor_mask; 616 + } 617 + 649 618 /* Even if len is crap *now* I still want to check it. -ASG */ 650 - if (event->param.conn.private_data_len < sizeof (*dp) || 651 - dp->dp_protocol_major == 0) 619 + if (event->param.conn.private_data_len < data_len || major == 0) 652 620 return RDS_PROTOCOL_3_0; 653 621 654 - common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; 655 - if (dp->dp_protocol_major == 3 && common) { 622 + common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS; 623 + if (major == 3 && common) { 656 624 version = RDS_PROTOCOL_3_0; 657 625 while ((common >>= 1) != 0) 658 626 version++; 659 - } else 660 - printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", 661 - &dp->dp_saddr, 662 - dp->dp_protocol_major, 663 - dp->dp_protocol_minor); 627 + } else { 628 + if (isv6) 629 + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n", 630 + &dp->ricp_v6.dp_saddr, major, minor); 631 + else 632 + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", 633 + &dp->ricp_v4.dp_saddr, major, minor); 634 + } 664 635 return version; 665 636 } 666 637 638 + /* Given an IPv6 address, find the IB net_device which hosts that address and 639 + * return its index. This is used by the rds_ib_cm_handle_connect() code to 640 + * find the interface index of where an incoming request comes from when 641 + * the request is using a link local address. 642 + * 643 + * Note one problem in this search. 
It is possible that two interfaces have 644 + * the same link local address. Unfortunately, this cannot be solved unless 645 + * the underlying layer gives us the interface which an incoming RDMA connect 646 + * request comes from. 647 + */ 648 + static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr) 649 + { 650 + struct net_device *dev; 651 + int idx = 0; 652 + 653 + rcu_read_lock(); 654 + for_each_netdev_rcu(net, dev) { 655 + if (dev->type == ARPHRD_INFINIBAND && 656 + ipv6_chk_addr(net, addr, dev, 0)) { 657 + idx = dev->ifindex; 658 + break; 659 + } 660 + } 661 + rcu_read_unlock(); 662 + 663 + return idx; 664 + } 665 + 667 666 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, 668 - struct rdma_cm_event *event) 667 + struct rdma_cm_event *event, bool isv6) 669 668 { 670 669 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; 671 670 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; 672 - const struct rds_ib_connect_private *dp = event->param.conn.private_data; 673 - struct rds_ib_connect_private dp_rep; 671 + const struct rds_ib_conn_priv_cmn *dp_cmn; 674 672 struct rds_connection *conn = NULL; 675 673 struct rds_ib_connection *ic = NULL; 676 674 struct rdma_conn_param conn_param; 675 + const union rds_ib_conn_priv *dp; 676 + union rds_ib_conn_priv dp_rep; 677 + struct in6_addr s_mapped_addr; 678 + struct in6_addr d_mapped_addr; 679 + const struct in6_addr *saddr6; 680 + const struct in6_addr *daddr6; 681 + int destroy = 1; 682 + u32 ifindex = 0; 677 683 u32 version; 678 - int err = 1, destroy = 1; 684 + int err = 1; 679 685 680 686 /* Check whether the remote protocol version matches ours. 
*/ 681 - version = rds_ib_protocol_compatible(event); 687 + version = rds_ib_protocol_compatible(event, isv6); 682 688 if (!version) 683 689 goto out; 684 690 685 - rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid " 686 - "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr, 691 + dp = event->param.conn.private_data; 692 + if (isv6) { 693 + dp_cmn = &dp->ricp_v6.dp_cmn; 694 + saddr6 = &dp->ricp_v6.dp_saddr; 695 + daddr6 = &dp->ricp_v6.dp_daddr; 696 + /* If the local address is link local, need to find the 697 + * interface index in order to create a proper RDS 698 + * connection. 699 + */ 700 + if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) { 701 + /* Using init_net for now .. */ 702 + ifindex = __rds_find_ifindex(&init_net, daddr6); 703 + /* No index found... Need to bail out. */ 704 + if (ifindex == 0) { 705 + err = -EOPNOTSUPP; 706 + goto out; 707 + } 708 + } 709 + } else { 710 + dp_cmn = &dp->ricp_v4.dp_cmn; 711 + ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr); 712 + ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr); 713 + saddr6 = &s_mapped_addr; 714 + daddr6 = &d_mapped_addr; 715 + } 716 + 717 + rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid " 718 + "0x%llx\n", saddr6, daddr6, 687 719 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), 688 720 (unsigned long long)be64_to_cpu(lguid), 689 721 (unsigned long long)be64_to_cpu(fguid)); 690 722 691 723 /* RDS/IB is not currently netns aware, thus init_net */ 692 - conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, 693 - &rds_ib_transport, GFP_KERNEL); 724 + conn = rds_conn_create(&init_net, daddr6, saddr6, 725 + &rds_ib_transport, GFP_KERNEL, ifindex); 694 726 if (IS_ERR(conn)) { 695 727 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); 696 728 conn = NULL; ··· 796 678 ic = conn->c_transport_data; 797 679 798 680 rds_ib_set_protocol(conn, version); 799 - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); 681 + 
rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit)); 800 682 801 683 /* If the peer gave us the last packet it saw, process this as if 802 684 * we had received a regular ACK. */ 803 - if (dp->dp_ack_seq) 804 - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); 685 + if (dp_cmn->ricpc_ack_seq) 686 + rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq), 687 + NULL); 805 688 806 689 BUG_ON(cm_id->context); 807 690 BUG_ON(ic->i_cm_id); ··· 821 702 } 822 703 823 704 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, 824 - event->param.conn.responder_resources, 825 - event->param.conn.initiator_depth); 705 + event->param.conn.responder_resources, 706 + event->param.conn.initiator_depth, isv6); 826 707 827 708 /* rdma_accept() calls rdma_reject() internally if it fails */ 828 709 if (rdma_accept(cm_id, &conn_param)) ··· 837 718 } 838 719 839 720 840 - int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) 721 + int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6) 841 722 { 842 723 struct rds_connection *conn = cm_id->context; 843 724 struct rds_ib_connection *ic = conn->c_transport_data; 844 725 struct rdma_conn_param conn_param; 845 - struct rds_ib_connect_private dp; 726 + union rds_ib_conn_priv dp; 846 727 int ret; 847 728 848 729 /* If the peer doesn't do protocol negotiation, we must ··· 857 738 } 858 739 859 740 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, 860 - UINT_MAX, UINT_MAX); 741 + UINT_MAX, UINT_MAX, isv6); 861 742 ret = rdma_connect(cm_id, &conn_param); 862 743 if (ret) 863 744 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); ··· 877 758 int rds_ib_conn_path_connect(struct rds_conn_path *cp) 878 759 { 879 760 struct rds_connection *conn = cp->cp_conn; 880 - struct rds_ib_connection *ic = conn->c_transport_data; 881 - struct sockaddr_in src, dest; 761 + struct sockaddr_storage src, dest; 762 + rdma_cm_event_handler handler; 763 + struct rds_ib_connection *ic; 
882 764 int ret; 765 + 766 + ic = conn->c_transport_data; 883 767 884 768 /* XXX I wonder what affect the port space has */ 885 769 /* delegate cm event handler to rdma_transport */ 886 - ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn, 770 + handler = rds_rdma_cm_event_handler; 771 + ic->i_cm_id = rdma_create_id(&init_net, handler, conn, 887 772 RDMA_PS_TCP, IB_QPT_RC); 888 773 if (IS_ERR(ic->i_cm_id)) { 889 774 ret = PTR_ERR(ic->i_cm_id); ··· 898 775 899 776 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn); 900 777 901 - src.sin_family = AF_INET; 902 - src.sin_addr.s_addr = (__force u32)conn->c_laddr; 903 - src.sin_port = (__force u16)htons(0); 778 + if (ipv6_addr_v4mapped(&conn->c_faddr)) { 779 + struct sockaddr_in *sin; 904 780 905 - dest.sin_family = AF_INET; 906 - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; 907 - dest.sin_port = (__force u16)htons(RDS_PORT); 781 + sin = (struct sockaddr_in *)&src; 782 + sin->sin_family = AF_INET; 783 + sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; 784 + sin->sin_port = 0; 785 + 786 + sin = (struct sockaddr_in *)&dest; 787 + sin->sin_family = AF_INET; 788 + sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; 789 + sin->sin_port = htons(RDS_PORT); 790 + } else { 791 + struct sockaddr_in6 *sin6; 792 + 793 + sin6 = (struct sockaddr_in6 *)&src; 794 + sin6->sin6_family = AF_INET6; 795 + sin6->sin6_addr = conn->c_laddr; 796 + sin6->sin6_port = 0; 797 + sin6->sin6_scope_id = conn->c_dev_if; 798 + 799 + sin6 = (struct sockaddr_in6 *)&dest; 800 + sin6->sin6_family = AF_INET6; 801 + sin6->sin6_addr = conn->c_faddr; 802 + sin6->sin6_port = htons(RDS_CM_PORT); 803 + sin6->sin6_scope_id = conn->c_dev_if; 804 + } 908 805 909 806 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, 910 807 (struct sockaddr *)&dest,
+8 -7
net/rds/ib_rdma.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 100 100 kfree_rcu(to_free, rcu); 101 101 } 102 102 103 - int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) 103 + int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, 104 + struct in6_addr *ipaddr) 104 105 { 105 106 struct rds_ib_device *rds_ibdev_old; 106 107 107 - rds_ibdev_old = rds_ib_get_device(ipaddr); 108 + rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]); 108 109 if (!rds_ibdev_old) 109 - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); 110 + return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); 110 111 111 112 if (rds_ibdev_old != rds_ibdev) { 112 - rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); 113 + rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]); 113 114 rds_ib_dev_put(rds_ibdev_old); 114 - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); 115 + return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); 115 116 } 116 117 rds_ib_dev_put(rds_ibdev_old); 117 118 ··· 545 544 struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; 546 545 int ret; 547 546 548 - rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); 547 + rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]); 549 548 if (!rds_ibdev) { 550 549 ret = -ENODEV; 551 550 goto out;
+9 -9
net/rds/ib_recv.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 266 266 rds_ib_stats_inc(s_ib_rx_total_incs); 267 267 } 268 268 INIT_LIST_HEAD(&ibinc->ii_frags); 269 - rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); 269 + rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr); 270 270 271 271 return ibinc; 272 272 } ··· 418 418 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); 419 419 if (ret) { 420 420 rds_ib_conn_error(conn, "recv post on " 421 - "%pI4 returned %d, disconnecting and " 421 + "%pI6c returned %d, disconnecting and " 422 422 "reconnecting\n", &conn->c_faddr, 423 423 ret); 424 424 break; ··· 848 848 849 849 if (data_len < sizeof(struct rds_header)) { 850 850 rds_ib_conn_error(conn, "incoming message " 851 - "from %pI4 didn't include a " 851 + "from %pI6c didn't include a " 852 852 "header, disconnecting and " 853 853 "reconnecting\n", 854 854 &conn->c_faddr); ··· 861 861 /* Validate the checksum. 
*/ 862 862 if (!rds_message_verify_checksum(ihdr)) { 863 863 rds_ib_conn_error(conn, "incoming message " 864 - "from %pI4 has corrupted header - " 864 + "from %pI6c has corrupted header - " 865 865 "forcing a reconnect\n", 866 866 &conn->c_faddr); 867 867 rds_stats_inc(s_recv_drop_bad_checksum); ··· 941 941 ic->i_recv_data_rem = 0; 942 942 ic->i_ibinc = NULL; 943 943 944 - if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 944 + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) { 945 945 rds_ib_cong_recv(conn, ibinc); 946 - else { 947 - rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, 946 + } else { 947 + rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr, 948 948 &ibinc->ii_inc, GFP_ATOMIC); 949 949 state->ack_next = be64_to_cpu(hdr->h_sequence); 950 950 state->ack_next_valid = 1; ··· 988 988 } else { 989 989 /* We expect errors as the qp is drained during shutdown */ 990 990 if (rds_conn_up(conn) || rds_conn_connecting(conn)) 991 - rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", 991 + rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", 992 992 &conn->c_laddr, &conn->c_faddr, 993 993 wc->status, 994 994 ib_wc_status_msg(wc->status));
+5 -5
net/rds/ib_send.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 305 305 306 306 /* We expect errors as the qp is drained during shutdown */ 307 307 if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { 308 - rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n", 308 + rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n", 309 309 &conn->c_laddr, &conn->c_faddr, wc->status, 310 310 ib_wc_status_msg(wc->status)); 311 311 } ··· 730 730 first, &first->s_wr, ret, failed_wr); 731 731 BUG_ON(failed_wr != &first->s_wr); 732 732 if (ret) { 733 - printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " 733 + printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c " 734 734 "returned %d\n", &conn->c_faddr, ret); 735 735 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 736 736 rds_ib_sub_signaled(ic, nr_sig); ··· 827 827 send, &send->s_atomic_wr, ret, failed_wr); 828 828 BUG_ON(failed_wr != &send->s_atomic_wr.wr); 829 829 if (ret) { 830 - printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " 830 + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c " 831 831 "returned %d\n", &conn->c_faddr, ret); 832 832 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 833 833 rds_ib_sub_signaled(ic, nr_sig); ··· 967 967 first, &first->s_rdma_wr.wr, ret, failed_wr); 968 968 BUG_ON(failed_wr != &first->s_rdma_wr.wr); 969 969 if (ret) { 970 - printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " 970 + printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c " 971 971 "returned %d\n", &conn->c_faddr, ret); 972 972 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); 973 973 rds_ib_sub_signaled(ic, nr_sig);
+4 -3
net/rds/loop.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 35 35 #include <linux/in.h> 36 36 #include <net/net_namespace.h> 37 37 #include <net/netns/generic.h> 38 + #include <linux/ipv6.h> 38 39 39 40 #include "rds_single_path.h" 40 41 #include "rds.h" ··· 89 88 90 89 BUG_ON(hdr_off || sg || off); 91 90 92 - rds_inc_init(&rm->m_inc, conn, conn->c_laddr); 91 + rds_inc_init(&rm->m_inc, conn, &conn->c_laddr); 93 92 /* For the embedded inc. Matching put is in loop_inc_free() */ 94 93 rds_message_addref(rm); 95 94 96 - rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, 95 + rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc, 97 96 GFP_KERNEL); 98 97 99 98 rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
+3 -3
net/rds/rdma.c
··· 1 1 /* 2 - * Copyright (c) 2007 Oracle. All rights reserved. 2 + * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 183 183 long i; 184 184 int ret; 185 185 186 - if (rs->rs_bound_addr == 0 || !rs->rs_transport) { 186 + if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) { 187 187 ret = -ENOTCONN; /* XXX not a great errno */ 188 188 goto out; 189 189 } ··· 574 574 575 575 args = CMSG_DATA(cmsg); 576 576 577 - if (rs->rs_bound_addr == 0) { 577 + if (ipv6_addr_any(&rs->rs_bound_addr)) { 578 578 ret = -ENOTCONN; /* XXX not a great errno */ 579 579 goto out_ret; 580 580 }
+40 -16
net/rds/rdma_transport.c
··· 1 1 /* 2 - * Copyright (c) 2009 Oracle. All rights reserved. 2 + * Copyright (c) 2009, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 39 39 40 40 static struct rdma_cm_id *rds_rdma_listen_id; 41 41 42 - int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 43 - struct rdma_cm_event *event) 42 + static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, 43 + struct rdma_cm_event *event, 44 + bool isv6) 44 45 { 45 46 /* this can be null in the listening path */ 46 47 struct rds_connection *conn = cm_id->context; ··· 73 72 74 73 switch (event->event) { 75 74 case RDMA_CM_EVENT_CONNECT_REQUEST: 76 - ret = trans->cm_handle_connect(cm_id, event); 75 + ret = trans->cm_handle_connect(cm_id, event, isv6); 77 76 break; 78 77 79 78 case RDMA_CM_EVENT_ADDR_RESOLVED: ··· 91 90 92 91 ibic = conn->c_transport_data; 93 92 if (ibic && ibic->i_cm_id == cm_id) 94 - ret = trans->cm_initiate_connect(cm_id); 93 + ret = trans->cm_initiate_connect(cm_id, isv6); 95 94 else 96 95 rds_conn_drop(conn); 97 96 } ··· 117 116 118 117 case RDMA_CM_EVENT_DISCONNECTED: 119 118 rdsdebug("DISCONNECT event - dropping connection " 120 - "%pI4->%pI4\n", &conn->c_laddr, 119 + "%pI6c->%pI6c\n", &conn->c_laddr, 121 120 &conn->c_faddr); 122 121 rds_conn_drop(conn); 123 122 break; 124 123 125 124 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 126 125 if (conn) { 127 - pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n", 126 + pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n", 128 127 &conn->c_laddr, &conn->c_faddr); 129 128 rds_conn_drop(conn); 130 129 } ··· 147 146 return ret; 148 147 } 149 148 150 - static int rds_rdma_listen_init(void) 149 + int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, 150 + struct rdma_cm_event *event) 151 151 { 152 - struct sockaddr_in sin; 152 + return 
rds_rdma_cm_event_handler_cmn(cm_id, event, false); 153 + } 154 + 155 + static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, 156 + struct sockaddr *sa, 157 + struct rdma_cm_id **ret_cm_id) 158 + { 153 159 struct rdma_cm_id *cm_id; 154 160 int ret; 155 161 156 - cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL, 162 + cm_id = rdma_create_id(&init_net, handler, NULL, 157 163 RDMA_PS_TCP, IB_QPT_RC); 158 164 if (IS_ERR(cm_id)) { 159 165 ret = PTR_ERR(cm_id); ··· 169 161 return ret; 170 162 } 171 163 172 - sin.sin_family = AF_INET; 173 - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); 174 - sin.sin_port = (__force u16)htons(RDS_PORT); 175 - 176 164 /* 177 165 * XXX I bet this binds the cm_id to a device. If we want to support 178 166 * fail-over we'll have to take this into consideration. 179 167 */ 180 - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); 168 + ret = rdma_bind_addr(cm_id, sa); 181 169 if (ret) { 182 170 printk(KERN_ERR "RDS/RDMA: failed to setup listener, " 183 171 "rdma_bind_addr() returned %d\n", ret); ··· 189 185 190 186 rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); 191 187 192 - rds_rdma_listen_id = cm_id; 188 + *ret_cm_id = cm_id; 193 189 cm_id = NULL; 194 190 out: 195 191 if (cm_id) 196 192 rdma_destroy_id(cm_id); 193 + return ret; 194 + } 195 + 196 + /* Initialize the RDS RDMA listeners. We create two listeners for 197 + * compatibility reason. The one on RDS_PORT is used for IPv4 198 + * requests only. The one on RDS_CM_PORT is used for IPv6 requests 199 + * only. So only IPv6 enabled RDS module will communicate using this 200 + * port. 
201 + */ 202 + static int rds_rdma_listen_init(void) 203 + { 204 + int ret; 205 + struct sockaddr_in sin; 206 + 207 + sin.sin_family = PF_INET; 208 + sin.sin_addr.s_addr = htonl(INADDR_ANY); 209 + sin.sin_port = htons(RDS_PORT); 210 + ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler, 211 + (struct sockaddr *)&sin, 212 + &rds_rdma_listen_id); 197 213 return ret; 198 214 } 199 215
+47 -23
net/rds/rds.h
··· 10 10 #include <linux/rds.h> 11 11 #include <linux/rhashtable.h> 12 12 #include <linux/refcount.h> 13 + #include <linux/in6.h> 13 14 14 15 #include "info.h" 15 16 ··· 31 30 * userspace from listening. 32 31 */ 33 32 #define RDS_PORT 18634 33 + #define RDS_CM_PORT 16385 34 34 35 35 #ifdef ATOMIC64_INIT 36 36 #define KERNEL_HAS_ATOMIC64 ··· 63 61 64 62 struct rds_cong_map { 65 63 struct rb_node m_rb_node; 66 - __be32 m_addr; 64 + struct in6_addr m_addr; 67 65 wait_queue_head_t m_waitq; 68 66 struct list_head m_conn_list; 69 67 unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; ··· 138 136 /* One rds_connection per RDS address pair */ 139 137 struct rds_connection { 140 138 struct hlist_node c_hash_node; 141 - __be32 c_laddr; 142 - __be32 c_faddr; 139 + struct in6_addr c_laddr; 140 + struct in6_addr c_faddr; 141 + int c_dev_if; /* c_laddrs's interface index */ 143 142 unsigned int c_loopback:1, 143 + c_isv6:1, 144 144 c_ping_triggered:1, 145 - c_pad_to_32:30; 145 + c_pad_to_32:29; 146 146 int c_npaths; 147 147 struct rds_connection *c_passive; 148 148 struct rds_transport *c_trans; ··· 273 269 struct rds_conn_path *i_conn_path; 274 270 struct rds_header i_hdr; 275 271 unsigned long i_rx_jiffies; 276 - __be32 i_saddr; 272 + struct in6_addr i_saddr; 277 273 278 274 rds_rdma_cookie_t i_rdma_cookie; 279 275 struct timeval i_rx_tstamp; ··· 390 386 struct list_head m_conn_item; 391 387 struct rds_incoming m_inc; 392 388 u64 m_ack_seq; 393 - __be32 m_daddr; 389 + struct in6_addr m_daddr; 394 390 unsigned long m_flags; 395 391 396 392 /* Never access m_rs without holding m_rs_lock. 
··· 523 519 t_mp_capable:1; 524 520 unsigned int t_type; 525 521 526 - int (*laddr_check)(struct net *net, __be32 addr); 522 + int (*laddr_check)(struct net *net, const struct in6_addr *addr, 523 + __u32 scope_id); 527 524 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); 528 525 void (*conn_free)(void *data); 529 526 int (*conn_path_connect)(struct rds_conn_path *cp); ··· 540 535 void (*inc_free)(struct rds_incoming *inc); 541 536 542 537 int (*cm_handle_connect)(struct rdma_cm_id *cm_id, 543 - struct rdma_cm_event *event); 544 - int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); 538 + struct rdma_cm_event *event, bool isv6); 539 + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6); 545 540 void (*cm_connect_complete)(struct rds_connection *conn, 546 541 struct rdma_cm_event *event); 547 542 ··· 556 551 bool (*t_unloading)(struct rds_connection *conn); 557 552 }; 558 553 554 + /* Bind hash table key length. It is the sum of the size of a struct 555 + * in6_addr, a scope_id and a port. 556 + */ 557 + #define RDS_BOUND_KEY_LEN \ 558 + (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16)) 559 + 559 560 struct rds_sock { 560 561 struct sock rs_sk; 561 562 ··· 573 562 * support. 
574 563 */ 575 564 struct rhash_head rs_bound_node; 576 - u64 rs_bound_key; 577 - __be32 rs_bound_addr; 578 - __be32 rs_conn_addr; 579 - __be16 rs_bound_port; 565 + u8 rs_bound_key[RDS_BOUND_KEY_LEN]; 566 + struct sockaddr_in6 rs_bound_sin6; 567 + #define rs_bound_addr rs_bound_sin6.sin6_addr 568 + #define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3] 569 + #define rs_bound_port rs_bound_sin6.sin6_port 570 + #define rs_bound_scope_id rs_bound_sin6.sin6_scope_id 571 + struct in6_addr rs_conn_addr; 572 + #define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3] 580 573 __be16 rs_conn_port; 581 574 struct rds_transport *rs_transport; 582 575 ··· 716 701 /* bind.c */ 717 702 int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); 718 703 void rds_remove_bound(struct rds_sock *rs); 719 - struct rds_sock *rds_find_bound(__be32 addr, __be16 port); 704 + struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, 705 + __u32 scope_id); 720 706 int rds_bind_lock_init(void); 721 707 void rds_bind_lock_destroy(void); 722 708 ··· 741 725 int rds_conn_init(void); 742 726 void rds_conn_exit(void); 743 727 struct rds_connection *rds_conn_create(struct net *net, 744 - __be32 laddr, __be32 faddr, 745 - struct rds_transport *trans, gfp_t gfp); 728 + const struct in6_addr *laddr, 729 + const struct in6_addr *faddr, 730 + struct rds_transport *trans, gfp_t gfp, 731 + int dev_if); 746 732 struct rds_connection *rds_conn_create_outgoing(struct net *net, 747 - __be32 laddr, __be32 faddr, 748 - struct rds_transport *trans, gfp_t gfp); 733 + const struct in6_addr *laddr, 734 + const struct in6_addr *faddr, 735 + struct rds_transport *trans, 736 + gfp_t gfp, int dev_if); 749 737 void rds_conn_shutdown(struct rds_conn_path *cpath); 750 738 void rds_conn_destroy(struct rds_connection *conn); 751 739 void rds_conn_drop(struct rds_connection *conn); ··· 860 840 861 841 /* recv.c */ 862 842 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 
863 - __be32 saddr); 843 + struct in6_addr *saddr); 864 844 void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, 865 - __be32 saddr); 845 + struct in6_addr *saddr); 866 846 void rds_inc_put(struct rds_incoming *inc); 867 - void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, 847 + void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, 848 + struct in6_addr *daddr, 868 849 struct rds_incoming *inc, gfp_t gfp); 869 850 int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 870 851 int msg_flags); ··· 880 859 void rds_send_path_reset(struct rds_conn_path *conn); 881 860 int rds_send_xmit(struct rds_conn_path *cp); 882 861 struct sockaddr_in; 883 - void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); 862 + void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest); 884 863 typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); 885 864 void rds_send_drop_acked(struct rds_connection *conn, u64 ack, 886 865 is_acked_func is_acked); ··· 967 946 void rds_recv_worker(struct work_struct *); 968 947 void rds_connect_path_complete(struct rds_conn_path *conn, int curr); 969 948 void rds_connect_complete(struct rds_connection *conn); 949 + int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2); 970 950 971 951 /* transport.c */ 972 952 void rds_trans_register(struct rds_transport *trans); 973 953 void rds_trans_unregister(struct rds_transport *trans); 974 - struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); 954 + struct rds_transport *rds_trans_get_preferred(struct net *net, 955 + const struct in6_addr *addr, 956 + __u32 scope_id); 975 957 void rds_trans_put(struct rds_transport *trans); 976 958 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, 977 959 unsigned int avail);
+34 -17
net/rds/recv.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 41 41 #include "rds.h" 42 42 43 43 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, 44 - __be32 saddr) 44 + struct in6_addr *saddr) 45 45 { 46 46 int i; 47 47 48 48 refcount_set(&inc->i_refcount, 1); 49 49 INIT_LIST_HEAD(&inc->i_item); 50 50 inc->i_conn = conn; 51 - inc->i_saddr = saddr; 51 + inc->i_saddr = *saddr; 52 52 inc->i_rdma_cookie = 0; 53 53 inc->i_rx_tstamp.tv_sec = 0; 54 54 inc->i_rx_tstamp.tv_usec = 0; ··· 59 59 EXPORT_SYMBOL_GPL(rds_inc_init); 60 60 61 61 void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, 62 - __be32 saddr) 62 + struct in6_addr *saddr) 63 63 { 64 64 refcount_set(&inc->i_refcount, 1); 65 65 INIT_LIST_HEAD(&inc->i_item); 66 66 inc->i_conn = cp->cp_conn; 67 67 inc->i_conn_path = cp; 68 - inc->i_saddr = saddr; 68 + inc->i_saddr = *saddr; 69 69 inc->i_rdma_cookie = 0; 70 70 inc->i_rx_tstamp.tv_sec = 0; 71 71 inc->i_rx_tstamp.tv_usec = 0; ··· 110 110 111 111 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); 112 112 113 - rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " 113 + rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d " 114 114 "now_cong %d delta %d\n", 115 115 rs, &rs->rs_bound_addr, 116 116 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, ··· 260 260 struct rds_conn_path *cp; 261 261 262 262 if (conn->c_npaths > 1 && 263 - IS_CANONICAL(conn->c_laddr, conn->c_faddr)) { 263 + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) { 264 264 for (i = 0; i < conn->c_npaths; i++) { 265 265 cp = &conn->c_path[i]; 266 266 rds_conn_path_connect_if_down(cp); ··· 284 284 * conn. 
This lets loopback, who only has one conn for both directions, 285 285 * tell us which roles the addrs in the conn are playing for this message. 286 286 */ 287 - void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, 287 + void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, 288 + struct in6_addr *daddr, 288 289 struct rds_incoming *inc, gfp_t gfp) 289 290 { 290 291 struct rds_sock *rs = NULL; ··· 340 339 341 340 if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { 342 341 if (inc->i_hdr.h_sport == 0) { 343 - rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); 342 + rdsdebug("ignore ping with 0 sport from %pI6c\n", 343 + saddr); 344 344 goto out; 345 345 } 346 346 rds_stats_inc(s_recv_ping); ··· 364 362 goto out; 365 363 } 366 364 367 - rs = rds_find_bound(daddr, inc->i_hdr.h_dport); 365 + rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_dev_if); 368 366 if (!rs) { 369 367 rds_stats_inc(s_recv_drop_no_sock); 370 368 goto out; ··· 627 625 struct rds_sock *rs = rds_sk_to_rs(sk); 628 626 long timeo; 629 627 int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; 628 + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); 630 629 DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); 631 630 struct rds_incoming *inc = NULL; 632 631 ··· 676 673 break; 677 674 } 678 675 679 - rdsdebug("copying inc %p from %pI4:%u to user\n", inc, 676 + rdsdebug("copying inc %p from %pI6c:%u to user\n", inc, 680 677 &inc->i_conn->c_faddr, 681 678 ntohs(inc->i_hdr.h_sport)); 682 679 ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); ··· 710 707 711 708 rds_stats_inc(s_recv_delivered); 712 709 713 - if (sin) { 714 - sin->sin_family = AF_INET; 715 - sin->sin_port = inc->i_hdr.h_sport; 716 - sin->sin_addr.s_addr = inc->i_saddr; 717 - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 718 - msg->msg_namelen = sizeof(*sin); 710 + if (msg->msg_name) { 711 + if (ipv6_addr_v4mapped(&inc->i_saddr)) { 712 + sin = 
(struct sockaddr_in *)msg->msg_name; 713 + 714 + sin->sin_family = AF_INET; 715 + sin->sin_port = inc->i_hdr.h_sport; 716 + sin->sin_addr.s_addr = 717 + inc->i_saddr.s6_addr32[3]; 718 + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); 719 + msg->msg_namelen = sizeof(*sin); 720 + } else { 721 + sin6 = (struct sockaddr_in6 *)msg->msg_name; 722 + 723 + sin6->sin6_family = AF_INET6; 724 + sin6->sin6_port = inc->i_hdr.h_sport; 725 + sin6->sin6_addr = inc->i_saddr; 726 + sin6->sin6_flowinfo = 0; 727 + sin6->sin6_scope_id = rs->rs_bound_scope_id; 728 + msg->msg_namelen = sizeof(*sin6); 729 + } 719 730 } 720 731 break; 721 732 }
+52 -15
net/rds/send.c
··· 709 709 } 710 710 EXPORT_SYMBOL_GPL(rds_send_drop_acked); 711 711 712 - void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) 712 + void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest) 713 713 { 714 714 struct rds_message *rm, *tmp; 715 715 struct rds_connection *conn; ··· 721 721 spin_lock_irqsave(&rs->rs_lock, flags); 722 722 723 723 list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { 724 - if (dest && (dest->sin_addr.s_addr != rm->m_daddr || 725 - dest->sin_port != rm->m_inc.i_hdr.h_dport)) 724 + if (dest && 725 + (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) || 726 + dest->sin6_port != rm->m_inc.i_hdr.h_dport)) 726 727 continue; 727 728 728 729 list_move(&rm->m_sock_item, &list); ··· 1060 1059 { 1061 1060 struct sock *sk = sock->sk; 1062 1061 struct rds_sock *rs = rds_sk_to_rs(sk); 1062 + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); 1063 1063 DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); 1064 - __be32 daddr; 1065 1064 __be16 dport; 1066 1065 struct rds_message *rm = NULL; 1067 1066 struct rds_connection *conn; ··· 1070 1069 int nonblock = msg->msg_flags & MSG_DONTWAIT; 1071 1070 long timeo = sock_sndtimeo(sk, nonblock); 1072 1071 struct rds_conn_path *cpath; 1072 + struct in6_addr daddr; 1073 + __u32 scope_id = 0; 1073 1074 size_t total_payload_len = payload_len, rdma_payload_len = 0; 1074 1075 bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && 1075 1076 sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); 1076 1077 int num_sgs = ceil(payload_len, PAGE_SIZE); 1078 + int namelen; 1077 1079 1078 1080 /* Mirror Linux UDP mirror of BSD error message compatibility */ 1079 1081 /* XXX: Perhaps MSG_MORE someday */ ··· 1085 1081 goto out; 1086 1082 } 1087 1083 1088 - if (msg->msg_namelen) { 1089 - /* XXX fail non-unicast destination IPs? 
*/ 1090 - if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { 1084 + namelen = msg->msg_namelen; 1085 + if (namelen != 0) { 1086 + if (namelen < sizeof(*usin)) { 1091 1087 ret = -EINVAL; 1092 1088 goto out; 1093 1089 } 1094 - daddr = usin->sin_addr.s_addr; 1095 - dport = usin->sin_port; 1090 + switch (namelen) { 1091 + case sizeof(*usin): 1092 + if (usin->sin_family != AF_INET || 1093 + usin->sin_addr.s_addr == htonl(INADDR_ANY) || 1094 + usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || 1095 + IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) { 1096 + ret = -EINVAL; 1097 + goto out; 1098 + } 1099 + ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr); 1100 + dport = usin->sin_port; 1101 + break; 1102 + 1103 + case sizeof(*sin6): { 1104 + ret = -EPROTONOSUPPORT; 1105 + goto out; 1106 + } 1107 + 1108 + default: 1109 + ret = -EINVAL; 1110 + goto out; 1111 + } 1096 1112 } else { 1097 1113 /* We only care about consistency with ->connect() */ 1098 1114 lock_sock(sk); 1099 1115 daddr = rs->rs_conn_addr; 1100 1116 dport = rs->rs_conn_port; 1117 + scope_id = rs->rs_bound_scope_id; 1101 1118 release_sock(sk); 1102 1119 } 1103 1120 1104 1121 lock_sock(sk); 1105 - if (daddr == 0 || rs->rs_bound_addr == 0) { 1122 + if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) { 1106 1123 release_sock(sk); 1107 - ret = -ENOTCONN; /* XXX not a great errno */ 1124 + ret = -ENOTCONN; 1108 1125 goto out; 1126 + } else if (namelen != 0) { 1127 + /* Cannot send to an IPv4 address using an IPv6 source 1128 + * address and cannot send to an IPv6 address using an 1129 + * IPv4 source address. 1130 + */ 1131 + if (ipv6_addr_v4mapped(&daddr) ^ 1132 + ipv6_addr_v4mapped(&rs->rs_bound_addr)) { 1133 + release_sock(sk); 1134 + ret = -EOPNOTSUPP; 1135 + goto out; 1136 + } 1109 1137 } 1110 1138 release_sock(sk); 1111 1139 ··· 1191 1155 1192 1156 /* rds_conn_create has a spinlock that runs with IRQ off. 1193 1157 * Caching the conn in the socket helps a lot. 
*/ 1194 - if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) 1158 + if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) 1195 1159 conn = rs->rs_conn; 1196 1160 else { 1197 1161 conn = rds_conn_create_outgoing(sock_net(sock->sk), 1198 - rs->rs_bound_addr, daddr, 1199 - rs->rs_transport, 1200 - sock->sk->sk_allocation); 1162 + &rs->rs_bound_addr, &daddr, 1163 + rs->rs_transport, 1164 + sock->sk->sk_allocation, 1165 + scope_id); 1201 1166 if (IS_ERR(conn)) { 1202 1167 ret = PTR_ERR(conn); 1203 1168 goto out;
+29 -3
net/rds/tcp.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 37 37 #include <net/tcp.h> 38 38 #include <net/net_namespace.h> 39 39 #include <net/netns/generic.h> 40 + #include <net/tcp.h> 41 + #include <net/addrconf.h> 40 42 41 43 #include "rds.h" 42 44 #include "tcp.h" ··· 264 262 spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); 265 263 } 266 264 267 - static int rds_tcp_laddr_check(struct net *net, __be32 addr) 265 + static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, 266 + __u32 scope_id) 268 267 { 269 - if (inet_addr_type(net, addr) == RTN_LOCAL) 268 + struct net_device *dev = NULL; 269 + int ret; 270 + 271 + if (ipv6_addr_v4mapped(addr)) { 272 + if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL) 273 + return 0; 274 + return -EADDRNOTAVAIL; 275 + } 276 + 277 + /* If the scope_id is specified, check only those addresses 278 + * hosted on the specified interface. 279 + */ 280 + if (scope_id != 0) { 281 + rcu_read_lock(); 282 + dev = dev_get_by_index_rcu(net, scope_id); 283 + /* scope_id is not valid... */ 284 + if (!dev) { 285 + rcu_read_unlock(); 286 + return -EADDRNOTAVAIL; 287 + } 288 + rcu_read_unlock(); 289 + } 290 + ret = ipv6_chk_addr(net, addr, dev, 0); 291 + if (ret) 270 292 return 0; 271 293 return -EADDRNOTAVAIL; 272 294 }
+20 -14
net/rds/tcp_connect.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 66 66 * RDS connection as RDS_CONN_UP until the reconnect, 67 67 * to avoid RDS datagram loss. 68 68 */ 69 - if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) && 69 + if (rds_addr_cmp(&cp->cp_conn->c_laddr, 70 + &cp->cp_conn->c_faddr) >= 0 && 70 71 rds_conn_path_transition(cp, RDS_CONN_CONNECTING, 71 72 RDS_CONN_ERROR)) { 72 73 rds_conn_path_drop(cp, false); ··· 89 88 int rds_tcp_conn_path_connect(struct rds_conn_path *cp) 90 89 { 91 90 struct socket *sock = NULL; 92 - struct sockaddr_in src, dest; 91 + struct sockaddr_in sin; 92 + struct sockaddr *addr; 93 + int addrlen; 93 94 int ret; 94 95 struct rds_connection *conn = cp->cp_conn; 95 96 struct rds_tcp_connection *tc = cp->cp_transport_data; ··· 115 112 116 113 rds_tcp_tune(sock); 117 114 118 - src.sin_family = AF_INET; 119 - src.sin_addr.s_addr = (__force u32)conn->c_laddr; 120 - src.sin_port = (__force u16)htons(0); 115 + sin.sin_family = AF_INET; 116 + sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; 117 + sin.sin_port = 0; 118 + addr = (struct sockaddr *)&sin; 119 + addrlen = sizeof(sin); 121 120 122 - ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); 121 + ret = sock->ops->bind(sock, addr, addrlen); 123 122 if (ret) { 124 - rdsdebug("bind failed with %d at address %pI4\n", 123 + rdsdebug("bind failed with %d at address %pI6c\n", 125 124 ret, &conn->c_laddr); 126 125 goto out; 127 126 } 128 127 129 - dest.sin_family = AF_INET; 130 - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; 131 - dest.sin_port = (__force u16)htons(RDS_TCP_PORT); 128 + sin.sin_family = AF_INET; 129 + sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; 130 + sin.sin_port = htons(RDS_TCP_PORT); 131 + addr = (struct 
sockaddr *)&sin; 132 + addrlen = sizeof(sin); 132 133 133 134 /* 134 135 * once we call connect() we can start getting callbacks and they 135 136 * own the socket 136 137 */ 137 138 rds_tcp_set_callbacks(sock, cp); 138 - ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), 139 - O_NONBLOCK); 139 + ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK); 140 140 141 - rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); 141 + rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); 142 142 if (ret == -EINPROGRESS) 143 143 ret = 0; 144 144 if (ret == 0) {
+10 -8
net/rds/tcp_listen.c
··· 1 1 /* 2 - * Copyright (c) 2006, 2018 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 83 83 struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) 84 84 { 85 85 int i; 86 - bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr); 87 86 int npaths = max_t(int, 1, conn->c_npaths); 88 87 89 88 /* for mprds, all paths MUST be initiated by the peer 90 89 * with the smaller address. 91 90 */ 92 - if (!peer_is_smaller) { 91 + if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) { 93 92 /* Make sure we initiate at least one path if this 94 93 * has not already been done; rds_start_mprds() will 95 94 * take care of additional paths, if necessary. ··· 163 164 164 165 inet = inet_sk(new_sock->sk); 165 166 166 - rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", 167 - &inet->inet_saddr, ntohs(inet->inet_sport), 168 - &inet->inet_daddr, ntohs(inet->inet_dport)); 167 + rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n", 168 + &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport), 169 + &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport)); 169 170 170 171 conn = rds_conn_create(sock_net(sock->sk), 171 - inet->inet_saddr, inet->inet_daddr, 172 - &rds_tcp_transport, GFP_KERNEL); 172 + &new_sock->sk->sk_v6_rcv_saddr, 173 + &new_sock->sk->sk_v6_daddr, 174 + &rds_tcp_transport, GFP_KERNEL, 175 + new_sock->sk->sk_bound_dev_if); 176 + 173 177 if (IS_ERR(conn)) { 174 178 ret = PTR_ERR(conn); 175 179 goto out;
+5 -4
net/rds/tcp_recv.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 179 179 tc->t_tinc = tinc; 180 180 rdsdebug("alloced tinc %p\n", tinc); 181 181 rds_inc_path_init(&tinc->ti_inc, cp, 182 - cp->cp_conn->c_faddr); 182 + &cp->cp_conn->c_faddr); 183 183 tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = 184 184 local_clock(); 185 185 ··· 239 239 if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) 240 240 rds_tcp_cong_recv(conn, tinc); 241 241 else 242 - rds_recv_incoming(conn, conn->c_faddr, 243 - conn->c_laddr, &tinc->ti_inc, 242 + rds_recv_incoming(conn, &conn->c_faddr, 243 + &conn->c_laddr, 244 + &tinc->ti_inc, 244 245 arg->gfp); 245 246 246 247 tc->t_tinc_hdr_rem = sizeof(struct rds_header);
+2 -2
net/rds/tcp_send.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 153 153 * an incoming RST. 154 154 */ 155 155 if (rds_conn_path_up(cp)) { 156 - pr_warn("RDS/tcp: send to %pI4 on cp [%d]" 156 + pr_warn("RDS/tcp: send to %pI6c on cp [%d]" 157 157 "returned %d, " 158 158 "disconnecting and reconnecting\n", 159 159 &conn->c_faddr, cp->cp_index, ret);
+58 -11
net/rds/threads.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 82 82 return; 83 83 } 84 84 85 - rdsdebug("conn %p for %pI4 to %pI4 complete\n", 86 - cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); 85 + rdsdebug("conn %p for %pI6c to %pI6c complete\n", 86 + cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr); 87 87 88 88 cp->cp_reconnect_jiffies = 0; 89 89 set_bit(0, &cp->cp_conn->c_map_queued); ··· 125 125 unsigned long rand; 126 126 struct rds_connection *conn = cp->cp_conn; 127 127 128 - rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", 129 - conn, &conn->c_laddr, &conn->c_faddr, 130 - cp->cp_reconnect_jiffies); 128 + rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n", 129 + conn, &conn->c_laddr, &conn->c_faddr, 130 + cp->cp_reconnect_jiffies); 131 131 132 132 /* let peer with smaller addr initiate reconnect, to avoid duels */ 133 133 if (conn->c_trans->t_type == RDS_TRANS_TCP && 134 - !IS_CANONICAL(conn->c_laddr, conn->c_faddr)) 134 + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0) 135 135 return; 136 136 137 137 set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); ··· 145 145 } 146 146 147 147 get_random_bytes(&rand, sizeof(rand)); 148 - rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", 148 + rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n", 149 149 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, 150 150 conn, &conn->c_laddr, &conn->c_faddr); 151 151 rcu_read_lock(); ··· 167 167 int ret; 168 168 169 169 if (cp->cp_index > 0 && 170 - !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr)) 170 + rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0) 171 171 return; 172 172 clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); 173 173 ret = 
rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); 174 174 if (ret) { 175 175 ret = conn->c_trans->conn_path_connect(cp); 176 - rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", 177 - conn, &conn->c_laddr, &conn->c_faddr, ret); 176 + rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n", 177 + conn, &conn->c_laddr, &conn->c_faddr, ret); 178 178 179 179 if (ret) { 180 180 if (rds_conn_path_transition(cp, ··· 259 259 260 260 return 0; 261 261 } 262 + 263 + /* Compare two IPv6 addresses. Return 0 if the two addresses are equal. 264 + * Return 1 if the first is greater. Return -1 if the second is greater. 265 + */ 266 + int rds_addr_cmp(const struct in6_addr *addr1, 267 + const struct in6_addr *addr2) 268 + { 269 + #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 270 + const __be64 *a1, *a2; 271 + u64 x, y; 272 + 273 + a1 = (__be64 *)addr1; 274 + a2 = (__be64 *)addr2; 275 + 276 + if (*a1 != *a2) { 277 + if (be64_to_cpu(*a1) < be64_to_cpu(*a2)) 278 + return -1; 279 + else 280 + return 1; 281 + } else { 282 + x = be64_to_cpu(*++a1); 283 + y = be64_to_cpu(*++a2); 284 + if (x < y) 285 + return -1; 286 + else if (x > y) 287 + return 1; 288 + else 289 + return 0; 290 + } 291 + #else 292 + u32 a, b; 293 + int i; 294 + 295 + for (i = 0; i < 4; i++) { 296 + if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) { 297 + a = ntohl(addr1->s6_addr32[i]); 298 + b = ntohl(addr2->s6_addr32[i]); 299 + if (a < b) 300 + return -1; 301 + else if (a > b) 302 + return 1; 303 + } 304 + } 305 + return 0; 306 + #endif 307 + } 308 + EXPORT_SYMBOL_GPL(rds_addr_cmp);
+11 -4
net/rds/transport.c
··· 1 1 /* 2 - * Copyright (c) 2006 Oracle. All rights reserved. 2 + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 3 3 * 4 4 * This software is available to you under a choice of one of two 5 5 * licenses. You may choose to be licensed under the terms of the GNU ··· 33 33 #include <linux/kernel.h> 34 34 #include <linux/module.h> 35 35 #include <linux/in.h> 36 + #include <linux/ipv6.h> 36 37 37 38 #include "rds.h" 38 39 #include "loop.h" ··· 76 75 module_put(trans->t_owner); 77 76 } 78 77 79 - struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) 78 + struct rds_transport *rds_trans_get_preferred(struct net *net, 79 + const struct in6_addr *addr, 80 + __u32 scope_id) 80 81 { 81 82 struct rds_transport *ret = NULL; 82 83 struct rds_transport *trans; 83 84 unsigned int i; 84 85 85 - if (IN_LOOPBACK(ntohl(addr))) 86 + if (ipv6_addr_v4mapped(addr)) { 87 + if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET) 88 + return &rds_loop_transport; 89 + } else if (ipv6_addr_loopback(addr)) { 86 90 return &rds_loop_transport; 91 + } 87 92 88 93 down_read(&rds_trans_sem); 89 94 for (i = 0; i < RDS_TRANS_COUNT; i++) { 90 95 trans = transports[i]; 91 96 92 - if (trans && (trans->laddr_check(net, addr) == 0) && 97 + if (trans && (trans->laddr_check(net, addr, scope_id) == 0) && 93 98 (!trans->t_owner || try_module_get(trans->t_owner))) { 94 99 ret = trans; 95 100 break;