Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'tcp_metrics_saddr'

Christoph Paasch says:

====================
Make tcp-metrics source-address aware

Currently tcp-metrics only stores metrics per destination address. This causes
problems when a host has multiple interfaces (e.g., a smartphone with
WiFi/3G):

For example, a host contacting a server over WiFi will store the tcp-metrics
per destination IP. If then the host contacts the same server over 3G, the
same tcp-metrics will be used, although the path characteristics are completely
different (e.g., the ssthresh is probably not the same).

In case of TFO this is not a problem, as the server will provide us a new cookie
once it has seen our SYN+DATA with an incorrect cookie.
It may be (in case of carrier-grade NAT), that we keep the same public IP but
have a different private IP. Thus, we had better reuse the old cookie even if our
source-IP has changed. However, this scenario is probably very uncommon, as
carriers try to provide the same src-IP to the clients behind their CGN.

Patches 1 + 2 add the source-IP to the tcp metrics.

Patches 3 to 5 modify the netlink-api to support the source-IP. From now on,
when using the command "ip tcp_metrics delete address ADDRESS" all entries
which match this destination IP will be deleted.

Today's iproute2 will complain when doing "ip tcp_metrics flush PREFIX" if
several entries are present for the same destination-IP but with different
source-IPs:

root@client:~/test# ip tcp_metrics
10.2.1.2 age 3.640sec rtt 16250us rttvar 15000us cwnd 10
10.2.1.2 age 4.030sec rtt 18750us rttvar 15000us cwnd 10
root@client:~/test# ip tcp_metrics flush 10.2.1.2/16
Failed to send flush request
: No such process

Follow-up patches will modify iproute2 to handle this correctly and allow
specifying the source-IP in the get/del commands.

v2: Added the patch that allows selective get/del of tcp-metrics based
on src-IP and moved the patch that adds the new netlink attribute before
the other patches.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+105 -48
+2
include/uapi/linux/tcp_metrics.h
··· 35 35 TCP_METRICS_ATTR_FOPEN_SYN_DROPS, /* u16, count of drops */ 36 36 TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS, /* msecs age */ 37 37 TCP_METRICS_ATTR_FOPEN_COOKIE, /* binary */ 38 + TCP_METRICS_ATTR_SADDR_IPV4, /* u32 */ 39 + TCP_METRICS_ATTR_SADDR_IPV6, /* binary */ 38 40 39 41 __TCP_METRICS_ATTR_MAX, 40 42 };
+103 -48
net/ipv4/tcp_metrics.c
··· 31 31 32 32 struct tcp_metrics_block { 33 33 struct tcp_metrics_block __rcu *tcpm_next; 34 - struct inetpeer_addr tcpm_addr; 34 + struct inetpeer_addr tcpm_saddr; 35 + struct inetpeer_addr tcpm_daddr; 35 36 unsigned long tcpm_stamp; 36 37 u32 tcpm_ts; 37 38 u32 tcpm_ts_stamp; ··· 132 131 } 133 132 134 133 static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, 135 - struct inetpeer_addr *addr, 134 + struct inetpeer_addr *saddr, 135 + struct inetpeer_addr *daddr, 136 136 unsigned int hash, 137 137 bool reclaim) 138 138 { ··· 157 155 if (!tm) 158 156 goto out_unlock; 159 157 } 160 - tm->tcpm_addr = *addr; 158 + tm->tcpm_saddr = *saddr; 159 + tm->tcpm_daddr = *daddr; 161 160 162 161 tcpm_suck_dst(tm, dst, true); 163 162 ··· 192 189 return NULL; 193 190 } 194 191 195 - static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, 192 + static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, 193 + const struct inetpeer_addr *daddr, 196 194 struct net *net, unsigned int hash) 197 195 { 198 196 struct tcp_metrics_block *tm; ··· 201 197 202 198 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; 203 199 tm = rcu_dereference(tm->tcpm_next)) { 204 - if (addr_same(&tm->tcpm_addr, addr)) 200 + if (addr_same(&tm->tcpm_saddr, saddr) && 201 + addr_same(&tm->tcpm_daddr, daddr)) 205 202 break; 206 203 depth++; 207 204 } ··· 213 208 struct dst_entry *dst) 214 209 { 215 210 struct tcp_metrics_block *tm; 216 - struct inetpeer_addr addr; 211 + struct inetpeer_addr saddr, daddr; 217 212 unsigned int hash; 218 213 struct net *net; 219 214 220 - addr.family = req->rsk_ops->family; 221 - switch (addr.family) { 215 + saddr.family = req->rsk_ops->family; 216 + daddr.family = req->rsk_ops->family; 217 + switch (daddr.family) { 222 218 case AF_INET: 223 - addr.addr.a4 = inet_rsk(req)->ir_rmt_addr; 224 - hash = (__force unsigned int) addr.addr.a4; 219 + saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; 220 + 
daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; 221 + hash = (__force unsigned int) daddr.addr.a4; 225 222 break; 226 223 #if IS_ENABLED(CONFIG_IPV6) 227 224 case AF_INET6: 228 - *(struct in6_addr *)addr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; 225 + *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr; 226 + *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; 229 227 hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); 230 228 break; 231 229 #endif ··· 241 233 242 234 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; 243 235 tm = rcu_dereference(tm->tcpm_next)) { 244 - if (addr_same(&tm->tcpm_addr, &addr)) 236 + if (addr_same(&tm->tcpm_saddr, &saddr) && 237 + addr_same(&tm->tcpm_daddr, &daddr)) 245 238 break; 246 239 } 247 240 tcpm_check_stamp(tm, dst); ··· 252 243 static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) 253 244 { 254 245 struct tcp_metrics_block *tm; 255 - struct inetpeer_addr addr; 246 + struct inetpeer_addr saddr, daddr; 256 247 unsigned int hash; 257 248 struct net *net; 258 249 259 - addr.family = tw->tw_family; 260 - switch (addr.family) { 250 + saddr.family = tw->tw_family; 251 + daddr.family = tw->tw_family; 252 + switch (daddr.family) { 261 253 case AF_INET: 262 - addr.addr.a4 = tw->tw_daddr; 263 - hash = (__force unsigned int) addr.addr.a4; 254 + saddr.addr.a4 = tw->tw_rcv_saddr; 255 + daddr.addr.a4 = tw->tw_daddr; 256 + hash = (__force unsigned int) daddr.addr.a4; 264 257 break; 265 258 #if IS_ENABLED(CONFIG_IPV6) 266 259 case AF_INET6: 267 - *(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr; 260 + *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; 261 + *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; 268 262 hash = ipv6_addr_hash(&tw->tw_v6_daddr); 269 263 break; 270 264 #endif ··· 280 268 281 269 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; 282 270 tm = rcu_dereference(tm->tcpm_next)) { 283 - if (addr_same(&tm->tcpm_addr, 
&addr)) 271 + if (addr_same(&tm->tcpm_saddr, &saddr) && 272 + addr_same(&tm->tcpm_daddr, &daddr)) 284 273 break; 285 274 } 286 275 return tm; ··· 292 279 bool create) 293 280 { 294 281 struct tcp_metrics_block *tm; 295 - struct inetpeer_addr addr; 282 + struct inetpeer_addr saddr, daddr; 296 283 unsigned int hash; 297 284 struct net *net; 298 285 bool reclaim; 299 286 300 - addr.family = sk->sk_family; 301 - switch (addr.family) { 287 + saddr.family = sk->sk_family; 288 + daddr.family = sk->sk_family; 289 + switch (daddr.family) { 302 290 case AF_INET: 303 - addr.addr.a4 = inet_sk(sk)->inet_daddr; 304 - hash = (__force unsigned int) addr.addr.a4; 291 + saddr.addr.a4 = inet_sk(sk)->inet_saddr; 292 + daddr.addr.a4 = inet_sk(sk)->inet_daddr; 293 + hash = (__force unsigned int) daddr.addr.a4; 305 294 break; 306 295 #if IS_ENABLED(CONFIG_IPV6) 307 296 case AF_INET6: 308 - *(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr; 297 + *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; 298 + *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; 309 299 hash = ipv6_addr_hash(&sk->sk_v6_daddr); 310 300 break; 311 301 #endif ··· 319 303 net = dev_net(dst->dev); 320 304 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); 321 305 322 - tm = __tcp_get_metrics(&addr, net, hash); 306 + tm = __tcp_get_metrics(&saddr, &daddr, net, hash); 323 307 reclaim = false; 324 308 if (tm == TCP_METRICS_RECLAIM_PTR) { 325 309 reclaim = true; 326 310 tm = NULL; 327 311 } 328 312 if (!tm && create) 329 - tm = tcpm_new(dst, &addr, hash, reclaim); 313 + tm = tcpm_new(dst, &saddr, &daddr, hash, reclaim); 330 314 else 331 315 tcpm_check_stamp(tm, dst); 332 316 ··· 740 724 struct nlattr *nest; 741 725 int i; 742 726 743 - switch (tm->tcpm_addr.family) { 727 + switch (tm->tcpm_daddr.family) { 744 728 case AF_INET: 745 729 if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, 746 - tm->tcpm_addr.addr.a4) < 0) 730 + tm->tcpm_daddr.addr.a4) < 0) 731 + goto nla_put_failure; 732 + if (nla_put_be32(msg, 
TCP_METRICS_ATTR_SADDR_IPV4, 733 + tm->tcpm_saddr.addr.a4) < 0) 747 734 goto nla_put_failure; 748 735 break; 749 736 case AF_INET6: 750 737 if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, 751 - tm->tcpm_addr.addr.a6) < 0) 738 + tm->tcpm_daddr.addr.a6) < 0) 739 + goto nla_put_failure; 740 + if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16, 741 + tm->tcpm_saddr.addr.a6) < 0) 752 742 goto nla_put_failure; 753 743 break; 754 744 default: ··· 877 855 return skb->len; 878 856 } 879 857 880 - static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, 881 - unsigned int *hash, int optional) 858 + static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, 859 + unsigned int *hash, int optional, int v4, int v6) 882 860 { 883 861 struct nlattr *a; 884 862 885 - a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV4]; 863 + a = info->attrs[v4]; 886 864 if (a) { 887 865 addr->family = AF_INET; 888 866 addr->addr.a4 = nla_get_be32(a); 889 - *hash = (__force unsigned int) addr->addr.a4; 867 + if (hash) 868 + *hash = (__force unsigned int) addr->addr.a4; 890 869 return 0; 891 870 } 892 - a = info->attrs[TCP_METRICS_ATTR_ADDR_IPV6]; 871 + a = info->attrs[v6]; 893 872 if (a) { 894 873 if (nla_len(a) != sizeof(struct in6_addr)) 895 874 return -EINVAL; 896 875 addr->family = AF_INET6; 897 876 memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); 898 - *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); 877 + if (hash) 878 + *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); 899 879 return 0; 900 880 } 901 881 return optional ? 
1 : -EAFNOSUPPORT; 902 882 } 903 883 884 + static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, 885 + unsigned int *hash, int optional) 886 + { 887 + return __parse_nl_addr(info, addr, hash, optional, 888 + TCP_METRICS_ATTR_ADDR_IPV4, 889 + TCP_METRICS_ATTR_ADDR_IPV6); 890 + } 891 + 892 + static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) 893 + { 894 + return __parse_nl_addr(info, addr, NULL, 0, 895 + TCP_METRICS_ATTR_SADDR_IPV4, 896 + TCP_METRICS_ATTR_SADDR_IPV6); 897 + } 898 + 904 899 static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) 905 900 { 906 901 struct tcp_metrics_block *tm; 907 - struct inetpeer_addr addr; 902 + struct inetpeer_addr saddr, daddr; 908 903 unsigned int hash; 909 904 struct sk_buff *msg; 910 905 struct net *net = genl_info_net(info); 911 906 void *reply; 912 907 int ret; 908 + bool src = true; 913 909 914 - ret = parse_nl_addr(info, &addr, &hash, 0); 910 + ret = parse_nl_addr(info, &daddr, &hash, 0); 915 911 if (ret < 0) 916 912 return ret; 913 + 914 + ret = parse_nl_saddr(info, &saddr); 915 + if (ret < 0) 916 + src = false; 917 917 918 918 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); 919 919 if (!msg) ··· 951 907 rcu_read_lock(); 952 908 for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; 953 909 tm = rcu_dereference(tm->tcpm_next)) { 954 - if (addr_same(&tm->tcpm_addr, &addr)) { 910 + if (addr_same(&tm->tcpm_daddr, &daddr) && 911 + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { 955 912 ret = tcp_metrics_fill_info(msg, tm); 956 913 break; 957 914 } ··· 1005 960 static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) 1006 961 { 1007 962 struct tcpm_hash_bucket *hb; 1008 - struct tcp_metrics_block *tm; 963 + struct tcp_metrics_block *tm, *tmlist = NULL; 1009 964 struct tcp_metrics_block __rcu **pp; 1010 - struct inetpeer_addr addr; 965 + struct inetpeer_addr saddr, daddr; 1011 966 unsigned int hash; 1012 967 struct net 
*net = genl_info_net(info); 1013 968 int ret; 969 + bool src = true; 1014 970 1015 - ret = parse_nl_addr(info, &addr, &hash, 1); 971 + ret = parse_nl_addr(info, &daddr, &hash, 1); 1016 972 if (ret < 0) 1017 973 return ret; 1018 974 if (ret > 0) 1019 975 return tcp_metrics_flush_all(net); 976 + ret = parse_nl_saddr(info, &saddr); 977 + if (ret < 0) 978 + src = false; 1020 979 1021 980 hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); 1022 981 hb = net->ipv4.tcp_metrics_hash + hash; 1023 982 pp = &hb->chain; 1024 983 spin_lock_bh(&tcp_metrics_lock); 1025 - for (tm = deref_locked_genl(*pp); tm; 1026 - pp = &tm->tcpm_next, tm = deref_locked_genl(*pp)) { 1027 - if (addr_same(&tm->tcpm_addr, &addr)) { 984 + for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { 985 + if (addr_same(&tm->tcpm_daddr, &daddr) && 986 + (!src || addr_same(&tm->tcpm_saddr, &saddr))) { 1028 987 *pp = tm->tcpm_next; 1029 - break; 988 + tm->tcpm_next = tmlist; 989 + tmlist = tm; 990 + } else { 991 + pp = &tm->tcpm_next; 1030 992 } 1031 993 } 1032 994 spin_unlock_bh(&tcp_metrics_lock); 1033 - if (!tm) 995 + if (!tmlist) 1034 996 return -ESRCH; 1035 - kfree_rcu(tm, rcu_head); 997 + for (tm = tmlist; tm; tm = tmlist) { 998 + tmlist = tm->tcpm_next; 999 + kfree_rcu(tm, rcu_head); 1000 + } 1036 1001 return 0; 1037 1002 } 1038 1003