Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'metrics_restructure'

This patch series works towards the goal of minimizing the amount
of things that can change in an ipv4 route.

In a regime where the routing cache is removed, route changes will
lead to cloning in the FIB tables or similar.

The largest trigger of route metrics writes, TCP, now has its own
cache of dynamic metric state. The timewait timestamps are stored
there now as well.

As a result of that, pre-cowing metrics is no longer necessary,
and therefore FLOWI_FLAG_PRECOW_METRICS is removed.

Redirect and PMTU handling is moved back into the ipv4 routes. I'm
sorry for all the headaches that trying to do this in the inetpeer
has caused; it was the wrong approach for sure.

Since metrics become read-only for ipv4 we no longer need the inetpeer
hung off of the ipv4 routes either. So those disappear too.

Also, timewait sockets no longer need to hold onto an inetpeer either.

After this series, we still have some details to resolve wrt. PMTU and
redirects for a route-cache-less system:

1) With just the plain route cache removal, PMTU will continue to
work mostly fine. This is because of how the local route users
call down into the PMTU update code with the route they already
hold.

However, if we wish to cache pre-computed routes in fib_info
nexthops (which we want for performance), then we need to add
route cloning for PMTU events.

2) Redirects require more work. First, redirects must be changed to
be handled like PMTU. Wherein we call down into the sockets and
other entities, and then they call back into the routing code with
the route they were using.

So we'll be adding an ->update_nexthop() method alongside
->update_pmtu().

And then, like for PMTU, we'll need cloning support once we start
caching routes in the fib_info nexthops.

But that's it, we can completely pull the trigger and remove the
routing cache with minimal disruptions.

As it is, this patch series alone helps a lot of things. For one,
routing cache entry creation should be a lot faster, because we no
longer do inetpeer lookups (even to check if an entry exists).

This patch series also opens the door for non-DST_HOST ipv4 routes,
because nothing fundamentally cares about rt->rt_dst any more. It
can be removed with the base routing cache removal patch. In fact,
that was the primary goal of this patch series.

Signed-off-by: David S. Miller <davem@davemloft.net>

+837 -731
+1 -2
include/linux/rtnetlink.h
··· 619 619 extern void rtnl_set_sk_err(struct net *net, u32 group, int error); 620 620 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); 621 621 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, 622 - u32 id, u32 ts, u32 tsage, long expires, 623 - u32 error); 622 + u32 id, long expires, u32 error); 624 623 625 624 extern void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change); 626 625
-1
include/linux/tcp.h
··· 506 506 u32 tw_rcv_wnd; 507 507 u32 tw_ts_recent; 508 508 long tw_ts_recent_stamp; 509 - struct inet_peer *tw_peer; 510 509 #ifdef CONFIG_TCP_MD5SIG 511 510 struct tcp_md5sig_key *tw_md5_key; 512 511 #endif
-6
include/net/dst.h
··· 209 209 return msecs_to_jiffies(dst_metric(dst, metric)); 210 210 } 211 211 212 - static inline void set_dst_metric_rtt(struct dst_entry *dst, int metric, 213 - unsigned long rtt) 214 - { 215 - dst_metric_set(dst, metric, jiffies_to_msecs(rtt)); 216 - } 217 - 218 212 static inline u32 219 213 dst_allfrag(const struct dst_entry *dst) 220 214 {
+2 -3
include/net/flow.h
··· 20 20 __u8 flowic_proto; 21 21 __u8 flowic_flags; 22 22 #define FLOWI_FLAG_ANYSRC 0x01 23 - #define FLOWI_FLAG_PRECOW_METRICS 0x02 24 - #define FLOWI_FLAG_CAN_SLEEP 0x04 25 - #define FLOWI_FLAG_RT_NOCACHE 0x08 23 + #define FLOWI_FLAG_CAN_SLEEP 0x02 24 + #define FLOWI_FLAG_RT_NOCACHE 0x04 26 25 __u32 flowic_secid; 27 26 }; 28 27
-1
include/net/inet_connection_sock.h
··· 43 43 struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, 44 44 struct request_sock *req, 45 45 struct dst_entry *dst); 46 - struct inet_peer *(*get_peer)(struct sock *sk); 47 46 u16 net_header_len; 48 47 u16 net_frag_header_len; 49 48 u16 sockaddr_len;
-2
include/net/inet_sock.h
··· 245 245 246 246 if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) 247 247 flags |= FLOWI_FLAG_ANYSRC; 248 - if (sk->sk_protocol == IPPROTO_TCP) 249 - flags |= FLOWI_FLAG_PRECOW_METRICS; 250 248 return flags; 251 249 } 252 250
+1 -7
include/net/inetpeer.h
··· 36 36 u32 metrics[RTAX_MAX]; 37 37 u32 rate_tokens; /* rate limiting for ICMP */ 38 38 unsigned long rate_last; 39 - unsigned long pmtu_expires; 40 - u32 pmtu_orig; 41 - u32 pmtu_learned; 42 - struct inetpeer_addr_base redirect_learned; 43 39 union { 44 40 struct list_head gc_list; 45 41 struct rcu_head gc_rcu; 46 42 }; 47 43 /* 48 44 * Once inet_peer is queued for deletion (refcnt == -1), following fields 49 - * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp 45 + * are not available: rid, ip_id_count 50 46 * We can share memory with rcu_head to help keep inet_peer small. 51 47 */ 52 48 union { 53 49 struct { 54 50 atomic_t rid; /* Frag reception counter */ 55 51 atomic_t ip_id_count; /* IP ID for the next packet */ 56 - __u32 tcp_ts; 57 - __u32 tcp_ts_stamp; 58 52 }; 59 53 struct rcu_head rcu; 60 54 struct inet_peer *gc_next;
+3
include/net/netns/ipv4.h
··· 7 7 8 8 #include <net/inet_frag.h> 9 9 10 + struct tcpm_hash_bucket; 10 11 struct ctl_table_header; 11 12 struct ipv4_devconf; 12 13 struct fib_rules_ops; ··· 40 39 struct sock **icmp_sk; 41 40 struct sock *tcp_sock; 42 41 struct inet_peer_base *peers; 42 + struct tcpm_hash_bucket *tcp_metrics_hash; 43 + unsigned int tcp_metrics_hash_mask; 43 44 struct netns_frags frags; 44 45 #ifdef CONFIG_NETFILTER 45 46 struct xt_table *iptable_filter;
+1 -60
include/net/route.h
··· 40 40 #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) 41 41 42 42 struct fib_nh; 43 - struct inet_peer; 44 43 struct fib_info; 45 44 struct rtable { 46 45 struct dst_entry dst; ··· 64 65 __be32 rt_gateway; 65 66 66 67 /* Miscellaneous cached information */ 67 - u32 rt_peer_genid; 68 - unsigned long _peer; /* long-living peer info */ 68 + u32 rt_pmtu; 69 69 struct fib_info *fi; /* for client ref to shared metrics */ 70 70 }; 71 - 72 - static inline struct inet_peer *rt_peer_ptr(struct rtable *rt) 73 - { 74 - return inetpeer_ptr(rt->_peer); 75 - } 76 - 77 - static inline bool rt_has_peer(struct rtable *rt) 78 - { 79 - return inetpeer_ptr_is_peer(rt->_peer); 80 - } 81 - 82 - static inline void __rt_set_peer(struct rtable *rt, struct inet_peer *peer) 83 - { 84 - __inetpeer_ptr_set_peer(&rt->_peer, peer); 85 - } 86 - 87 - static inline bool rt_set_peer(struct rtable *rt, struct inet_peer *peer) 88 - { 89 - return inetpeer_ptr_set_peer(&rt->_peer, peer); 90 - } 91 - 92 - static inline void rt_init_peer(struct rtable *rt, struct inet_peer_base *base) 93 - { 94 - inetpeer_init_ptr(&rt->_peer, base); 95 - } 96 - 97 - static inline void rt_transfer_peer(struct rtable *rt, struct rtable *ort) 98 - { 99 - rt->_peer = ort->_peer; 100 - if (rt_has_peer(ort)) { 101 - struct inet_peer *peer = rt_peer_ptr(ort); 102 - atomic_inc(&peer->refcnt); 103 - } 104 - } 105 71 106 72 static inline bool rt_is_input_route(const struct rtable *rt) 107 73 { ··· 242 278 243 279 if (inet_sk(sk)->transparent) 244 280 flow_flags |= FLOWI_FLAG_ANYSRC; 245 - if (protocol == IPPROTO_TCP) 246 - flow_flags |= FLOWI_FLAG_PRECOW_METRICS; 247 281 if (can_sleep) 248 282 flow_flags |= FLOWI_FLAG_CAN_SLEEP; 249 283 ··· 288 326 return ip_route_output_flow(sock_net(sk), fl4, sk); 289 327 } 290 328 return rt; 291 - } 292 - 293 - extern void rt_bind_peer(struct rtable *rt, __be32 daddr, int create); 294 - 295 - static inline struct inet_peer *__rt_get_peer(struct rtable 
*rt, __be32 daddr, int create) 296 - { 297 - if (rt_has_peer(rt)) 298 - return rt_peer_ptr(rt); 299 - 300 - rt_bind_peer(rt, daddr, create); 301 - return (rt_has_peer(rt) ? rt_peer_ptr(rt) : NULL); 302 - } 303 - 304 - static inline struct inet_peer *rt_get_peer(struct rtable *rt, __be32 daddr) 305 - { 306 - return __rt_get_peer(rt, daddr, 0); 307 - } 308 - 309 - static inline struct inet_peer *rt_get_peer_create(struct rtable *rt, __be32 daddr) 310 - { 311 - return __rt_get_peer(rt, daddr, 1); 312 329 } 313 330 314 331 static inline int inet_iif(const struct sk_buff *skb)
+9
include/net/tcp.h
··· 388 388 extern void tcp_enter_loss(struct sock *sk, int how); 389 389 extern void tcp_clear_retrans(struct tcp_sock *tp); 390 390 extern void tcp_update_metrics(struct sock *sk); 391 + extern void tcp_init_metrics(struct sock *sk); 392 + extern void tcp_metrics_init(void); 393 + extern bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check); 394 + extern bool tcp_remember_stamp(struct sock *sk); 395 + extern bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw); 396 + extern void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst); 397 + extern void tcp_disable_fack(struct tcp_sock *tp); 391 398 extern void tcp_close(struct sock *sk, long timeout); 392 399 extern void tcp_init_sock(struct sock *sk); 393 400 extern unsigned int tcp_poll(struct file * file, struct socket *sock, ··· 562 555 { 563 556 return (tp->srtt >> 3) + tp->rttvar; 564 557 } 558 + 559 + extern void tcp_set_rto(struct sock *sk); 565 560 566 561 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) 567 562 {
+1 -3
net/core/rtnetlink.c
··· 615 615 EXPORT_SYMBOL(rtnetlink_put_metrics); 616 616 617 617 int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, 618 - u32 ts, u32 tsage, long expires, u32 error) 618 + long expires, u32 error) 619 619 { 620 620 struct rta_cacheinfo ci = { 621 621 .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), ··· 623 623 .rta_clntref = atomic_read(&(dst->__refcnt)), 624 624 .rta_error = error, 625 625 .rta_id = id, 626 - .rta_ts = ts, 627 - .rta_tsage = tsage, 628 626 }; 629 627 630 628 if (expires)
+6 -7
net/decnet/dn_route.c
··· 1590 1590 goto errout; 1591 1591 1592 1592 expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; 1593 - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0, expires, 1593 + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, 1594 1594 rt->dst.error) < 0) 1595 1595 goto errout; 1596 1596 ··· 1812 1812 char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN]; 1813 1813 1814 1814 seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n", 1815 - rt->dst.dev ? rt->dst.dev->name : "*", 1816 - dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1), 1817 - dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2), 1818 - atomic_read(&rt->dst.__refcnt), 1819 - rt->dst.__use, 1820 - (int) dst_metric(&rt->dst, RTAX_RTT)); 1815 + rt->dst.dev ? rt->dst.dev->name : "*", 1816 + dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1), 1817 + dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2), 1818 + atomic_read(&rt->dst.__refcnt), 1819 + rt->dst.__use, 0); 1821 1820 return 0; 1822 1821 } 1823 1822
+1 -1
net/ipv4/Makefile
··· 7 7 ip_output.o ip_sockglue.o inet_hashtables.o \ 8 8 inet_timewait_sock.o inet_connection_sock.o \ 9 9 tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ 10 - tcp_minisocks.o tcp_cong.o \ 10 + tcp_minisocks.o tcp_cong.o tcp_metrics.o \ 11 11 datagram.o raw.o udp.o udplite.o \ 12 12 arp.o icmp.o devinet.o af_inet.o igmp.o \ 13 13 fib_frontend.o fib_semantics.o fib_trie.o \
+2
net/ipv4/fib_semantics.c
··· 794 794 val = nla_get_u32(nla); 795 795 if (type == RTAX_ADVMSS && val > 65535 - 40) 796 796 val = 65535 - 40; 797 + if (type == RTAX_MTU && val > 65535 - 15) 798 + val = 65535 - 15; 797 799 fi->fib_metrics[type - 1] = val; 798 800 } 799 801 }
+2 -1
net/ipv4/icmp.c
··· 254 254 255 255 /* Limit if icmp type is enabled in ratemask. */ 256 256 if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { 257 - struct inet_peer *peer = rt_get_peer_create(rt, fl4->daddr); 257 + struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); 258 258 rc = inet_peer_xrlim_allow(peer, 259 259 net->ipv4.sysctl_icmp_ratelimit); 260 + inet_putpeer(peer); 260 261 } 261 262 out: 262 263 return rc;
+1 -1
net/ipv4/inet_connection_sock.c
··· 375 375 const struct inet_request_sock *ireq = inet_rsk(req); 376 376 struct ip_options_rcu *opt = inet_rsk(req)->opt; 377 377 struct net *net = sock_net(sk); 378 - int flags = inet_sk_flowi_flags(sk) & ~FLOWI_FLAG_PRECOW_METRICS; 378 + int flags = inet_sk_flowi_flags(sk); 379 379 380 380 if (nocache) 381 381 flags |= FLOWI_FLAG_RT_NOCACHE;
-4
net/ipv4/inetpeer.c
··· 508 508 (daddr->family == AF_INET) ? 509 509 secure_ip_id(daddr->addr.a4) : 510 510 secure_ipv6_id(daddr->addr.a6)); 511 - p->tcp_ts_stamp = 0; 512 511 p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; 513 512 p->rate_tokens = 0; 514 513 p->rate_last = 0; 515 - p->pmtu_expires = 0; 516 - p->pmtu_orig = 0; 517 - memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); 518 514 INIT_LIST_HEAD(&p->gc_list); 519 515 520 516 /* Link the node. */
+74 -275
net/ipv4/route.c
··· 158 158 159 159 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) 160 160 { 161 - struct rtable *rt = (struct rtable *) dst; 162 - struct inet_peer *peer; 163 - u32 *p = NULL; 164 - 165 - peer = rt_get_peer_create(rt, rt->rt_dst); 166 - if (peer) { 167 - u32 *old_p = __DST_METRICS_PTR(old); 168 - unsigned long prev, new; 169 - 170 - p = peer->metrics; 171 - if (inet_metrics_new(peer)) 172 - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); 173 - 174 - new = (unsigned long) p; 175 - prev = cmpxchg(&dst->_metrics, old, new); 176 - 177 - if (prev != old) { 178 - p = __DST_METRICS_PTR(prev); 179 - if (prev & DST_METRICS_READ_ONLY) 180 - p = NULL; 181 - } else { 182 - if (rt->fi) { 183 - fib_info_put(rt->fi); 184 - rt->fi = NULL; 185 - } 186 - } 187 - } 188 - return p; 161 + WARN_ON(1); 162 + return NULL; 189 163 } 190 164 191 165 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, ··· 397 423 int len; 398 424 399 425 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t" 400 - "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", 401 - r->dst.dev ? r->dst.dev->name : "*", 402 - (__force u32)r->rt_dst, 403 - (__force u32)r->rt_gateway, 404 - r->rt_flags, atomic_read(&r->dst.__refcnt), 405 - r->dst.__use, 0, (__force u32)r->rt_src, 406 - dst_metric_advmss(&r->dst) + 40, 407 - dst_metric(&r->dst, RTAX_WINDOW), 408 - (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + 409 - dst_metric(&r->dst, RTAX_RTTVAR)), 410 - r->rt_key_tos, 411 - -1, 0, 0, &len); 426 + "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n", 427 + r->dst.dev ? 
r->dst.dev->name : "*", 428 + (__force u32)r->rt_dst, 429 + (__force u32)r->rt_gateway, 430 + r->rt_flags, atomic_read(&r->dst.__refcnt), 431 + r->dst.__use, 0, (__force u32)r->rt_src, 432 + dst_metric_advmss(&r->dst) + 40, 433 + dst_metric(&r->dst, RTAX_WINDOW), 0, 434 + r->rt_key_tos, 435 + -1, 0, 0, &len); 412 436 413 437 seq_printf(seq, "%*s\n", 127 - len, ""); 414 438 } ··· 643 671 static inline int rt_valuable(struct rtable *rth) 644 672 { 645 673 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 646 - (rt_has_peer(rth) && rt_peer_ptr(rth)->pmtu_expires); 674 + rth->dst.expires; 647 675 } 648 676 649 677 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) ··· 889 917 890 918 get_random_bytes(&shuffle, sizeof(shuffle)); 891 919 atomic_add(shuffle + 1U, &net->ipv4.rt_genid); 892 - inetpeer_invalidate_family(AF_INET); 893 920 } 894 921 895 922 /* ··· 1215 1244 return rt; 1216 1245 } 1217 1246 1218 - static atomic_t __rt_peer_genid = ATOMIC_INIT(0); 1219 - 1220 - static u32 rt_peer_genid(void) 1221 - { 1222 - return atomic_read(&__rt_peer_genid); 1223 - } 1224 - 1225 - void rt_bind_peer(struct rtable *rt, __be32 daddr, int create) 1226 - { 1227 - struct inet_peer_base *base; 1228 - struct inet_peer *peer; 1229 - 1230 - base = inetpeer_base_ptr(rt->_peer); 1231 - if (!base) 1232 - return; 1233 - 1234 - peer = inet_getpeer_v4(base, daddr, create); 1235 - if (peer) { 1236 - if (!rt_set_peer(rt, peer)) 1237 - inet_putpeer(peer); 1238 - else 1239 - rt->rt_peer_genid = rt_peer_genid(); 1240 - } 1241 - } 1242 - 1243 1247 /* 1244 1248 * Peer allocation may fail only in serious out-of-memory conditions. However 1245 1249 * we still can generate some output. 
··· 1237 1291 1238 1292 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 1239 1293 { 1240 - struct rtable *rt = (struct rtable *) dst; 1294 + struct net *net = dev_net(dst->dev); 1295 + struct inet_peer *peer; 1241 1296 1242 - if (rt && !(rt->dst.flags & DST_NOPEER)) { 1243 - struct inet_peer *peer = rt_get_peer_create(rt, rt->rt_dst); 1244 - 1245 - /* If peer is attached to destination, it is never detached, 1246 - so that we need not to grab a lock to dereference it. 1247 - */ 1248 - if (peer) { 1249 - iph->id = htons(inet_getid(peer, more)); 1250 - return; 1251 - } 1252 - } else if (!rt) 1253 - pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0)); 1297 + peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1); 1298 + if (peer) { 1299 + iph->id = htons(inet_getid(peer, more)); 1300 + inet_putpeer(peer); 1301 + return; 1302 + } 1254 1303 1255 1304 ip_select_fb_ident(iph); 1256 1305 } ··· 1271 1330 spin_unlock_bh(rt_hash_lock_addr(hash)); 1272 1331 } 1273 1332 1274 - static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) 1275 - { 1276 - struct rtable *rt = (struct rtable *) dst; 1277 - __be32 orig_gw = rt->rt_gateway; 1278 - struct neighbour *n; 1279 - 1280 - dst_confirm(&rt->dst); 1281 - 1282 - rt->rt_gateway = peer->redirect_learned.a4; 1283 - 1284 - n = ipv4_neigh_lookup(&rt->dst, NULL, &rt->rt_gateway); 1285 - if (!n) { 1286 - rt->rt_gateway = orig_gw; 1287 - return; 1288 - } 1289 - if (!(n->nud_state & NUD_VALID)) { 1290 - neigh_event_send(n, NULL); 1291 - } else { 1292 - rt->rt_flags |= RTCF_REDIRECTED; 1293 - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); 1294 - } 1295 - neigh_release(n); 1296 - } 1297 - 1298 1333 /* called in rcu_read_lock() section */ 1299 1334 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, 1300 1335 __be32 saddr, struct net_device *dev) ··· 1279 1362 struct in_device *in_dev = __in_dev_get_rcu(dev); 1280 1363 __be32 skeys[2] = { saddr, 0 }; 1281 1364 int ikeys[2] 
= { dev->ifindex, 0 }; 1282 - struct inet_peer *peer; 1283 1365 struct net *net; 1284 1366 1285 1367 if (!in_dev) ··· 1311 1395 rthp = &rt_hash_table[hash].chain; 1312 1396 1313 1397 while ((rt = rcu_dereference(*rthp)) != NULL) { 1398 + struct neighbour *n; 1399 + 1314 1400 rthp = &rt->dst.rt_next; 1315 1401 1316 1402 if (rt->rt_key_dst != daddr || ··· 1326 1408 rt->rt_gateway != old_gw) 1327 1409 continue; 1328 1410 1329 - peer = rt_get_peer_create(rt, rt->rt_dst); 1330 - if (peer) { 1331 - if (peer->redirect_learned.a4 != new_gw) { 1332 - peer->redirect_learned.a4 = new_gw; 1333 - atomic_inc(&__rt_peer_genid); 1411 + n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); 1412 + if (n) { 1413 + if (!(n->nud_state & NUD_VALID)) { 1414 + neigh_event_send(n, NULL); 1415 + } else { 1416 + rt->rt_gateway = new_gw; 1417 + rt->rt_flags |= RTCF_REDIRECTED; 1418 + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); 1334 1419 } 1335 - check_peer_redir(&rt->dst, peer); 1420 + neigh_release(n); 1336 1421 } 1337 1422 } 1338 1423 } ··· 1353 1432 ; 1354 1433 } 1355 1434 1356 - static bool peer_pmtu_expired(struct inet_peer *peer) 1357 - { 1358 - unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); 1359 - 1360 - return orig && 1361 - time_after_eq(jiffies, orig) && 1362 - cmpxchg(&peer->pmtu_expires, orig, 0) == orig; 1363 - } 1364 - 1365 - static bool peer_pmtu_cleaned(struct inet_peer *peer) 1366 - { 1367 - unsigned long orig = ACCESS_ONCE(peer->pmtu_expires); 1368 - 1369 - return orig && 1370 - cmpxchg(&peer->pmtu_expires, orig, 0) == orig; 1371 - } 1372 - 1373 1435 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1374 1436 { 1375 1437 struct rtable *rt = (struct rtable *)dst; ··· 1362 1458 if (dst->obsolete > 0) { 1363 1459 ip_rt_put(rt); 1364 1460 ret = NULL; 1365 - } else if (rt->rt_flags & RTCF_REDIRECTED) { 1461 + } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1462 + rt->dst.expires) { 1366 1463 unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, 
1367 1464 rt->rt_oif, 1368 1465 rt_genid(dev_net(dst->dev))); 1369 1466 rt_del(hash, rt); 1370 1467 ret = NULL; 1371 - } else if (rt_has_peer(rt)) { 1372 - struct inet_peer *peer = rt_peer_ptr(rt); 1373 - if (peer_pmtu_expired(peer)) 1374 - dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); 1375 1468 } 1376 1469 } 1377 1470 return ret; ··· 1395 1494 struct rtable *rt = skb_rtable(skb); 1396 1495 struct in_device *in_dev; 1397 1496 struct inet_peer *peer; 1497 + struct net *net; 1398 1498 int log_martians; 1399 1499 1400 1500 rcu_read_lock(); ··· 1407 1505 log_martians = IN_DEV_LOG_MARTIANS(in_dev); 1408 1506 rcu_read_unlock(); 1409 1507 1410 - peer = rt_get_peer_create(rt, rt->rt_dst); 1508 + net = dev_net(rt->dst.dev); 1509 + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); 1411 1510 if (!peer) { 1412 1511 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1413 1512 return; ··· 1425 1522 */ 1426 1523 if (peer->rate_tokens >= ip_rt_redirect_number) { 1427 1524 peer->rate_last = jiffies; 1428 - return; 1525 + goto out_put_peer; 1429 1526 } 1430 1527 1431 1528 /* Check for load limit; set rate_last to the latest sent ··· 1446 1543 &rt->rt_dst, &rt->rt_gateway); 1447 1544 #endif 1448 1545 } 1546 + out_put_peer: 1547 + inet_putpeer(peer); 1449 1548 } 1450 1549 1451 1550 static int ip_error(struct sk_buff *skb) ··· 1490 1585 break; 1491 1586 } 1492 1587 1493 - peer = rt_get_peer_create(rt, rt->rt_dst); 1588 + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); 1494 1589 1495 1590 send = true; 1496 1591 if (peer) { ··· 1503 1598 peer->rate_tokens -= ip_rt_error_cost; 1504 1599 else 1505 1600 send = false; 1601 + inet_putpeer(peer); 1506 1602 } 1507 1603 if (send) 1508 1604 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); ··· 1512 1606 return 0; 1513 1607 } 1514 1608 1515 - static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) 1516 - { 1517 - unsigned long expires = ACCESS_ONCE(peer->pmtu_expires); 1518 - 1519 - if 
(!expires) 1520 - return; 1521 - if (time_before(jiffies, expires)) { 1522 - u32 orig_dst_mtu = dst_mtu(dst); 1523 - if (peer->pmtu_learned < orig_dst_mtu) { 1524 - if (!peer->pmtu_orig) 1525 - peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); 1526 - dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); 1527 - } 1528 - } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) 1529 - dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); 1530 - } 1531 - 1532 1609 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1533 1610 { 1534 1611 struct rtable *rt = (struct rtable *) dst; 1535 - struct inet_peer *peer; 1536 1612 1537 1613 dst_confirm(dst); 1538 1614 1539 - peer = rt_get_peer_create(rt, rt->rt_dst); 1540 - if (peer) { 1541 - unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires); 1615 + if (mtu < ip_rt_min_pmtu) 1616 + mtu = ip_rt_min_pmtu; 1542 1617 1543 - if (mtu < ip_rt_min_pmtu) 1544 - mtu = ip_rt_min_pmtu; 1545 - if (!pmtu_expires || mtu < peer->pmtu_learned) { 1546 - 1547 - pmtu_expires = jiffies + ip_rt_mtu_expires; 1548 - if (!pmtu_expires) 1549 - pmtu_expires = 1UL; 1550 - 1551 - peer->pmtu_learned = mtu; 1552 - peer->pmtu_expires = pmtu_expires; 1553 - 1554 - atomic_inc(&__rt_peer_genid); 1555 - rt->rt_peer_genid = rt_peer_genid(); 1556 - } 1557 - check_peer_pmtu(dst, peer); 1558 - } 1618 + rt->rt_pmtu = mtu; 1619 + dst_set_expires(&rt->dst, ip_rt_mtu_expires); 1559 1620 } 1560 1621 1561 1622 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, ··· 1533 1660 struct rtable *rt; 1534 1661 1535 1662 flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, 1536 - protocol, flow_flags | FLOWI_FLAG_PRECOW_METRICS, 1663 + protocol, flow_flags, 1537 1664 iph->daddr, iph->saddr, 0, 0); 1538 1665 rt = __ip_route_output_key(net, &fl4); 1539 1666 if (!IS_ERR(rt)) { ··· 1554 1681 } 1555 1682 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); 1556 1683 1557 - static void ipv4_validate_peer(struct rtable *rt) 1558 - { 1559 - if 
(rt->rt_peer_genid != rt_peer_genid()) { 1560 - struct inet_peer *peer = rt_get_peer(rt, rt->rt_dst); 1561 - 1562 - if (peer) { 1563 - check_peer_pmtu(&rt->dst, peer); 1564 - 1565 - if (peer->redirect_learned.a4 && 1566 - peer->redirect_learned.a4 != rt->rt_gateway) 1567 - check_peer_redir(&rt->dst, peer); 1568 - } 1569 - 1570 - rt->rt_peer_genid = rt_peer_genid(); 1571 - } 1572 - } 1573 - 1574 1684 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1575 1685 { 1576 1686 struct rtable *rt = (struct rtable *) dst; 1577 1687 1578 1688 if (rt_is_expired(rt)) 1579 1689 return NULL; 1580 - ipv4_validate_peer(rt); 1581 1690 return dst; 1582 1691 } 1583 1692 ··· 1571 1716 fib_info_put(rt->fi); 1572 1717 rt->fi = NULL; 1573 1718 } 1574 - if (rt_has_peer(rt)) { 1575 - struct inet_peer *peer = rt_peer_ptr(rt); 1576 - inet_putpeer(peer); 1577 - } 1578 1719 } 1579 1720 1580 1721 ··· 1581 1730 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1582 1731 1583 1732 rt = skb_rtable(skb); 1584 - if (rt && rt_has_peer(rt)) { 1585 - struct inet_peer *peer = rt_peer_ptr(rt); 1586 - if (peer_pmtu_cleaned(peer)) 1587 - dst_metric_set(&rt->dst, RTAX_MTU, peer->pmtu_orig); 1588 - } 1733 + if (rt) 1734 + dst_set_expires(&rt->dst, 0); 1589 1735 } 1590 1736 1591 1737 static int ip_rt_bug(struct sk_buff *skb) ··· 1662 1814 static unsigned int ipv4_mtu(const struct dst_entry *dst) 1663 1815 { 1664 1816 const struct rtable *rt = (const struct rtable *) dst; 1665 - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 1817 + unsigned int mtu = rt->rt_pmtu; 1818 + 1819 + if (mtu && time_after_eq(jiffies, rt->dst.expires)) 1820 + mtu = 0; 1821 + 1822 + if (!mtu) 1823 + mtu = dst_metric_raw(dst, RTAX_MTU); 1666 1824 1667 1825 if (mtu && rt_is_output_route(rt)) 1668 1826 return mtu; ··· 1690 1836 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, 1691 1837 struct fib_info *fi) 1692 1838 { 1693 - struct inet_peer_base *base; 1694 - struct inet_peer 
*peer; 1695 - int create = 0; 1696 - 1697 - /* If a peer entry exists for this destination, we must hook 1698 - * it up in order to get at cached metrics. 1699 - */ 1700 - if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) 1701 - create = 1; 1702 - 1703 - base = inetpeer_base_ptr(rt->_peer); 1704 - BUG_ON(!base); 1705 - 1706 - peer = inet_getpeer_v4(base, rt->rt_dst, create); 1707 - if (peer) { 1708 - __rt_set_peer(rt, peer); 1709 - rt->rt_peer_genid = rt_peer_genid(); 1710 - if (inet_metrics_new(peer)) 1711 - memcpy(peer->metrics, fi->fib_metrics, 1712 - sizeof(u32) * RTAX_MAX); 1713 - dst_init_metrics(&rt->dst, peer->metrics, false); 1714 - 1715 - check_peer_pmtu(&rt->dst, peer); 1716 - 1717 - if (peer->redirect_learned.a4 && 1718 - peer->redirect_learned.a4 != rt->rt_gateway) { 1719 - rt->rt_gateway = peer->redirect_learned.a4; 1720 - rt->rt_flags |= RTCF_REDIRECTED; 1721 - } 1722 - } else { 1723 - if (fi->fib_metrics != (u32 *) dst_default_metrics) { 1724 - rt->fi = fi; 1725 - atomic_inc(&fi->fib_clntref); 1726 - } 1727 - dst_init_metrics(&rt->dst, fi->fib_metrics, true); 1839 + if (fi->fib_metrics != (u32 *) dst_default_metrics) { 1840 + rt->fi = fi; 1841 + atomic_inc(&fi->fib_clntref); 1728 1842 } 1843 + dst_init_metrics(&rt->dst, fi->fib_metrics, true); 1729 1844 } 1730 1845 1731 1846 static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4, 1732 1847 const struct fib_result *res, 1733 1848 struct fib_info *fi, u16 type, u32 itag) 1734 1849 { 1735 - struct dst_entry *dst = &rt->dst; 1736 - 1737 1850 if (fi) { 1738 1851 if (FIB_RES_GW(*res) && 1739 1852 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1740 1853 rt->rt_gateway = FIB_RES_GW(*res); 1741 1854 rt_init_metrics(rt, fl4, fi); 1742 1855 #ifdef CONFIG_IP_ROUTE_CLASSID 1743 - dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1856 + rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid; 1744 1857 #endif 1745 1858 } 1746 - 1747 - if (dst_mtu(dst) > IP_MAX_MTU) 1748 - dst_metric_set(dst, RTAX_MTU, 
IP_MAX_MTU); 1749 1859 1750 1860 #ifdef CONFIG_IP_ROUTE_CLASSID 1751 1861 #ifdef CONFIG_IP_MULTIPLE_TABLES ··· 1782 1964 rth->rt_iif = dev->ifindex; 1783 1965 rth->rt_oif = 0; 1784 1966 rth->rt_mark = skb->mark; 1967 + rth->rt_pmtu = 0; 1785 1968 rth->rt_gateway = daddr; 1786 - rth->rt_peer_genid = 0; 1787 - rt_init_peer(rth, dev_net(dev)->ipv4.peers); 1788 1969 rth->fi = NULL; 1789 1970 if (our) { 1790 1971 rth->dst.input= ip_local_deliver; ··· 1907 2090 rth->rt_iif = in_dev->dev->ifindex; 1908 2091 rth->rt_oif = 0; 1909 2092 rth->rt_mark = skb->mark; 2093 + rth->rt_pmtu = 0; 1910 2094 rth->rt_gateway = daddr; 1911 - rth->rt_peer_genid = 0; 1912 - rt_init_peer(rth, &res->table->tb_peers); 1913 2095 rth->fi = NULL; 1914 2096 1915 2097 rth->dst.input = ip_forward; ··· 2085 2269 rth->rt_iif = dev->ifindex; 2086 2270 rth->rt_oif = 0; 2087 2271 rth->rt_mark = skb->mark; 2272 + rth->rt_pmtu = 0; 2088 2273 rth->rt_gateway = daddr; 2089 - rth->rt_peer_genid = 0; 2090 - rt_init_peer(rth, net->ipv4.peers); 2091 2274 rth->fi = NULL; 2092 2275 if (res.type == RTN_UNREACHABLE) { 2093 2276 rth->dst.input= ip_error; ··· 2161 2346 rth->rt_mark == skb->mark && 2162 2347 net_eq(dev_net(rth->dst.dev), net) && 2163 2348 !rt_is_expired(rth)) { 2164 - ipv4_validate_peer(rth); 2165 2349 if (noref) { 2166 2350 dst_use_noref(&rth->dst, jiffies); 2167 2351 skb_dst_set_noref(skb, &rth->dst); ··· 2282 2468 rth->rt_iif = orig_oif ? : dev_out->ifindex; 2283 2469 rth->rt_oif = orig_oif; 2284 2470 rth->rt_mark = fl4->flowi4_mark; 2471 + rth->rt_pmtu = 0; 2285 2472 rth->rt_gateway = fl4->daddr; 2286 - rth->rt_peer_genid = 0; 2287 - rt_init_peer(rth, (res->table ? 
2288 - &res->table->tb_peers : 2289 - dev_net(dev_out)->ipv4.peers)); 2290 2473 rth->fi = NULL; 2291 2474 2292 2475 RT_CACHE_STAT_INC(out_slow_tot); ··· 2537 2726 (IPTOS_RT_MASK | RTO_ONLINK)) && 2538 2727 net_eq(dev_net(rth->dst.dev), net) && 2539 2728 !rt_is_expired(rth)) { 2540 - ipv4_validate_peer(rth); 2541 2729 dst_use(&rth->dst, jiffies); 2542 2730 RT_CACHE_STAT_INC(out_hit); 2543 2731 rcu_read_unlock_bh(); ··· 2600 2790 new->__use = 1; 2601 2791 new->input = dst_discard; 2602 2792 new->output = dst_discard; 2603 - dst_copy_metrics(new, &ort->dst); 2604 2793 2605 2794 new->dev = ort->dst.dev; 2606 2795 if (new->dev) ··· 2612 2803 rt->rt_iif = ort->rt_iif; 2613 2804 rt->rt_oif = ort->rt_oif; 2614 2805 rt->rt_mark = ort->rt_mark; 2806 + rt->rt_pmtu = ort->rt_pmtu; 2615 2807 2616 2808 rt->rt_genid = rt_genid(net); 2617 2809 rt->rt_flags = ort->rt_flags; ··· 2620 2810 rt->rt_dst = ort->rt_dst; 2621 2811 rt->rt_src = ort->rt_src; 2622 2812 rt->rt_gateway = ort->rt_gateway; 2623 - rt_transfer_peer(rt, ort); 2624 2813 rt->fi = ort->fi; 2625 2814 if (rt->fi) 2626 2815 atomic_inc(&rt->fi->fib_clntref); ··· 2657 2848 struct rtmsg *r; 2658 2849 struct nlmsghdr *nlh; 2659 2850 unsigned long expires = 0; 2660 - u32 id = 0, ts = 0, tsage = 0, error; 2851 + u32 error; 2661 2852 2662 2853 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2663 2854 if (nlh == NULL) ··· 2710 2901 goto nla_put_failure; 2711 2902 2712 2903 error = rt->dst.error; 2713 - if (rt_has_peer(rt)) { 2714 - const struct inet_peer *peer = rt_peer_ptr(rt); 2715 - inet_peer_refcheck(peer); 2716 - id = atomic_read(&peer->ip_id_count) & 0xffff; 2717 - if (peer->tcp_ts_stamp) { 2718 - ts = peer->tcp_ts; 2719 - tsage = get_seconds() - peer->tcp_ts_stamp; 2720 - } 2721 - expires = ACCESS_ONCE(peer->pmtu_expires); 2722 - if (expires) { 2723 - if (time_before(jiffies, expires)) 2724 - expires -= jiffies; 2725 - else 2726 - expires = 0; 2727 - } 2904 + expires = rt->dst.expires; 2905 + if (expires) { 2906 
+ if (time_before(jiffies, expires)) 2907 + expires -= jiffies; 2908 + else 2909 + expires = 0; 2728 2910 } 2729 2911 2730 2912 if (rt_is_input_route(rt)) { ··· 2744 2944 goto nla_put_failure; 2745 2945 } 2746 2946 2747 - if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 2748 - expires, error) < 0) 2947 + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2749 2948 goto nla_put_failure; 2750 2949 2751 2950 return nlmsg_end(skb, nlh);
+2
net/ipv4/tcp.c
··· 3563 3563 pr_info("Hash tables configured (established %u bind %u)\n", 3564 3564 tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); 3565 3565 3566 + tcp_metrics_init(); 3567 + 3566 3568 tcp_register_congestion_control(&tcp_reno); 3567 3569 3568 3570 memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+2 -186
net/ipv4/tcp_input.c
··· 93 93 int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 94 94 int sysctl_tcp_frto __read_mostly = 2; 95 95 int sysctl_tcp_frto_response __read_mostly; 96 - int sysctl_tcp_nometrics_save __read_mostly; 97 96 98 97 int sysctl_tcp_thin_dupack __read_mostly; 99 98 ··· 700 701 /* Calculate rto without backoff. This is the second half of Van Jacobson's 701 702 * routine referred to above. 702 703 */ 703 - static inline void tcp_set_rto(struct sock *sk) 704 + void tcp_set_rto(struct sock *sk) 704 705 { 705 706 const struct tcp_sock *tp = tcp_sk(sk); 706 707 /* Old crap is replaced with new one. 8) ··· 725 726 * guarantees that rto is higher. 726 727 */ 727 728 tcp_bound_rto(sk); 728 - } 729 - 730 - /* Save metrics learned by this TCP session. 731 - This function is called only, when TCP finishes successfully 732 - i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. 733 - */ 734 - void tcp_update_metrics(struct sock *sk) 735 - { 736 - struct tcp_sock *tp = tcp_sk(sk); 737 - struct dst_entry *dst = __sk_dst_get(sk); 738 - 739 - if (sysctl_tcp_nometrics_save) 740 - return; 741 - 742 - if (dst && (dst->flags & DST_HOST)) { 743 - const struct inet_connection_sock *icsk = inet_csk(sk); 744 - int m; 745 - unsigned long rtt; 746 - 747 - dst_confirm(dst); 748 - 749 - if (icsk->icsk_backoff || !tp->srtt) { 750 - /* This session failed to estimate rtt. Why? 751 - * Probably, no packets returned in time. 752 - * Reset our results. 753 - */ 754 - if (!(dst_metric_locked(dst, RTAX_RTT))) 755 - dst_metric_set(dst, RTAX_RTT, 0); 756 - return; 757 - } 758 - 759 - rtt = dst_metric_rtt(dst, RTAX_RTT); 760 - m = rtt - tp->srtt; 761 - 762 - /* If newly calculated rtt larger than stored one, 763 - * store new one. Otherwise, use EWMA. Remember, 764 - * rtt overestimation is always better than underestimation. 
765 - */ 766 - if (!(dst_metric_locked(dst, RTAX_RTT))) { 767 - if (m <= 0) 768 - set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt); 769 - else 770 - set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3)); 771 - } 772 - 773 - if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { 774 - unsigned long var; 775 - if (m < 0) 776 - m = -m; 777 - 778 - /* Scale deviation to rttvar fixed point */ 779 - m >>= 1; 780 - if (m < tp->mdev) 781 - m = tp->mdev; 782 - 783 - var = dst_metric_rtt(dst, RTAX_RTTVAR); 784 - if (m >= var) 785 - var = m; 786 - else 787 - var -= (var - m) >> 2; 788 - 789 - set_dst_metric_rtt(dst, RTAX_RTTVAR, var); 790 - } 791 - 792 - if (tcp_in_initial_slowstart(tp)) { 793 - /* Slow start still did not finish. */ 794 - if (dst_metric(dst, RTAX_SSTHRESH) && 795 - !dst_metric_locked(dst, RTAX_SSTHRESH) && 796 - (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) 797 - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1); 798 - if (!dst_metric_locked(dst, RTAX_CWND) && 799 - tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) 800 - dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd); 801 - } else if (tp->snd_cwnd > tp->snd_ssthresh && 802 - icsk->icsk_ca_state == TCP_CA_Open) { 803 - /* Cong. avoidance phase, cwnd is reliable. */ 804 - if (!dst_metric_locked(dst, RTAX_SSTHRESH)) 805 - dst_metric_set(dst, RTAX_SSTHRESH, 806 - max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); 807 - if (!dst_metric_locked(dst, RTAX_CWND)) 808 - dst_metric_set(dst, RTAX_CWND, 809 - (dst_metric(dst, RTAX_CWND) + 810 - tp->snd_cwnd) >> 1); 811 - } else { 812 - /* Else slow start did not finish, cwnd is non-sense, 813 - ssthresh may be also invalid. 
814 - */ 815 - if (!dst_metric_locked(dst, RTAX_CWND)) 816 - dst_metric_set(dst, RTAX_CWND, 817 - (dst_metric(dst, RTAX_CWND) + 818 - tp->snd_ssthresh) >> 1); 819 - if (dst_metric(dst, RTAX_SSTHRESH) && 820 - !dst_metric_locked(dst, RTAX_SSTHRESH) && 821 - tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH)) 822 - dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh); 823 - } 824 - 825 - if (!dst_metric_locked(dst, RTAX_REORDERING)) { 826 - if (dst_metric(dst, RTAX_REORDERING) < tp->reordering && 827 - tp->reordering != sysctl_tcp_reordering) 828 - dst_metric_set(dst, RTAX_REORDERING, tp->reordering); 829 - } 830 - } 831 729 } 832 730 833 731 __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) ··· 763 867 * Packet counting of FACK is based on in-order assumptions, therefore TCP 764 868 * disables it when reordering is detected 765 869 */ 766 - static void tcp_disable_fack(struct tcp_sock *tp) 870 + void tcp_disable_fack(struct tcp_sock *tp) 767 871 { 768 872 /* RFC3517 uses different metric in lost marker => reset on change */ 769 873 if (tcp_is_fack(tp)) ··· 775 879 static void tcp_dsack_seen(struct tcp_sock *tp) 776 880 { 777 881 tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 778 - } 779 - 780 - /* Initialize metrics on socket. */ 781 - 782 - static void tcp_init_metrics(struct sock *sk) 783 - { 784 - struct tcp_sock *tp = tcp_sk(sk); 785 - struct dst_entry *dst = __sk_dst_get(sk); 786 - 787 - if (dst == NULL) 788 - goto reset; 789 - 790 - dst_confirm(dst); 791 - 792 - if (dst_metric_locked(dst, RTAX_CWND)) 793 - tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); 794 - if (dst_metric(dst, RTAX_SSTHRESH)) { 795 - tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); 796 - if (tp->snd_ssthresh > tp->snd_cwnd_clamp) 797 - tp->snd_ssthresh = tp->snd_cwnd_clamp; 798 - } else { 799 - /* ssthresh may have been reduced unnecessarily during. 800 - * 3WHS. Restore it back to its initial default. 
801 - */ 802 - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 803 - } 804 - if (dst_metric(dst, RTAX_REORDERING) && 805 - tp->reordering != dst_metric(dst, RTAX_REORDERING)) { 806 - tcp_disable_fack(tp); 807 - tcp_disable_early_retrans(tp); 808 - tp->reordering = dst_metric(dst, RTAX_REORDERING); 809 - } 810 - 811 - if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0) 812 - goto reset; 813 - 814 - /* Initial rtt is determined from SYN,SYN-ACK. 815 - * The segment is small and rtt may appear much 816 - * less than real one. Use per-dst memory 817 - * to make it more realistic. 818 - * 819 - * A bit of theory. RTT is time passed after "normal" sized packet 820 - * is sent until it is ACKed. In normal circumstances sending small 821 - * packets force peer to delay ACKs and calculation is correct too. 822 - * The algorithm is adaptive and, provided we follow specs, it 823 - * NEVER underestimate RTT. BUT! If peer tries to make some clever 824 - * tricks sort of "quick acks" for time long enough to decrease RTT 825 - * to low value, and then abruptly stops to do it and starts to delay 826 - * ACKs, wait for troubles. 827 - */ 828 - if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) { 829 - tp->srtt = dst_metric_rtt(dst, RTAX_RTT); 830 - tp->rtt_seq = tp->snd_nxt; 831 - } 832 - if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) { 833 - tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR); 834 - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 835 - } 836 - tcp_set_rto(sk); 837 - reset: 838 - if (tp->srtt == 0) { 839 - /* RFC6298: 5.7 We've failed to get a valid RTT sample from 840 - * 3WHS. This is most likely due to retransmission, 841 - * including spurious one. Reset the RTO back to 3secs 842 - * from the more aggressive 1sec to avoid more spurious 843 - * retransmission. 
844 - */ 845 - tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; 846 - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; 847 - } 848 - /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been 849 - * retransmitted. In light of RFC6298 more aggressive 1sec 850 - * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK 851 - * retransmission has occurred. 852 - */ 853 - if (tp->total_retrans > 1) 854 - tp->snd_cwnd = 1; 855 - else 856 - tp->snd_cwnd = tcp_init_cwnd(tp, dst); 857 - tp->snd_cwnd_stamp = tcp_time_stamp; 858 882 } 859 883 860 884 static void tcp_update_reordering(struct sock *sk, const int metric,
+5 -41
net/ipv4/tcp_ipv4.c
··· 209 209 } 210 210 211 211 if (tcp_death_row.sysctl_tw_recycle && 212 - !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) { 213 - struct inet_peer *peer = rt_get_peer(rt, fl4->daddr); 214 - /* 215 - * VJ's idea. We save last timestamp seen from 216 - * the destination in peer table, when entering state 217 - * TIME-WAIT * and initialize rx_opt.ts_recent from it, 218 - * when trying new connection. 219 - */ 220 - if (peer) { 221 - inet_peer_refcheck(peer); 222 - if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { 223 - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; 224 - tp->rx_opt.ts_recent = peer->tcp_ts; 225 - } 226 - } 227 - } 212 + !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) 213 + tcp_fetch_timewait_stamp(sk, &rt->dst); 228 214 229 215 inet->inet_dport = usin->sin_port; 230 216 inet->inet_daddr = daddr; ··· 1361 1375 isn = cookie_v4_init_sequence(sk, skb, &req->mss); 1362 1376 req->cookie_ts = tmp_opt.tstamp_ok; 1363 1377 } else if (!isn) { 1364 - struct inet_peer *peer = NULL; 1365 1378 struct flowi4 fl4; 1366 1379 1367 1380 /* VJ's idea. 
We save last timestamp seen ··· 1375 1390 if (tmp_opt.saw_tstamp && 1376 1391 tcp_death_row.sysctl_tw_recycle && 1377 1392 (dst = inet_csk_route_req(sk, &fl4, req, want_cookie)) != NULL && 1378 - fl4.daddr == saddr && 1379 - (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) { 1380 - inet_peer_refcheck(peer); 1381 - if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1382 - (s32)(peer->tcp_ts - req->ts_recent) > 1383 - TCP_PAWS_WINDOW) { 1393 + fl4.daddr == saddr) { 1394 + if (!tcp_peer_is_proven(req, dst, true)) { 1384 1395 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1385 1396 goto drop_and_release; 1386 1397 } ··· 1385 1404 else if (!sysctl_tcp_syncookies && 1386 1405 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 1387 1406 (sysctl_max_syn_backlog >> 2)) && 1388 - (!peer || !peer->tcp_ts_stamp) && 1389 - (!dst || !dst_metric(dst, RTAX_RTT))) { 1407 + !tcp_peer_is_proven(req, dst, false)) { 1390 1408 /* Without syncookies last quarter of 1391 1409 * backlog is filled with destinations, 1392 1410 * proven to be alive. ··· 1847 1867 goto discard_it; 1848 1868 } 1849 1869 1850 - struct inet_peer *tcp_v4_get_peer(struct sock *sk) 1851 - { 1852 - struct rtable *rt = (struct rtable *) __sk_dst_get(sk); 1853 - struct inet_sock *inet = inet_sk(sk); 1854 - 1855 - /* If we don't have a valid cached route, or we're doing IP 1856 - * options which make the IPv4 header destination address 1857 - * different from our peer's, do not bother with this. 
1858 - */ 1859 - if (!rt || inet->cork.fl.u.ip4.daddr != inet->inet_daddr) 1860 - return NULL; 1861 - return rt_get_peer_create(rt, inet->inet_daddr); 1862 - } 1863 - EXPORT_SYMBOL(tcp_v4_get_peer); 1864 - 1865 1870 static struct timewait_sock_ops tcp_timewait_sock_ops = { 1866 1871 .twsk_obj_size = sizeof(struct tcp_timewait_sock), 1867 1872 .twsk_unique = tcp_twsk_unique, ··· 1859 1894 .rebuild_header = inet_sk_rebuild_header, 1860 1895 .conn_request = tcp_v4_conn_request, 1861 1896 .syn_recv_sock = tcp_v4_syn_recv_sock, 1862 - .get_peer = tcp_v4_get_peer, 1863 1897 .net_header_len = sizeof(struct iphdr), 1864 1898 .setsockopt = ip_setsockopt, 1865 1899 .getsockopt = ip_getsockopt,
+697
net/ipv4/tcp_metrics.c
··· 1 + #include <linux/rcupdate.h> 2 + #include <linux/spinlock.h> 3 + #include <linux/jiffies.h> 4 + #include <linux/bootmem.h> 5 + #include <linux/module.h> 6 + #include <linux/cache.h> 7 + #include <linux/slab.h> 8 + #include <linux/init.h> 9 + #include <linux/tcp.h> 10 + 11 + #include <net/inet_connection_sock.h> 12 + #include <net/net_namespace.h> 13 + #include <net/request_sock.h> 14 + #include <net/inetpeer.h> 15 + #include <net/sock.h> 16 + #include <net/ipv6.h> 17 + #include <net/dst.h> 18 + #include <net/tcp.h> 19 + 20 + int sysctl_tcp_nometrics_save __read_mostly; 21 + 22 + enum tcp_metric_index { 23 + TCP_METRIC_RTT, 24 + TCP_METRIC_RTTVAR, 25 + TCP_METRIC_SSTHRESH, 26 + TCP_METRIC_CWND, 27 + TCP_METRIC_REORDERING, 28 + 29 + /* Always last. */ 30 + TCP_METRIC_MAX, 31 + }; 32 + 33 + struct tcp_metrics_block { 34 + struct tcp_metrics_block __rcu *tcpm_next; 35 + struct inetpeer_addr tcpm_addr; 36 + unsigned long tcpm_stamp; 37 + u32 tcpm_ts; 38 + u32 tcpm_ts_stamp; 39 + u32 tcpm_lock; 40 + u32 tcpm_vals[TCP_METRIC_MAX]; 41 + }; 42 + 43 + static bool tcp_metric_locked(struct tcp_metrics_block *tm, 44 + enum tcp_metric_index idx) 45 + { 46 + return tm->tcpm_lock & (1 << idx); 47 + } 48 + 49 + static u32 tcp_metric_get(struct tcp_metrics_block *tm, 50 + enum tcp_metric_index idx) 51 + { 52 + return tm->tcpm_vals[idx]; 53 + } 54 + 55 + static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm, 56 + enum tcp_metric_index idx) 57 + { 58 + return msecs_to_jiffies(tm->tcpm_vals[idx]); 59 + } 60 + 61 + static void tcp_metric_set(struct tcp_metrics_block *tm, 62 + enum tcp_metric_index idx, 63 + u32 val) 64 + { 65 + tm->tcpm_vals[idx] = val; 66 + } 67 + 68 + static void tcp_metric_set_msecs(struct tcp_metrics_block *tm, 69 + enum tcp_metric_index idx, 70 + u32 val) 71 + { 72 + tm->tcpm_vals[idx] = jiffies_to_msecs(val); 73 + } 74 + 75 + static bool addr_same(const struct inetpeer_addr *a, 76 + const struct inetpeer_addr *b) 77 + { 78 + const struct in6_addr 
*a6, *b6; 79 + 80 + if (a->family != b->family) 81 + return false; 82 + if (a->family == AF_INET) 83 + return a->addr.a4 == b->addr.a4; 84 + 85 + a6 = (const struct in6_addr *) &a->addr.a6[0]; 86 + b6 = (const struct in6_addr *) &b->addr.a6[0]; 87 + 88 + return ipv6_addr_equal(a6, b6); 89 + } 90 + 91 + struct tcpm_hash_bucket { 92 + struct tcp_metrics_block __rcu *chain; 93 + }; 94 + 95 + static DEFINE_SPINLOCK(tcp_metrics_lock); 96 + 97 + static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) 98 + { 99 + u32 val; 100 + 101 + val = 0; 102 + if (dst_metric_locked(dst, RTAX_RTT)) 103 + val |= 1 << TCP_METRIC_RTT; 104 + if (dst_metric_locked(dst, RTAX_RTTVAR)) 105 + val |= 1 << TCP_METRIC_RTTVAR; 106 + if (dst_metric_locked(dst, RTAX_SSTHRESH)) 107 + val |= 1 << TCP_METRIC_SSTHRESH; 108 + if (dst_metric_locked(dst, RTAX_CWND)) 109 + val |= 1 << TCP_METRIC_CWND; 110 + if (dst_metric_locked(dst, RTAX_REORDERING)) 111 + val |= 1 << TCP_METRIC_REORDERING; 112 + tm->tcpm_lock = val; 113 + 114 + tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); 115 + tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); 116 + tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); 117 + tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); 118 + tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); 119 + tm->tcpm_ts = 0; 120 + tm->tcpm_ts_stamp = 0; 121 + } 122 + 123 + static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, 124 + struct inetpeer_addr *addr, 125 + unsigned int hash, 126 + bool reclaim) 127 + { 128 + struct tcp_metrics_block *tm; 129 + struct net *net; 130 + 131 + spin_lock_bh(&tcp_metrics_lock); 132 + net = dev_net(dst->dev); 133 + if (unlikely(reclaim)) { 134 + struct tcp_metrics_block *oldest; 135 + 136 + oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); 137 + for (tm = rcu_dereference(oldest->tcpm_next); tm; 138 + tm = 
rcu_dereference(tm->tcpm_next)) { 139 + if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) 140 + oldest = tm; 141 + } 142 + tm = oldest; 143 + } else { 144 + tm = kmalloc(sizeof(*tm), GFP_ATOMIC); 145 + if (!tm) 146 + goto out_unlock; 147 + } 148 + tm->tcpm_addr = *addr; 149 + tm->tcpm_stamp = jiffies; 150 + 151 + tcpm_suck_dst(tm, dst); 152 + 153 + if (likely(!reclaim)) { 154 + tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; 155 + rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); 156 + } 157 + 158 + out_unlock: 159 + spin_unlock_bh(&tcp_metrics_lock); 160 + return tm; 161 + } 162 + 163 + #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) 164 + 165 + static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) 166 + { 167 + if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) 168 + tcpm_suck_dst(tm, dst); 169 + } 170 + 171 + #define TCP_METRICS_RECLAIM_DEPTH 5 172 + #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL 173 + 174 + static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) 175 + { 176 + if (tm) 177 + return tm; 178 + if (depth > TCP_METRICS_RECLAIM_DEPTH) 179 + return TCP_METRICS_RECLAIM_PTR; 180 + return NULL; 181 + } 182 + 183 + static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, 184 + struct net *net, unsigned int hash) 185 + { 186 + struct tcp_metrics_block *tm; 187 + int depth = 0; 188 + 189 + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; 190 + tm = rcu_dereference(tm->tcpm_next)) { 191 + if (addr_same(&tm->tcpm_addr, addr)) 192 + break; 193 + depth++; 194 + } 195 + return tcp_get_encode(tm, depth); 196 + } 197 + 198 + static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, 199 + struct dst_entry *dst) 200 + { 201 + struct tcp_metrics_block *tm; 202 + struct inetpeer_addr addr; 203 + unsigned int hash; 204 + struct net *net; 205 + 206 + addr.family = 
req->rsk_ops->family; 207 + switch (addr.family) { 208 + case AF_INET: 209 + addr.addr.a4 = inet_rsk(req)->rmt_addr; 210 + hash = (__force unsigned int) addr.addr.a4; 211 + break; 212 + case AF_INET6: 213 + *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; 214 + hash = ((__force unsigned int) addr.addr.a6[0] ^ 215 + (__force unsigned int) addr.addr.a6[1] ^ 216 + (__force unsigned int) addr.addr.a6[2] ^ 217 + (__force unsigned int) addr.addr.a6[3]); 218 + break; 219 + default: 220 + return NULL; 221 + } 222 + 223 + hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); 224 + 225 + net = dev_net(dst->dev); 226 + hash &= net->ipv4.tcp_metrics_hash_mask; 227 + 228 + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; 229 + tm = rcu_dereference(tm->tcpm_next)) { 230 + if (addr_same(&tm->tcpm_addr, &addr)) 231 + break; 232 + } 233 + tcpm_check_stamp(tm, dst); 234 + return tm; 235 + } 236 + 237 + static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) 238 + { 239 + struct inet6_timewait_sock *tw6; 240 + struct tcp_metrics_block *tm; 241 + struct inetpeer_addr addr; 242 + unsigned int hash; 243 + struct net *net; 244 + 245 + addr.family = tw->tw_family; 246 + switch (addr.family) { 247 + case AF_INET: 248 + addr.addr.a4 = tw->tw_daddr; 249 + hash = (__force unsigned int) addr.addr.a4; 250 + break; 251 + case AF_INET6: 252 + tw6 = inet6_twsk((struct sock *)tw); 253 + *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; 254 + hash = ((__force unsigned int) addr.addr.a6[0] ^ 255 + (__force unsigned int) addr.addr.a6[1] ^ 256 + (__force unsigned int) addr.addr.a6[2] ^ 257 + (__force unsigned int) addr.addr.a6[3]); 258 + break; 259 + default: 260 + return NULL; 261 + } 262 + 263 + hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); 264 + 265 + net = twsk_net(tw); 266 + hash &= net->ipv4.tcp_metrics_hash_mask; 267 + 268 + for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; 269 + tm = 
rcu_dereference(tm->tcpm_next)) { 270 + if (addr_same(&tm->tcpm_addr, &addr)) 271 + break; 272 + } 273 + return tm; 274 + } 275 + 276 + static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, 277 + struct dst_entry *dst, 278 + bool create) 279 + { 280 + struct tcp_metrics_block *tm; 281 + struct inetpeer_addr addr; 282 + unsigned int hash; 283 + struct net *net; 284 + bool reclaim; 285 + 286 + addr.family = sk->sk_family; 287 + switch (addr.family) { 288 + case AF_INET: 289 + addr.addr.a4 = inet_sk(sk)->inet_daddr; 290 + hash = (__force unsigned int) addr.addr.a4; 291 + break; 292 + case AF_INET6: 293 + *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; 294 + hash = ((__force unsigned int) addr.addr.a6[0] ^ 295 + (__force unsigned int) addr.addr.a6[1] ^ 296 + (__force unsigned int) addr.addr.a6[2] ^ 297 + (__force unsigned int) addr.addr.a6[3]); 298 + break; 299 + default: 300 + return NULL; 301 + } 302 + 303 + hash ^= (hash >> 24) ^ (hash >> 16) ^ (hash >> 8); 304 + 305 + net = dev_net(dst->dev); 306 + hash &= net->ipv4.tcp_metrics_hash_mask; 307 + 308 + tm = __tcp_get_metrics(&addr, net, hash); 309 + reclaim = false; 310 + if (tm == TCP_METRICS_RECLAIM_PTR) { 311 + reclaim = true; 312 + tm = NULL; 313 + } 314 + if (!tm && create) 315 + tm = tcpm_new(dst, &addr, hash, reclaim); 316 + else 317 + tcpm_check_stamp(tm, dst); 318 + 319 + return tm; 320 + } 321 + 322 + /* Save metrics learned by this TCP session. This function is called 323 + * only, when TCP finishes successfully i.e. when it enters TIME-WAIT 324 + * or goes from LAST-ACK to CLOSE. 
325 + */ 326 + void tcp_update_metrics(struct sock *sk) 327 + { 328 + const struct inet_connection_sock *icsk = inet_csk(sk); 329 + struct dst_entry *dst = __sk_dst_get(sk); 330 + struct tcp_sock *tp = tcp_sk(sk); 331 + struct tcp_metrics_block *tm; 332 + unsigned long rtt; 333 + u32 val; 334 + int m; 335 + 336 + if (sysctl_tcp_nometrics_save || !dst) 337 + return; 338 + 339 + if (dst->flags & DST_HOST) 340 + dst_confirm(dst); 341 + 342 + rcu_read_lock(); 343 + if (icsk->icsk_backoff || !tp->srtt) { 344 + /* This session failed to estimate rtt. Why? 345 + * Probably, no packets returned in time. Reset our 346 + * results. 347 + */ 348 + tm = tcp_get_metrics(sk, dst, false); 349 + if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) 350 + tcp_metric_set(tm, TCP_METRIC_RTT, 0); 351 + goto out_unlock; 352 + } else 353 + tm = tcp_get_metrics(sk, dst, true); 354 + 355 + if (!tm) 356 + goto out_unlock; 357 + 358 + rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); 359 + m = rtt - tp->srtt; 360 + 361 + /* If newly calculated rtt larger than stored one, store new 362 + * one. Otherwise, use EWMA. Remember, rtt overestimation is 363 + * always better than underestimation. 364 + */ 365 + if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { 366 + if (m <= 0) 367 + rtt = tp->srtt; 368 + else 369 + rtt -= (m >> 3); 370 + tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); 371 + } 372 + 373 + if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { 374 + unsigned long var; 375 + 376 + if (m < 0) 377 + m = -m; 378 + 379 + /* Scale deviation to rttvar fixed point */ 380 + m >>= 1; 381 + if (m < tp->mdev) 382 + m = tp->mdev; 383 + 384 + var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); 385 + if (m >= var) 386 + var = m; 387 + else 388 + var -= (var - m) >> 2; 389 + 390 + tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); 391 + } 392 + 393 + if (tcp_in_initial_slowstart(tp)) { 394 + /* Slow start still did not finish. 
*/ 395 + if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { 396 + val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); 397 + if (val && (tp->snd_cwnd >> 1) > val) 398 + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, 399 + tp->snd_cwnd >> 1); 400 + } 401 + if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { 402 + val = tcp_metric_get(tm, TCP_METRIC_CWND); 403 + if (tp->snd_cwnd > val) 404 + tcp_metric_set(tm, TCP_METRIC_CWND, 405 + tp->snd_cwnd); 406 + } 407 + } else if (tp->snd_cwnd > tp->snd_ssthresh && 408 + icsk->icsk_ca_state == TCP_CA_Open) { 409 + /* Cong. avoidance phase, cwnd is reliable. */ 410 + if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) 411 + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, 412 + max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); 413 + if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { 414 + val = tcp_metric_get(tm, TCP_METRIC_CWND); 415 + tcp_metric_set(tm, RTAX_CWND, (val + tp->snd_cwnd) >> 1); 416 + } 417 + } else { 418 + /* Else slow start did not finish, cwnd is non-sense, 419 + * ssthresh may be also invalid. 420 + */ 421 + if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { 422 + val = tcp_metric_get(tm, TCP_METRIC_CWND); 423 + tcp_metric_set(tm, TCP_METRIC_CWND, 424 + (val + tp->snd_ssthresh) >> 1); 425 + } 426 + if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { 427 + val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); 428 + if (val && tp->snd_ssthresh > val) 429 + tcp_metric_set(tm, TCP_METRIC_SSTHRESH, 430 + tp->snd_ssthresh); 431 + } 432 + if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { 433 + val = tcp_metric_get(tm, TCP_METRIC_REORDERING); 434 + if (val < tp->reordering && 435 + tp->reordering != sysctl_tcp_reordering) 436 + tcp_metric_set(tm, TCP_METRIC_REORDERING, 437 + tp->reordering); 438 + } 439 + } 440 + tm->tcpm_stamp = jiffies; 441 + out_unlock: 442 + rcu_read_unlock(); 443 + } 444 + 445 + /* Initialize metrics on socket. 
*/ 446 + 447 + void tcp_init_metrics(struct sock *sk) 448 + { 449 + struct dst_entry *dst = __sk_dst_get(sk); 450 + struct tcp_sock *tp = tcp_sk(sk); 451 + struct tcp_metrics_block *tm; 452 + u32 val; 453 + 454 + if (dst == NULL) 455 + goto reset; 456 + 457 + dst_confirm(dst); 458 + 459 + rcu_read_lock(); 460 + tm = tcp_get_metrics(sk, dst, true); 461 + if (!tm) { 462 + rcu_read_unlock(); 463 + goto reset; 464 + } 465 + 466 + if (tcp_metric_locked(tm, TCP_METRIC_CWND)) 467 + tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); 468 + 469 + val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); 470 + if (val) { 471 + tp->snd_ssthresh = val; 472 + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) 473 + tp->snd_ssthresh = tp->snd_cwnd_clamp; 474 + } else { 475 + /* ssthresh may have been reduced unnecessarily during. 476 + * 3WHS. Restore it back to its initial default. 477 + */ 478 + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; 479 + } 480 + val = tcp_metric_get(tm, TCP_METRIC_REORDERING); 481 + if (val && tp->reordering != val) { 482 + tcp_disable_fack(tp); 483 + tcp_disable_early_retrans(tp); 484 + tp->reordering = val; 485 + } 486 + 487 + val = tcp_metric_get(tm, TCP_METRIC_RTT); 488 + if (val == 0 || tp->srtt == 0) { 489 + rcu_read_unlock(); 490 + goto reset; 491 + } 492 + /* Initial rtt is determined from SYN,SYN-ACK. 493 + * The segment is small and rtt may appear much 494 + * less than real one. Use per-dst memory 495 + * to make it more realistic. 496 + * 497 + * A bit of theory. RTT is time passed after "normal" sized packet 498 + * is sent until it is ACKed. In normal circumstances sending small 499 + * packets force peer to delay ACKs and calculation is correct too. 500 + * The algorithm is adaptive and, provided we follow specs, it 501 + * NEVER underestimate RTT. BUT! 
If peer tries to make some clever 502 + * tricks sort of "quick acks" for time long enough to decrease RTT 503 + * to low value, and then abruptly stops to do it and starts to delay 504 + * ACKs, wait for troubles. 505 + */ 506 + val = msecs_to_jiffies(val); 507 + if (val > tp->srtt) { 508 + tp->srtt = val; 509 + tp->rtt_seq = tp->snd_nxt; 510 + } 511 + val = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); 512 + if (val > tp->mdev) { 513 + tp->mdev = val; 514 + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); 515 + } 516 + rcu_read_unlock(); 517 + 518 + tcp_set_rto(sk); 519 + reset: 520 + if (tp->srtt == 0) { 521 + /* RFC6298: 5.7 We've failed to get a valid RTT sample from 522 + * 3WHS. This is most likely due to retransmission, 523 + * including spurious one. Reset the RTO back to 3secs 524 + * from the more aggressive 1sec to avoid more spurious 525 + * retransmission. 526 + */ 527 + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; 528 + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; 529 + } 530 + /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been 531 + * retransmitted. In light of RFC6298 more aggressive 1sec 532 + * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK 533 + * retransmission has occurred. 
534 + */ 535 + if (tp->total_retrans > 1) 536 + tp->snd_cwnd = 1; 537 + else 538 + tp->snd_cwnd = tcp_init_cwnd(tp, dst); 539 + tp->snd_cwnd_stamp = tcp_time_stamp; 540 + } 541 + 542 + bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check) 543 + { 544 + struct tcp_metrics_block *tm; 545 + bool ret; 546 + 547 + if (!dst) 548 + return false; 549 + 550 + rcu_read_lock(); 551 + tm = __tcp_get_metrics_req(req, dst); 552 + if (paws_check) { 553 + if (tm && 554 + (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && 555 + (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW) 556 + ret = false; 557 + else 558 + ret = true; 559 + } else { 560 + if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) 561 + ret = true; 562 + else 563 + ret = false; 564 + } 565 + rcu_read_unlock(); 566 + 567 + return ret; 568 + } 569 + EXPORT_SYMBOL_GPL(tcp_peer_is_proven); 570 + 571 + void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) 572 + { 573 + struct tcp_metrics_block *tm; 574 + 575 + rcu_read_lock(); 576 + tm = tcp_get_metrics(sk, dst, true); 577 + if (tm) { 578 + struct tcp_sock *tp = tcp_sk(sk); 579 + 580 + if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { 581 + tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; 582 + tp->rx_opt.ts_recent = tm->tcpm_ts; 583 + } 584 + } 585 + rcu_read_unlock(); 586 + } 587 + EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); 588 + 589 + /* VJ's idea. Save last timestamp seen from this destination and hold 590 + * it at least for normal timewait interval to use for duplicate 591 + * segment detection in subsequent connections, before they enter 592 + * synchronized state. 
593 + */ 594 + bool tcp_remember_stamp(struct sock *sk) 595 + { 596 + struct dst_entry *dst = __sk_dst_get(sk); 597 + bool ret = false; 598 + 599 + if (dst) { 600 + struct tcp_metrics_block *tm; 601 + 602 + rcu_read_lock(); 603 + tm = tcp_get_metrics(sk, dst, true); 604 + if (tm) { 605 + struct tcp_sock *tp = tcp_sk(sk); 606 + 607 + if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || 608 + ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && 609 + tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { 610 + tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; 611 + tm->tcpm_ts = tp->rx_opt.ts_recent; 612 + } 613 + ret = true; 614 + } 615 + rcu_read_unlock(); 616 + } 617 + return ret; 618 + } 619 + 620 + bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) 621 + { 622 + struct tcp_metrics_block *tm; 623 + bool ret = false; 624 + 625 + rcu_read_lock(); 626 + tm = __tcp_get_metrics_tw(tw); 627 + if (tw) { 628 + const struct tcp_timewait_sock *tcptw; 629 + struct sock *sk = (struct sock *) tw; 630 + 631 + tcptw = tcp_twsk(sk); 632 + if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || 633 + ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && 634 + tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { 635 + tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; 636 + tm->tcpm_ts = tcptw->tw_ts_recent; 637 + } 638 + ret = true; 639 + } 640 + rcu_read_unlock(); 641 + 642 + return ret; 643 + } 644 + 645 + static unsigned long tcpmhash_entries; 646 + static int __init set_tcpmhash_entries(char *str) 647 + { 648 + ssize_t ret; 649 + 650 + if (!str) 651 + return 0; 652 + 653 + ret = kstrtoul(str, 0, &tcpmhash_entries); 654 + if (ret) 655 + return 0; 656 + 657 + return 1; 658 + } 659 + __setup("tcpmhash_entries=", set_tcpmhash_entries); 660 + 661 + static int __net_init tcp_net_metrics_init(struct net *net) 662 + { 663 + int slots, size; 664 + 665 + slots = tcpmhash_entries; 666 + if (!slots) { 667 + if (totalram_pages >= 128 * 1024) 668 + slots = 16 * 1024; 
669 + else 670 + slots = 8 * 1024; 671 + } 672 + 673 + size = slots * sizeof(struct tcpm_hash_bucket); 674 + 675 + net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL); 676 + if (!net->ipv4.tcp_metrics_hash) 677 + return -ENOMEM; 678 + 679 + net->ipv4.tcp_metrics_hash_mask = (slots - 1); 680 + 681 + return 0; 682 + } 683 + 684 + static void __net_exit tcp_net_metrics_exit(struct net *net) 685 + { 686 + kfree(net->ipv4.tcp_metrics_hash); 687 + } 688 + 689 + static __net_initdata struct pernet_operations tcp_net_metrics_ops = { 690 + .init = tcp_net_metrics_init, 691 + .exit = tcp_net_metrics_exit, 692 + }; 693 + 694 + void __init tcp_metrics_init(void) 695 + { 696 + register_pernet_subsys(&tcp_net_metrics_ops); 697 + }
+2 -60
net/ipv4/tcp_minisocks.c
··· 49 49 }; 50 50 EXPORT_SYMBOL_GPL(tcp_death_row); 51 51 52 - /* VJ's idea. Save last timestamp seen from this destination 53 - * and hold it at least for normal timewait interval to use for duplicate 54 - * segment detection in subsequent connections, before they enter synchronized 55 - * state. 56 - */ 57 - 58 - static bool tcp_remember_stamp(struct sock *sk) 59 - { 60 - const struct inet_connection_sock *icsk = inet_csk(sk); 61 - struct tcp_sock *tp = tcp_sk(sk); 62 - struct inet_peer *peer; 63 - 64 - peer = icsk->icsk_af_ops->get_peer(sk); 65 - if (peer) { 66 - if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || 67 - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && 68 - peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { 69 - peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; 70 - peer->tcp_ts = tp->rx_opt.ts_recent; 71 - } 72 - return true; 73 - } 74 - 75 - return false; 76 - } 77 - 78 - static bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) 79 - { 80 - const struct tcp_timewait_sock *tcptw; 81 - struct sock *sk = (struct sock *) tw; 82 - struct inet_peer *peer; 83 - 84 - tcptw = tcp_twsk(sk); 85 - peer = tcptw->tw_peer; 86 - if (peer) { 87 - if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || 88 - ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && 89 - peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { 90 - peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; 91 - peer->tcp_ts = tcptw->tw_ts_recent; 92 - } 93 - return true; 94 - } 95 - return false; 96 - } 97 - 98 52 static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) 99 53 { 100 54 if (seq == s_win) ··· 267 313 const struct inet_connection_sock *icsk = inet_csk(sk); 268 314 const struct tcp_sock *tp = tcp_sk(sk); 269 315 bool recycle_ok = false; 270 - bool recycle_on = false; 271 316 272 - if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) { 317 + if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) 273 318 
recycle_ok = tcp_remember_stamp(sk); 274 - recycle_on = true; 275 - } 276 319 277 320 if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) 278 321 tw = inet_twsk_alloc(sk, state); ··· 278 327 struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); 279 328 const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); 280 329 struct inet_sock *inet = inet_sk(sk); 281 - struct inet_peer *peer = NULL; 282 330 283 331 tw->tw_transparent = inet->transparent; 284 332 tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; ··· 300 350 tw->tw_ipv6only = np->ipv6only; 301 351 } 302 352 #endif 303 - 304 - if (recycle_on) 305 - peer = icsk->icsk_af_ops->get_peer(sk); 306 - tcptw->tw_peer = peer; 307 - if (peer) 308 - atomic_inc(&peer->refcnt); 309 353 310 354 #ifdef CONFIG_TCP_MD5SIG 311 355 /* ··· 352 408 353 409 void tcp_twsk_destructor(struct sock *sk) 354 410 { 411 + #ifdef CONFIG_TCP_MD5SIG 355 412 struct tcp_timewait_sock *twsk = tcp_twsk(sk); 356 413 357 - if (twsk->tw_peer) 358 - inet_putpeer(twsk->tw_peer); 359 - #ifdef CONFIG_TCP_MD5SIG 360 414 if (twsk->tw_md5_key) { 361 415 tcp_free_md5sig_pool(); 362 416 kfree_rcu(twsk->tw_md5_key, rcu);
+1 -7
net/ipv4/xfrm4_policy.c
··· 90 90 xdst->u.dst.dev = dev; 91 91 dev_hold(dev); 92 92 93 - rt_transfer_peer(&xdst->u.rt, rt); 94 - 95 93 /* Sheit... I remember I did this right. Apparently, 96 94 * it was magically lost, so this code needs audit */ 97 95 xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | ··· 98 100 xdst->u.rt.rt_src = rt->rt_src; 99 101 xdst->u.rt.rt_dst = rt->rt_dst; 100 102 xdst->u.rt.rt_gateway = rt->rt_gateway; 103 + xdst->u.rt.rt_pmtu = rt->rt_pmtu; 101 104 102 105 return 0; 103 106 } ··· 207 208 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 208 209 209 210 dst_destroy_metrics_generic(dst); 210 - 211 - if (rt_has_peer(&xdst->u.rt)) { 212 - struct inet_peer *peer = rt_peer_ptr(&xdst->u.rt); 213 - inet_putpeer(peer); 214 - } 215 211 216 212 xfrm_dst_destroy(xdst); 217 213 }
+3 -1
net/ipv6/icmp.c
··· 194 194 if (rt->rt6i_dst.plen < 128) 195 195 tmo >>= ((128 - rt->rt6i_dst.plen)>>5); 196 196 197 - peer = rt6_get_peer_create(rt); 197 + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); 198 198 res = inet_peer_xrlim_allow(peer, tmo); 199 + if (peer) 200 + inet_putpeer(peer); 199 201 } 200 202 dst_release(dst); 201 203 return res;
+8 -2
net/ipv6/ip6_output.c
··· 466 466 else 467 467 target = &hdr->daddr; 468 468 469 - peer = rt6_get_peer_create(rt); 469 + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); 470 470 471 471 /* Limit redirects both by destination (here) 472 472 and by source (inside ndisc_send_redirect) 473 473 */ 474 474 if (inet_peer_xrlim_allow(peer, 1*HZ)) 475 475 ndisc_send_redirect(skb, target); 476 + if (peer) 477 + inet_putpeer(peer); 476 478 } else { 477 479 int addrtype = ipv6_addr_type(&hdr->saddr); 478 480 ··· 594 592 int old, new; 595 593 596 594 if (rt && !(rt->dst.flags & DST_NOPEER)) { 597 - struct inet_peer *peer = rt6_get_peer_create(rt); 595 + struct inet_peer *peer; 596 + struct net *net; 598 597 598 + net = dev_net(rt->dst.dev); 599 + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); 599 600 if (peer) { 600 601 fhdr->identification = htonl(inet_getid(peer, 0)); 602 + inet_putpeer(peer); 601 603 return; 602 604 } 603 605 }
+6 -2
net/ipv6/ndisc.c
··· 1486 1486 int rd_len; 1487 1487 int err; 1488 1488 u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; 1489 + bool ret; 1489 1490 1490 1491 if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { 1491 1492 ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", ··· 1520 1519 "Redirect: destination is not a neighbour\n"); 1521 1520 goto release; 1522 1521 } 1523 - peer = rt6_get_peer_create(rt); 1524 - if (!inet_peer_xrlim_allow(peer, 1*HZ)) 1522 + peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1); 1523 + ret = inet_peer_xrlim_allow(peer, 1*HZ); 1524 + if (peer) 1525 + inet_putpeer(peer); 1526 + if (!ret) 1525 1527 goto release; 1526 1528 1527 1529 if (dev->addr_len) {
+2 -14
net/ipv6/route.c
··· 1093 1093 memset(&fl6, 0, sizeof(fl6)); 1094 1094 fl6.flowi6_oif = oif; 1095 1095 fl6.flowi6_mark = mark; 1096 - fl6.flowi6_flags = FLOWI_FLAG_PRECOW_METRICS; 1096 + fl6.flowi6_flags = 0; 1097 1097 fl6.daddr = iph->daddr; 1098 1098 fl6.saddr = iph->saddr; 1099 1099 fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK; ··· 2348 2348 int iif, int type, u32 pid, u32 seq, 2349 2349 int prefix, int nowait, unsigned int flags) 2350 2350 { 2351 - const struct inet_peer *peer; 2352 2351 struct rtmsg *rtm; 2353 2352 struct nlmsghdr *nlh; 2354 2353 long expires; 2355 2354 u32 table; 2356 2355 struct neighbour *n; 2357 - u32 ts, tsage; 2358 2356 2359 2357 if (prefix) { /* user wants prefix routes only */ 2360 2358 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { ··· 2471 2473 else 2472 2474 expires = INT_MAX; 2473 2475 2474 - peer = NULL; 2475 - if (rt6_has_peer(rt)) 2476 - peer = rt6_peer_ptr(rt); 2477 - ts = tsage = 0; 2478 - if (peer && peer->tcp_ts_stamp) { 2479 - ts = peer->tcp_ts; 2480 - tsage = get_seconds() - peer->tcp_ts_stamp; 2481 - } 2482 - 2483 - if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage, 2484 - expires, rt->dst.error) < 0) 2476 + if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 2485 2477 goto nla_put_failure; 2486 2478 2487 2479 return nlmsg_end(skb, nlh);
+5 -44
net/ipv6/tcp_ipv6.c
··· 277 277 rt = (struct rt6_info *) dst; 278 278 if (tcp_death_row.sysctl_tw_recycle && 279 279 !tp->rx_opt.ts_recent_stamp && 280 - ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) { 281 - struct inet_peer *peer = rt6_get_peer(rt); 282 - /* 283 - * VJ's idea. We save last timestamp seen from 284 - * the destination in peer table, when entering state 285 - * TIME-WAIT * and initialize rx_opt.ts_recent from it, 286 - * when trying new connection. 287 - */ 288 - if (peer) { 289 - inet_peer_refcheck(peer); 290 - if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { 291 - tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; 292 - tp->rx_opt.ts_recent = peer->tcp_ts; 293 - } 294 - } 295 - } 280 + ipv6_addr_equal(&rt->rt6i_dst.addr, &np->daddr)) 281 + tcp_fetch_timewait_stamp(sk, dst); 296 282 297 283 icsk->icsk_ext_hdr_len = 0; 298 284 if (np->opt) ··· 1120 1134 treq->iif = inet6_iif(skb); 1121 1135 1122 1136 if (!isn) { 1123 - struct inet_peer *peer = NULL; 1124 - 1125 1137 if (ipv6_opt_accepted(sk, skb) || 1126 1138 np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || 1127 1139 np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { ··· 1144 1160 */ 1145 1161 if (tmp_opt.saw_tstamp && 1146 1162 tcp_death_row.sysctl_tw_recycle && 1147 - (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL && 1148 - (peer = rt6_get_peer((struct rt6_info *)dst)) != NULL && 1149 - ipv6_addr_equal((struct in6_addr *)peer->daddr.addr.a6, 1150 - &treq->rmt_addr)) { 1151 - inet_peer_refcheck(peer); 1152 - if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && 1153 - (s32)(peer->tcp_ts - req->ts_recent) > 1154 - TCP_PAWS_WINDOW) { 1163 + (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) { 1164 + if (!tcp_peer_is_proven(req, dst, true)) { 1155 1165 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 1156 1166 goto drop_and_release; 1157 1167 } ··· 1154 1176 else if (!sysctl_tcp_syncookies && 1155 1177 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 1156 1178 
(sysctl_max_syn_backlog >> 2)) && 1157 - (!peer || !peer->tcp_ts_stamp) && 1158 - (!dst || !dst_metric(dst, RTAX_RTT))) { 1179 + !tcp_peer_is_proven(req, dst, false)) { 1159 1180 /* Without syncookies last quarter of 1160 1181 * backlog is filled with destinations, 1161 1182 * proven to be alive. ··· 1689 1712 goto discard_it; 1690 1713 } 1691 1714 1692 - static struct inet_peer *tcp_v6_get_peer(struct sock *sk) 1693 - { 1694 - struct rt6_info *rt = (struct rt6_info *) __sk_dst_get(sk); 1695 - struct ipv6_pinfo *np = inet6_sk(sk); 1696 - 1697 - /* If we don't have a valid cached route, or we're doing IP 1698 - * options which make the IPv6 header destination address 1699 - * different from our peer's, do not bother with this. 1700 - */ 1701 - if (!rt || !ipv6_addr_equal(&np->daddr, &rt->rt6i_dst.addr)) 1702 - return NULL; 1703 - return rt6_get_peer_create(rt); 1704 - } 1705 - 1706 1715 static struct timewait_sock_ops tcp6_timewait_sock_ops = { 1707 1716 .twsk_obj_size = sizeof(struct tcp6_timewait_sock), 1708 1717 .twsk_unique = tcp_twsk_unique, ··· 1701 1738 .rebuild_header = inet6_sk_rebuild_header, 1702 1739 .conn_request = tcp_v6_conn_request, 1703 1740 .syn_recv_sock = tcp_v6_syn_recv_sock, 1704 - .get_peer = tcp_v6_get_peer, 1705 1741 .net_header_len = sizeof(struct ipv6hdr), 1706 1742 .net_frag_header_len = sizeof(struct frag_hdr), 1707 1743 .setsockopt = ipv6_setsockopt, ··· 1732 1770 .rebuild_header = inet_sk_rebuild_header, 1733 1771 .conn_request = tcp_v6_conn_request, 1734 1772 .syn_recv_sock = tcp_v6_syn_recv_sock, 1735 - .get_peer = tcp_v4_get_peer, 1736 1773 .net_header_len = sizeof(struct iphdr), 1737 1774 .setsockopt = ipv6_setsockopt, 1738 1775 .getsockopt = ipv6_getsockopt,