Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: Implement read-only protection and COW'ing of metrics.

Routing metrics are now copy-on-write.

Initially a route entry points its metrics at a read-only location.
If a routing table entry exists, it will point there. Else it will
point at the all zero metric place-holder called 'dst_default_metrics'.

The writability state of the metrics is stored in the low bits of the
metrics pointer; we have two bits left to spare if we want to store
more states.

For the initial implementation, COW is implemented simply via kmalloc.
However future enhancements will change this to place the writable
metrics somewhere else, in order to increase sharing. Very likely
this "somewhere else" will be the inetpeer cache.

Note also that this means that metrics updates may transiently fail
if we cannot COW the metrics successfully.

But even by itself, this patch should decrease memory usage and
increase cache locality especially for routing workloads. In those
cases the read-only metric copies stay in place and never get written
to.

TCP workloads where metrics get updated, and those rare cases where
PMTU triggers occur, will take a very slight performance hit. But
that hit will be alleviated when the long-term writable metrics
move to a more sharable location.

Since the metrics storage went from a u32 array of RTAX_MAX entries to
what is essentially a pointer, some retooling of the dst_entry layout
was necessary.

Most importantly, we need to preserve the alignment of the reference
count so that it doesn't share cache lines with the read-mostly state,
as per Eric Dumazet's alignment assertion checks.

The only non-trivial bit here is the move of the 'flags' member into
the writable cacheline. This is OK since we always access the flags
at the same moment that we modify the reference count.

Signed-off-by: David S. Miller <davem@davemloft.net>

+194 -46
+77 -37
include/net/dst.h
··· 40 40 struct rcu_head rcu_head; 41 41 struct dst_entry *child; 42 42 struct net_device *dev; 43 - short error; 44 - short obsolete; 45 - int flags; 46 - #define DST_HOST 0x0001 47 - #define DST_NOXFRM 0x0002 48 - #define DST_NOPOLICY 0x0004 49 - #define DST_NOHASH 0x0008 50 - #define DST_NOCACHE 0x0010 43 + struct dst_ops *ops; 44 + unsigned long _metrics; 51 45 unsigned long expires; 52 - 53 - unsigned short header_len; /* more space at head required */ 54 - unsigned short trailer_len; /* space to reserve at tail */ 55 - 56 - unsigned int rate_tokens; 57 - unsigned long rate_last; /* rate limiting for ICMP */ 58 - 59 46 struct dst_entry *path; 60 - 61 47 struct neighbour *neighbour; 62 48 struct hh_cache *hh; 63 49 #ifdef CONFIG_XFRM ··· 54 68 int (*input)(struct sk_buff*); 55 69 int (*output)(struct sk_buff*); 56 70 57 - struct dst_ops *ops; 58 - 59 - u32 _metrics[RTAX_MAX]; 60 - 71 + short error; 72 + short obsolete; 73 + unsigned short header_len; /* more space at head required */ 74 + unsigned short trailer_len; /* space to reserve at tail */ 61 75 #ifdef CONFIG_IP_ROUTE_CLASSID 62 76 __u32 tclassid; 63 77 #else 64 78 __u32 __pad2; 65 79 #endif 66 - 67 80 68 81 /* 69 82 * Align __refcnt to a 64 bytes alignment ··· 78 93 atomic_t __refcnt; /* client references */ 79 94 int __use; 80 95 unsigned long lastuse; 96 + unsigned long rate_last; /* rate limiting for ICMP */ 97 + unsigned int rate_tokens; 98 + int flags; 99 + #define DST_HOST 0x0001 100 + #define DST_NOXFRM 0x0002 101 + #define DST_NOPOLICY 0x0004 102 + #define DST_NOHASH 0x0008 103 + #define DST_NOCACHE 0x0010 81 104 union { 82 105 struct dst_entry *next; 83 106 struct rtable __rcu *rt_next; ··· 96 103 97 104 #ifdef __KERNEL__ 98 105 106 + extern u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); 107 + 108 + #define DST_METRICS_READ_ONLY 0x1UL 109 + #define __DST_METRICS_PTR(Y) \ 110 + ((u32 *)((Y) & ~DST_METRICS_READ_ONLY)) 111 + #define DST_METRICS_PTR(X) 
__DST_METRICS_PTR((X)->_metrics) 112 + 113 + static inline bool dst_metrics_read_only(const struct dst_entry *dst) 114 + { 115 + return dst->_metrics & DST_METRICS_READ_ONLY; 116 + } 117 + 118 + extern void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old); 119 + 120 + static inline void dst_destroy_metrics_generic(struct dst_entry *dst) 121 + { 122 + unsigned long val = dst->_metrics; 123 + if (!(val & DST_METRICS_READ_ONLY)) 124 + __dst_destroy_metrics_generic(dst, val); 125 + } 126 + 127 + static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst) 128 + { 129 + unsigned long p = dst->_metrics; 130 + 131 + if (p & DST_METRICS_READ_ONLY) 132 + return dst->ops->cow_metrics(dst, p); 133 + return __DST_METRICS_PTR(p); 134 + } 135 + 136 + /* This may only be invoked before the entry has reached global 137 + * visibility. 138 + */ 139 + static inline void dst_init_metrics(struct dst_entry *dst, 140 + const u32 *src_metrics, 141 + bool read_only) 142 + { 143 + dst->_metrics = ((unsigned long) src_metrics) | 144 + (read_only ? 
DST_METRICS_READ_ONLY : 0); 145 + } 146 + 147 + static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) 148 + { 149 + u32 *dst_metrics = dst_metrics_write_ptr(dest); 150 + 151 + if (dst_metrics) { 152 + u32 *src_metrics = DST_METRICS_PTR(src); 153 + 154 + memcpy(dst_metrics, src_metrics, RTAX_MAX * sizeof(u32)); 155 + } 156 + } 157 + 158 + static inline u32 *dst_metrics_ptr(struct dst_entry *dst) 159 + { 160 + return DST_METRICS_PTR(dst); 161 + } 162 + 99 163 static inline u32 100 164 dst_metric_raw(const struct dst_entry *dst, const int metric) 101 165 { 102 - return dst->_metrics[metric-1]; 166 + u32 *p = DST_METRICS_PTR(dst); 167 + 168 + return p[metric-1]; 103 169 } 104 170 105 171 static inline u32 ··· 183 131 184 132 static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) 185 133 { 186 - dst->_metrics[metric-1] = val; 187 - } 134 + u32 *p = dst_metrics_write_ptr(dst); 188 135 189 - static inline void dst_import_metrics(struct dst_entry *dst, const u32 *src_metrics) 190 - { 191 - memcpy(dst->_metrics, src_metrics, RTAX_MAX * sizeof(u32)); 192 - } 193 - 194 - static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src) 195 - { 196 - dst_import_metrics(dest, src->_metrics); 197 - } 198 - 199 - static inline u32 *dst_metrics_ptr(struct dst_entry *dst) 200 - { 201 - return dst->_metrics; 136 + if (p) 137 + p[metric-1] = val; 202 138 } 203 139 204 140 static inline u32
+1
include/net/dst_ops.h
··· 18 18 struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); 19 19 unsigned int (*default_advmss)(const struct dst_entry *); 20 20 unsigned int (*default_mtu)(const struct dst_entry *); 21 + u32 * (*cow_metrics)(struct dst_entry *, unsigned long); 21 22 void (*destroy)(struct dst_entry *); 22 23 void (*ifdown)(struct dst_entry *, 23 24 struct net_device *dev, int how);
+2
include/net/route.h
··· 49 49 50 50 struct fib_nh; 51 51 struct inet_peer; 52 + struct fib_info; 52 53 struct rtable { 53 54 struct dst_entry dst; 54 55 ··· 70 69 /* Miscellaneous cached information */ 71 70 __be32 rt_spec_dst; /* RFC1122 specific destination */ 72 71 struct inet_peer *peer; /* long-living peer info */ 72 + struct fib_info *fi; /* for client ref to shared metrics */ 73 73 }; 74 74 75 75 static inline bool rt_is_input_route(struct rtable *rt)
+39
net/core/dst.c
··· 164 164 } 165 165 EXPORT_SYMBOL(dst_discard); 166 166 167 + static const u32 dst_default_metrics[RTAX_MAX]; 168 + 167 169 void *dst_alloc(struct dst_ops *ops) 168 170 { 169 171 struct dst_entry *dst; ··· 182 180 dst->lastuse = jiffies; 183 181 dst->path = dst; 184 182 dst->input = dst->output = dst_discard; 183 + dst_init_metrics(dst, dst_default_metrics, true); 185 184 #if RT_CACHE_DEBUG >= 2 186 185 atomic_inc(&dst_total); 187 186 #endif ··· 284 281 } 285 282 } 286 283 EXPORT_SYMBOL(dst_release); 284 + 285 + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) 286 + { 287 + u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); 288 + 289 + if (p) { 290 + u32 *old_p = __DST_METRICS_PTR(old); 291 + unsigned long prev, new; 292 + 293 + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); 294 + 295 + new = (unsigned long) p; 296 + prev = cmpxchg(&dst->_metrics, old, new); 297 + 298 + if (prev != old) { 299 + kfree(p); 300 + p = __DST_METRICS_PTR(prev); 301 + if (prev & DST_METRICS_READ_ONLY) 302 + p = NULL; 303 + } 304 + } 305 + return p; 306 + } 307 + EXPORT_SYMBOL(dst_cow_metrics_generic); 308 + 309 + /* Caller asserts that dst_metrics_read_only(dst) is false. */ 310 + void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) 311 + { 312 + unsigned long prev, new; 313 + 314 + new = (unsigned long) dst_default_metrics; 315 + prev = cmpxchg(&dst->_metrics, old, new); 316 + if (prev == old) 317 + kfree(__DST_METRICS_PTR(old)); 318 + } 319 + EXPORT_SYMBOL(__dst_destroy_metrics_generic); 287 320 288 321 /** 289 322 * skb_dst_set_noref - sets skb dst, without a reference
+13 -5
net/decnet/dn_route.c
··· 112 112 static struct dst_entry *dn_dst_check(struct dst_entry *, __u32); 113 113 static unsigned int dn_dst_default_advmss(const struct dst_entry *dst); 114 114 static unsigned int dn_dst_default_mtu(const struct dst_entry *dst); 115 + static void dn_dst_destroy(struct dst_entry *); 115 116 static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); 116 117 static void dn_dst_link_failure(struct sk_buff *); 117 118 static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu); ··· 134 133 .check = dn_dst_check, 135 134 .default_advmss = dn_dst_default_advmss, 136 135 .default_mtu = dn_dst_default_mtu, 136 + .cow_metrics = dst_cow_metrics_generic, 137 + .destroy = dn_dst_destroy, 137 138 .negative_advice = dn_dst_negative_advice, 138 139 .link_failure = dn_dst_link_failure, 139 140 .update_pmtu = dn_dst_update_pmtu, 140 141 }; 142 + 143 + static void dn_dst_destroy(struct dst_entry *dst) 144 + { 145 + dst_destroy_metrics_generic(dst); 146 + } 141 147 142 148 static __inline__ unsigned dn_hash(__le16 src, __le16 dst) 143 149 { ··· 822 814 { 823 815 struct dn_fib_info *fi = res->fi; 824 816 struct net_device *dev = rt->dst.dev; 817 + unsigned int mss_metric; 825 818 struct neighbour *n; 826 - unsigned int metric; 827 819 828 820 if (fi) { 829 821 if (DN_FIB_RES_GW(*res) && 830 822 DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 831 823 rt->rt_gateway = DN_FIB_RES_GW(*res); 832 - dst_import_metrics(&rt->dst, fi->fib_metrics); 824 + dst_init_metrics(&rt->dst, fi->fib_metrics, true); 833 825 } 834 826 rt->rt_type = res->type; 835 827 ··· 842 834 843 835 if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu) 844 836 dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu); 845 - metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS); 846 - if (metric) { 837 + mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS); 838 + if (mss_metric) { 847 839 unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst)); 848 - if (metric > mss) 840 + if (mss_metric > mss) 849 841 
dst_metric_set(&rt->dst, RTAX_ADVMSS, mss); 850 842 } 851 843 return 0;
+44 -1
net/ipv4/route.c
··· 152 152 { 153 153 } 154 154 155 + static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) 156 + { 157 + u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); 158 + 159 + if (p) { 160 + u32 *old_p = __DST_METRICS_PTR(old); 161 + unsigned long prev, new; 162 + 163 + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); 164 + 165 + new = (unsigned long) p; 166 + prev = cmpxchg(&dst->_metrics, old, new); 167 + 168 + if (prev != old) { 169 + kfree(p); 170 + p = __DST_METRICS_PTR(prev); 171 + if (prev & DST_METRICS_READ_ONLY) 172 + p = NULL; 173 + } else { 174 + struct rtable *rt = (struct rtable *) dst; 175 + 176 + if (rt->fi) { 177 + fib_info_put(rt->fi); 178 + rt->fi = NULL; 179 + } 180 + } 181 + } 182 + return p; 183 + } 184 + 155 185 static struct dst_ops ipv4_dst_ops = { 156 186 .family = AF_INET, 157 187 .protocol = cpu_to_be16(ETH_P_IP), ··· 189 159 .check = ipv4_dst_check, 190 160 .default_advmss = ipv4_default_advmss, 191 161 .default_mtu = ipv4_default_mtu, 162 + .cow_metrics = ipv4_cow_metrics, 192 163 .destroy = ipv4_dst_destroy, 193 164 .ifdown = ipv4_dst_ifdown, 194 165 .negative_advice = ipv4_negative_advice, ··· 1472 1441 1473 1442 if (rt->peer) 1474 1443 atomic_inc(&rt->peer->refcnt); 1444 + if (rt->fi) 1445 + atomic_inc(&rt->fi->fib_clntref); 1475 1446 1476 1447 if (arp_bind_neighbour(&rt->dst) || 1477 1448 !(rt->dst.neighbour->nud_state & ··· 1753 1720 struct rtable *rt = (struct rtable *) dst; 1754 1721 struct inet_peer *peer = rt->peer; 1755 1722 1723 + dst_destroy_metrics_generic(dst); 1724 + if (rt->fi) { 1725 + fib_info_put(rt->fi); 1726 + rt->fi = NULL; 1727 + } 1756 1728 if (peer) { 1757 1729 rt->peer = NULL; 1758 1730 inet_putpeer(peer); ··· 1862 1824 if (FIB_RES_GW(*res) && 1863 1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1864 1826 rt->rt_gateway = FIB_RES_GW(*res); 1865 - dst_import_metrics(dst, fi->fib_metrics); 1827 + rt->fi = fi; 1828 + atomic_inc(&fi->fib_clntref); 1829 + dst_init_metrics(dst, fi->fib_metrics, true); 1866 
1830 #ifdef CONFIG_IP_ROUTE_CLASSID 1867 1831 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1868 1832 #endif ··· 2792 2752 rt->peer = ort->peer; 2793 2753 if (rt->peer) 2794 2754 atomic_inc(&rt->peer->refcnt); 2755 + rt->fi = ort->fi; 2756 + if (rt->fi) 2757 + atomic_inc(&rt->fi->fib_clntref); 2795 2758 2796 2759 dst_free(new); 2797 2760 }
+4
net/ipv4/xfrm4_policy.c
··· 196 196 { 197 197 struct xfrm_dst *xdst = (struct xfrm_dst *)dst; 198 198 199 + dst_destroy_metrics_generic(dst); 200 + 199 201 if (likely(xdst->u.rt.peer)) 200 202 inet_putpeer(xdst->u.rt.peer); 203 + 201 204 xfrm_dst_destroy(xdst); 202 205 } 203 206 ··· 218 215 .protocol = cpu_to_be16(ETH_P_IP), 219 216 .gc = xfrm4_garbage_collect, 220 217 .update_pmtu = xfrm4_update_pmtu, 218 + .cow_metrics = dst_cow_metrics_generic, 221 219 .destroy = xfrm4_dst_destroy, 222 220 .ifdown = xfrm4_dst_ifdown, 223 221 .local_out = __ip_local_out,
+12 -3
net/ipv6/route.c
··· 105 105 .check = ip6_dst_check, 106 106 .default_advmss = ip6_default_advmss, 107 107 .default_mtu = ip6_default_mtu, 108 + .cow_metrics = dst_cow_metrics_generic, 108 109 .destroy = ip6_dst_destroy, 109 110 .ifdown = ip6_dst_ifdown, 110 111 .negative_advice = ip6_negative_advice, ··· 124 123 .destroy = ip6_dst_destroy, 125 124 .check = ip6_dst_check, 126 125 .update_pmtu = ip6_rt_blackhole_update_pmtu, 126 + }; 127 + 128 + static const u32 ip6_template_metrics[RTAX_MAX] = { 129 + [RTAX_HOPLIMIT - 1] = 255, 127 130 }; 128 131 129 132 static struct rt6_info ip6_null_entry_template = { ··· 198 193 rt->rt6i_idev = NULL; 199 194 in6_dev_put(idev); 200 195 } 196 + dst_destroy_metrics_generic(dst); 201 197 if (peer) { 202 198 BUG_ON(!(rt->rt6i_flags & RTF_CACHE)); 203 199 rt->rt6i_peer = NULL; ··· 2687 2681 net->ipv6.ip6_null_entry->dst.path = 2688 2682 (struct dst_entry *)net->ipv6.ip6_null_entry; 2689 2683 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2690 - dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255); 2684 + dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 2685 + ip6_template_metrics, true); 2691 2686 2692 2687 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 2693 2688 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, ··· 2699 2692 net->ipv6.ip6_prohibit_entry->dst.path = 2700 2693 (struct dst_entry *)net->ipv6.ip6_prohibit_entry; 2701 2694 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2702 - dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255); 2695 + dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 2696 + ip6_template_metrics, true); 2703 2697 2704 2698 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 2705 2699 sizeof(*net->ipv6.ip6_blk_hole_entry), ··· 2710 2702 net->ipv6.ip6_blk_hole_entry->dst.path = 2711 2703 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; 2712 2704 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 2713 - 
dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255); 2705 + dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 2706 + ip6_template_metrics, true); 2714 2707 #endif 2715 2708 2716 2709 net->ipv6.sysctl.flush_delay = 0;
+2
net/ipv6/xfrm6_policy.c
··· 220 220 221 221 if (likely(xdst->u.rt6.rt6i_idev)) 222 222 in6_dev_put(xdst->u.rt6.rt6i_idev); 223 + dst_destroy_metrics_generic(dst); 223 224 if (likely(xdst->u.rt6.rt6i_peer)) 224 225 inet_putpeer(xdst->u.rt6.rt6i_peer); 225 226 xfrm_dst_destroy(xdst); ··· 258 257 .protocol = cpu_to_be16(ETH_P_IPV6), 259 258 .gc = xfrm6_garbage_collect, 260 259 .update_pmtu = xfrm6_update_pmtu, 260 + .cow_metrics = dst_cow_metrics_generic, 261 261 .destroy = xfrm6_dst_destroy, 262 262 .ifdown = xfrm6_dst_ifdown, 263 263 .local_out = __ip6_local_out,