Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: fix __dst_negative_advice() race

__dst_negative_advice() does not enforce proper RCU rules when
sk->sk_dst_cache must be cleared, leading to a possible UAF.

RCU rules are that we must first clear sk->sk_dst_cache,
then call dst_release(old_dst).

Note that sk_dst_reset(sk) is implementing this protocol correctly,
while __dst_negative_advice() uses the wrong order.

Given that ip6_negative_advice() has special handling for
RTF_CACHE routes, each of the three existing ->negative_advice()
methods must perform the sk_dst_reset() itself.

Note that the check against a NULL dst is centralized in
__dst_negative_advice(), so there is no need to duplicate
it in the various callbacks.

Many thanks to Clement Lecigne for tracking this issue.

This old bug became visible after the blamed commit, using UDP sockets.

Fixes: a87cb3e48ee8 ("net: Facility to report route quality of connected sockets")
Reported-by: Clement Lecigne <clecigne@google.com>
Diagnosed-by: Clement Lecigne <clecigne@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240528114353.1794151-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Eric Dumazet and committed by Jakub Kicinski
92f1655a 068648aa

+30 -47
+1 -1
include/net/dst_ops.h
··· 24 24 void (*destroy)(struct dst_entry *); 25 25 void (*ifdown)(struct dst_entry *, 26 26 struct net_device *dev); 27 - struct dst_entry * (*negative_advice)(struct dst_entry *); 27 + void (*negative_advice)(struct sock *sk, struct dst_entry *); 28 28 void (*link_failure)(struct sk_buff *); 29 29 void (*update_pmtu)(struct dst_entry *dst, struct sock *sk, 30 30 struct sk_buff *skb, u32 mtu,
+3 -10
include/net/sock.h
··· 2063 2063 2064 2064 static inline void __dst_negative_advice(struct sock *sk) 2065 2065 { 2066 - struct dst_entry *ndst, *dst = __sk_dst_get(sk); 2066 + struct dst_entry *dst = __sk_dst_get(sk); 2067 2067 2068 - if (dst && dst->ops->negative_advice) { 2069 - ndst = dst->ops->negative_advice(dst); 2070 - 2071 - if (ndst != dst) { 2072 - rcu_assign_pointer(sk->sk_dst_cache, ndst); 2073 - sk_tx_queue_clear(sk); 2074 - WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 2075 - } 2076 - } 2068 + if (dst && dst->ops->negative_advice) 2069 + dst->ops->negative_advice(sk, dst); 2077 2070 } 2078 2071 2079 2072 static inline void dst_negative_advice(struct sock *sk)
+8 -14
net/ipv4/route.c
··· 129 129 static unsigned int ipv4_default_advmss(const struct dst_entry *dst); 130 130 INDIRECT_CALLABLE_SCOPE 131 131 unsigned int ipv4_mtu(const struct dst_entry *dst); 132 - static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 132 + static void ipv4_negative_advice(struct sock *sk, 133 + struct dst_entry *dst); 133 134 static void ipv4_link_failure(struct sk_buff *skb); 134 135 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, 135 136 struct sk_buff *skb, u32 mtu, ··· 826 825 __ip_do_redirect(rt, skb, &fl4, true); 827 826 } 828 827 829 - static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 828 + static void ipv4_negative_advice(struct sock *sk, 829 + struct dst_entry *dst) 830 830 { 831 831 struct rtable *rt = dst_rtable(dst); 832 - struct dst_entry *ret = dst; 833 832 834 - if (rt) { 835 - if (dst->obsolete > 0) { 836 - ip_rt_put(rt); 837 - ret = NULL; 838 - } else if ((rt->rt_flags & RTCF_REDIRECTED) || 839 - rt->dst.expires) { 840 - ip_rt_put(rt); 841 - ret = NULL; 842 - } 843 - } 844 - return ret; 833 + if ((dst->obsolete > 0) || 834 + (rt->rt_flags & RTCF_REDIRECTED) || 835 + rt->dst.expires) 836 + sk_dst_reset(sk); 845 837 } 846 838 847 839 /*
+15 -14
net/ipv6/route.c
··· 87 87 static unsigned int ip6_default_advmss(const struct dst_entry *dst); 88 88 INDIRECT_CALLABLE_SCOPE 89 89 unsigned int ip6_mtu(const struct dst_entry *dst); 90 - static struct dst_entry *ip6_negative_advice(struct dst_entry *); 90 + static void ip6_negative_advice(struct sock *sk, 91 + struct dst_entry *dst); 91 92 static void ip6_dst_destroy(struct dst_entry *); 92 93 static void ip6_dst_ifdown(struct dst_entry *, 93 94 struct net_device *dev); ··· 2771 2770 } 2772 2771 EXPORT_INDIRECT_CALLABLE(ip6_dst_check); 2773 2772 2774 - static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) 2773 + static void ip6_negative_advice(struct sock *sk, 2774 + struct dst_entry *dst) 2775 2775 { 2776 2776 struct rt6_info *rt = dst_rt6_info(dst); 2777 2777 2778 - if (rt) { 2779 - if (rt->rt6i_flags & RTF_CACHE) { 2780 - rcu_read_lock(); 2781 - if (rt6_check_expired(rt)) { 2782 - rt6_remove_exception_rt(rt); 2783 - dst = NULL; 2784 - } 2785 - rcu_read_unlock(); 2786 - } else { 2787 - dst_release(dst); 2788 - dst = NULL; 2778 + if (rt->rt6i_flags & RTF_CACHE) { 2779 + rcu_read_lock(); 2780 + if (rt6_check_expired(rt)) { 2781 + /* counteract the dst_release() in sk_dst_reset() */ 2782 + dst_hold(dst); 2783 + sk_dst_reset(sk); 2784 + 2785 + rt6_remove_exception_rt(rt); 2789 2786 } 2787 + rcu_read_unlock(); 2788 + return; 2790 2789 } 2791 - return dst; 2790 + sk_dst_reset(sk); 2792 2791 } 2793 2792 2794 2793 static void ip6_link_failure(struct sk_buff *skb)
+3 -8
net/xfrm/xfrm_policy.c
··· 3910 3910 /* Impossible. Such dst must be popped before reaches point of failure. */ 3911 3911 } 3912 3912 3913 - static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) 3913 + static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) 3914 3914 { 3915 - if (dst) { 3916 - if (dst->obsolete) { 3917 - dst_release(dst); 3918 - dst = NULL; 3919 - } 3920 - } 3921 - return dst; 3915 + if (dst->obsolete) 3916 + sk_dst_reset(sk); 3922 3917 } 3923 3918 3924 3919 static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr)