Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

raw: Fix NULL deref in raw_get_next().

Dae R. Jeong reported a NULL deref in raw_get_next() [0].

It seems that the repro was running the following sequence in parallel,
so that one thread was iterating over a socket that was being freed in
another netns.

unshare(0x40060200)
r0 = syz_open_procfs(0x0, &(0x7f0000002080)='net/raw\x00')
socket$inet_icmp_raw(0x2, 0x3, 0x1)
pread64(r0, &(0x7f0000000000)=""/10, 0xa, 0x10000000007f)

After commit 0daf07e52709 ("raw: convert raw sockets to RCU"), we
use RCU and hlist_nulls_for_each_entry() to iterate over SOCK_RAW
sockets. However, we should use a spinlock for slow paths to avoid
the NULL deref.

Also, SOCK_RAW does not use SLAB_TYPESAFE_BY_RCU, and the slab object
is not reused during iteration in the grace period. In fact, the
lockless readers do not check the nulls marker with get_nulls_value().
So, SOCK_RAW should use hlist instead of hlist_nulls.

Instead of adding an unnecessary barrier via sk_nulls_for_each_rcu(),
let's convert hlist_nulls to hlist and use sk_for_each_rcu() for
fast paths, and sk_for_each() under the spinlock for /proc/net/raw.

[0]:
general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN
KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f]
CPU: 2 PID: 20952 Comm: syz-executor.0 Not tainted 6.2.0-g048ec869bafd-dirty #7
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
RIP: 0010:read_pnet include/net/net_namespace.h:383 [inline]
RIP: 0010:sock_net include/net/sock.h:649 [inline]
RIP: 0010:raw_get_next net/ipv4/raw.c:974 [inline]
RIP: 0010:raw_get_idx net/ipv4/raw.c:986 [inline]
RIP: 0010:raw_seq_start+0x431/0x800 net/ipv4/raw.c:995
Code: ef e8 33 3d 94 f7 49 8b 6d 00 4c 89 ef e8 b7 65 5f f7 49 89 ed 49 83 c5 98 0f 84 9a 00 00 00 48 83 c5 c8 48 89 e8 48 c1 e8 03 <42> 80 3c 30 00 74 08 48 89 ef e8 00 3d 94 f7 4c 8b 7d 00 48 89 ef
RSP: 0018:ffffc9001154f9b0 EFLAGS: 00010206
RAX: 0000000000000005 RBX: 1ffff1100302c8fd RCX: 0000000000000000
RDX: 0000000000000028 RSI: ffffc9001154f988 RDI: ffffc9000f77a338
RBP: 0000000000000029 R08: ffffffff8a50ffb4 R09: fffffbfff24b6bd9
R10: fffffbfff24b6bd9 R11: 0000000000000000 R12: ffff88801db73b78
R13: fffffffffffffff9 R14: dffffc0000000000 R15: 0000000000000030
FS: 00007f843ae8e700(0000) GS:ffff888063700000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055bb9614b35f CR3: 000000003c672000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
seq_read_iter+0x4c6/0x10f0 fs/seq_file.c:225
seq_read+0x224/0x320 fs/seq_file.c:162
pde_read fs/proc/inode.c:316 [inline]
proc_reg_read+0x23f/0x330 fs/proc/inode.c:328
vfs_read+0x31e/0xd30 fs/read_write.c:468
ksys_pread64 fs/read_write.c:665 [inline]
__do_sys_pread64 fs/read_write.c:675 [inline]
__se_sys_pread64 fs/read_write.c:672 [inline]
__x64_sys_pread64+0x1e9/0x280 fs/read_write.c:672
do_syscall_x64 arch/x86/entry/common.c:51 [inline]
do_syscall_64+0x4e/0xa0 arch/x86/entry/common.c:82
entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x478d29
Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f843ae8dbe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000011
RAX: ffffffffffffffda RBX: 0000000000791408 RCX: 0000000000478d29
RDX: 000000000000000a RSI: 0000000020000000 RDI: 0000000000000003
RBP: 00000000f477909a R08: 0000000000000000 R09: 0000000000000000
R10: 000010000000007f R11: 0000000000000246 R12: 0000000000791740
R13: 0000000000791414 R14: 0000000000791408 R15: 00007ffc2eb48a50
</TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:read_pnet include/net/net_namespace.h:383 [inline]
RIP: 0010:sock_net include/net/sock.h:649 [inline]
RIP: 0010:raw_get_next net/ipv4/raw.c:974 [inline]
RIP: 0010:raw_get_idx net/ipv4/raw.c:986 [inline]
RIP: 0010:raw_seq_start+0x431/0x800 net/ipv4/raw.c:995
Code: ef e8 33 3d 94 f7 49 8b 6d 00 4c 89 ef e8 b7 65 5f f7 49 89 ed 49 83 c5 98 0f 84 9a 00 00 00 48 83 c5 c8 48 89 e8 48 c1 e8 03 <42> 80 3c 30 00 74 08 48 89 ef e8 00 3d 94 f7 4c 8b 7d 00 48 89 ef
RSP: 0018:ffffc9001154f9b0 EFLAGS: 00010206
RAX: 0000000000000005 RBX: 1ffff1100302c8fd RCX: 0000000000000000
RDX: 0000000000000028 RSI: ffffc9001154f988 RDI: ffffc9000f77a338
RBP: 0000000000000029 R08: ffffffff8a50ffb4 R09: fffffbfff24b6bd9
R10: fffffbfff24b6bd9 R11: 0000000000000000 R12: ffff88801db73b78
R13: fffffffffffffff9 R14: dffffc0000000000 R15: 0000000000000030
FS: 00007f843ae8e700(0000) GS:ffff888063700000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f92ff166000 CR3: 000000003c672000 CR4: 00000000003506e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400

Fixes: 0daf07e52709 ("raw: convert raw sockets to RCU")
Reported-by: syzbot <syzkaller@googlegroups.com>
Reported-by: Dae R. Jeong <threeearcat@gmail.com>
Link: https://lore.kernel.org/netdev/ZCA2mGV_cmq7lIfV@dragonet/
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Kuniyuki Iwashima and committed by
Jakub Kicinski
0a78cf72 218c5973

+29 -31
+2 -2
include/net/raw.h
··· 37 37 struct raw_hashinfo { 38 38 spinlock_t lock; 39 39 40 - struct hlist_nulls_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned; 40 + struct hlist_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned; 41 41 }; 42 42 43 43 static inline u32 raw_hashfunc(const struct net *net, u32 proto) ··· 51 51 52 52 spin_lock_init(&hashinfo->lock); 53 53 for (i = 0; i < RAW_HTABLE_SIZE; i++) 54 - INIT_HLIST_NULLS_HEAD(&hashinfo->ht[i], i); 54 + INIT_HLIST_HEAD(&hashinfo->ht[i]); 55 55 } 56 56 57 57 #ifdef CONFIG_PROC_FS
+19 -17
net/ipv4/raw.c
··· 91 91 int raw_hash_sk(struct sock *sk) 92 92 { 93 93 struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; 94 - struct hlist_nulls_head *hlist; 94 + struct hlist_head *hlist; 95 95 96 96 hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; 97 97 98 98 spin_lock(&h->lock); 99 - __sk_nulls_add_node_rcu(sk, hlist); 99 + sk_add_node_rcu(sk, hlist); 100 100 sock_set_flag(sk, SOCK_RCU_FREE); 101 101 spin_unlock(&h->lock); 102 102 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); ··· 110 110 struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; 111 111 112 112 spin_lock(&h->lock); 113 - if (__sk_nulls_del_node_init_rcu(sk)) 113 + if (sk_del_node_init_rcu(sk)) 114 114 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); 115 115 spin_unlock(&h->lock); 116 116 } ··· 163 163 static int raw_v4_input(struct net *net, struct sk_buff *skb, 164 164 const struct iphdr *iph, int hash) 165 165 { 166 - struct hlist_nulls_head *hlist; 167 - struct hlist_nulls_node *hnode; 168 166 int sdif = inet_sdif(skb); 167 + struct hlist_head *hlist; 169 168 int dif = inet_iif(skb); 170 169 int delivered = 0; 171 170 struct sock *sk; 172 171 173 172 hlist = &raw_v4_hashinfo.ht[hash]; 174 173 rcu_read_lock(); 175 - sk_nulls_for_each(sk, hnode, hlist) { 174 + sk_for_each_rcu(sk, hlist) { 176 175 if (!raw_v4_match(net, sk, iph->protocol, 177 176 iph->saddr, iph->daddr, dif, sdif)) 178 177 continue; ··· 263 264 void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) 264 265 { 265 266 struct net *net = dev_net(skb->dev); 266 - struct hlist_nulls_head *hlist; 267 - struct hlist_nulls_node *hnode; 268 267 int dif = skb->dev->ifindex; 269 268 int sdif = inet_sdif(skb); 269 + struct hlist_head *hlist; 270 270 const struct iphdr *iph; 271 271 struct sock *sk; 272 272 int hash; ··· 274 276 hlist = &raw_v4_hashinfo.ht[hash]; 275 277 276 278 rcu_read_lock(); 277 - sk_nulls_for_each(sk, hnode, hlist) { 279 + sk_for_each_rcu(sk, hlist) { 278 280 iph = (const struct iphdr *)skb->data; 279 281 if 
(!raw_v4_match(net, sk, iph->protocol, 280 282 iph->daddr, iph->saddr, dif, sdif)) ··· 948 950 { 949 951 struct raw_hashinfo *h = pde_data(file_inode(seq->file)); 950 952 struct raw_iter_state *state = raw_seq_private(seq); 951 - struct hlist_nulls_head *hlist; 952 - struct hlist_nulls_node *hnode; 953 + struct hlist_head *hlist; 953 954 struct sock *sk; 954 955 955 956 for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; 956 957 ++state->bucket) { 957 958 hlist = &h->ht[state->bucket]; 958 - sk_nulls_for_each(sk, hnode, hlist) { 959 + sk_for_each(sk, hlist) { 959 960 if (sock_net(sk) == seq_file_net(seq)) 960 961 return sk; 961 962 } ··· 967 970 struct raw_iter_state *state = raw_seq_private(seq); 968 971 969 972 do { 970 - sk = sk_nulls_next(sk); 973 + sk = sk_next(sk); 971 974 } while (sk && sock_net(sk) != seq_file_net(seq)); 972 975 973 976 if (!sk) ··· 986 989 } 987 990 988 991 void *raw_seq_start(struct seq_file *seq, loff_t *pos) 989 - __acquires(RCU) 992 + __acquires(&h->lock) 990 993 { 991 - rcu_read_lock(); 994 + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); 995 + 996 + spin_lock(&h->lock); 997 + 992 998 return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 993 999 } 994 1000 EXPORT_SYMBOL_GPL(raw_seq_start); ··· 1010 1010 EXPORT_SYMBOL_GPL(raw_seq_next); 1011 1011 1012 1012 void raw_seq_stop(struct seq_file *seq, void *v) 1013 - __releases(RCU) 1013 + __releases(&h->lock) 1014 1014 { 1015 - rcu_read_unlock(); 1015 + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); 1016 + 1017 + spin_unlock(&h->lock); 1016 1018 } 1017 1019 EXPORT_SYMBOL_GPL(raw_seq_stop); 1018 1020
+4 -6
net/ipv4/raw_diag.c
··· 57 57 static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) 58 58 { 59 59 struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); 60 - struct hlist_nulls_head *hlist; 61 - struct hlist_nulls_node *hnode; 60 + struct hlist_head *hlist; 62 61 struct sock *sk; 63 62 int slot; 64 63 ··· 67 68 rcu_read_lock(); 68 69 for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { 69 70 hlist = &hashinfo->ht[slot]; 70 - sk_nulls_for_each(sk, hnode, hlist) { 71 + sk_for_each_rcu(sk, hlist) { 71 72 if (raw_lookup(net, sk, r)) { 72 73 /* 73 74 * Grab it and keep until we fill ··· 141 142 struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); 142 143 struct net *net = sock_net(skb->sk); 143 144 struct inet_diag_dump_data *cb_data; 144 - struct hlist_nulls_head *hlist; 145 - struct hlist_nulls_node *hnode; 146 145 int num, s_num, slot, s_slot; 146 + struct hlist_head *hlist; 147 147 struct sock *sk = NULL; 148 148 struct nlattr *bc; 149 149 ··· 159 161 num = 0; 160 162 161 163 hlist = &hashinfo->ht[slot]; 162 - sk_nulls_for_each(sk, hnode, hlist) { 164 + sk_for_each_rcu(sk, hlist) { 163 165 struct inet_sock *inet = inet_sk(sk); 164 166 165 167 if (!net_eq(sock_net(sk), net))
+4 -6
net/ipv6/raw.c
··· 141 141 static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) 142 142 { 143 143 struct net *net = dev_net(skb->dev); 144 - struct hlist_nulls_head *hlist; 145 - struct hlist_nulls_node *hnode; 146 144 const struct in6_addr *saddr; 147 145 const struct in6_addr *daddr; 146 + struct hlist_head *hlist; 148 147 struct sock *sk; 149 148 bool delivered = false; 150 149 __u8 hash; ··· 154 155 hash = raw_hashfunc(net, nexthdr); 155 156 hlist = &raw_v6_hashinfo.ht[hash]; 156 157 rcu_read_lock(); 157 - sk_nulls_for_each(sk, hnode, hlist) { 158 + sk_for_each_rcu(sk, hlist) { 158 159 int filtered; 159 160 160 161 if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, ··· 332 333 u8 type, u8 code, int inner_offset, __be32 info) 333 334 { 334 335 struct net *net = dev_net(skb->dev); 335 - struct hlist_nulls_head *hlist; 336 - struct hlist_nulls_node *hnode; 336 + struct hlist_head *hlist; 337 337 struct sock *sk; 338 338 int hash; 339 339 340 340 hash = raw_hashfunc(net, nexthdr); 341 341 hlist = &raw_v6_hashinfo.ht[hash]; 342 342 rcu_read_lock(); 343 - sk_nulls_for_each(sk, hnode, hlist) { 343 + sk_for_each_rcu(sk, hlist) { 344 344 /* Note: ipv6_hdr(skb) != skb->data */ 345 345 const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; 346 346