Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'af_unix-rework-gc'

Kuniyuki Iwashima says:

====================
af_unix: Rework GC.

When we pass a file descriptor to an AF_UNIX socket via SCM_RIGHTS,
the underlying struct file of the inflight fd gets its refcount bumped.
If the fd is of an AF_UNIX socket, we need to track it in case it forms
cyclic references.

Let's say we send a fd of AF_UNIX socket A to B and vice versa and
close() both sockets.

When created, each socket's struct file initially has one reference.
After the fd exchange, both refcounts are bumped up to 2. Then, close()
decreases both to 1. From this point on, no one can touch the file/socket.

However, the struct file has one refcount and thus never calls the
release() function of the AF_UNIX socket.

That's why we need to track all inflight AF_UNIX sockets and run garbage
collection.

This series replaces the current GC implementation that locks each inflight
socket's receive queue and requires trickiness in other places.

The new GC does not lock each socket's queue to minimise its effect and
tries to be lightweight if there is no cyclic reference or no update in
the shape of the inflight fd graph.

The new implementation is based on Tarjan's Strongly Connected Components
algorithm, and we will consider each inflight AF_UNIX socket as a vertex
and its file descriptor as an edge in a directed graph.

For the details, please see each patch.

patch 1 - 3 : Add struct to express inflight socket graphs
patch 4 : Optimise inflight fd counting
patch 5 - 6 : Group SCC possibly forming a cycle
patch 7 - 8 : Support embryo socket
patch 9 - 11 : Make GC lightweight
patch 12 - 13 : Detect dead cycle references
patch 14 : Replace GC algorithm
patch 15 : selftest

After this series is applied, we can remove the two ugly tricks for race,
scm_fp_dup() in unix_attach_fds() and spin_lock dance in unix_peek_fds()
as done in patch 14/15 of v1.

Also, we will add cond_resched_lock() in __unix_gc() and convert it to
use a dedicated kthread instead of global system workqueue as suggested
by Paolo in a v4 thread.

v4: https://lore.kernel.org/netdev/20240301022243.73908-1-kuniyu@amazon.com/
v3: https://lore.kernel.org/netdev/20240223214003.17369-1-kuniyu@amazon.com/
v2: https://lore.kernel.org/netdev/20240216210556.65913-1-kuniyu@amazon.com/
v1: https://lore.kernel.org/netdev/20240203030058.60750-1-kuniyu@amazon.com/
====================

Link: https://lore.kernel.org/r/20240325202425.60930-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+742 -212
+23 -8
include/net/af_unix.h
··· 19 19 20 20 extern spinlock_t unix_gc_lock; 21 21 extern unsigned int unix_tot_inflight; 22 - 23 - void unix_inflight(struct user_struct *user, struct file *fp); 24 - void unix_notinflight(struct user_struct *user, struct file *fp); 22 + void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); 23 + void unix_del_edges(struct scm_fp_list *fpl); 24 + void unix_update_edges(struct unix_sock *receiver); 25 + int unix_prepare_fpl(struct scm_fp_list *fpl); 26 + void unix_destroy_fpl(struct scm_fp_list *fpl); 25 27 void unix_gc(void); 26 28 void wait_for_unix_gc(struct scm_fp_list *fpl); 29 + 30 + struct unix_vertex { 31 + struct list_head edges; 32 + struct list_head entry; 33 + struct list_head scc_entry; 34 + unsigned long out_degree; 35 + unsigned long index; 36 + unsigned long scc_index; 37 + }; 38 + 39 + struct unix_edge { 40 + struct unix_sock *predecessor; 41 + struct unix_sock *successor; 42 + struct list_head vertex_entry; 43 + struct list_head stack_entry; 44 + }; 27 45 28 46 struct sock *unix_peer_get(struct sock *sk); 29 47 ··· 80 62 struct path path; 81 63 struct mutex iolock, bindlock; 82 64 struct sock *peer; 83 - struct list_head link; 84 - unsigned long inflight; 65 + struct sock *listener; 66 + struct unix_vertex *vertex; 85 67 spinlock_t lock; 86 - unsigned long gc_flags; 87 - #define UNIX_GC_CANDIDATE 0 88 - #define UNIX_GC_MAYBE_CYCLE 1 89 68 struct socket_wq peer_wq; 90 69 wait_queue_entry_t peer_wake; 91 70 struct scm_stat scm_stat;
+9
include/net/scm.h
··· 23 23 kgid_t gid; 24 24 }; 25 25 26 + #ifdef CONFIG_UNIX 27 + struct unix_edge; 28 + #endif 29 + 26 30 struct scm_fp_list { 27 31 short count; 28 32 short count_unix; 29 33 short max; 34 + #ifdef CONFIG_UNIX 35 + bool inflight; 36 + struct list_head vertices; 37 + struct unix_edge *edges; 38 + #endif 30 39 struct user_struct *user; 31 40 struct file *fp[SCM_MAX_FD]; 32 41 };
+11
net/core/scm.c
··· 89 89 fpl->count_unix = 0; 90 90 fpl->max = SCM_MAX_FD; 91 91 fpl->user = NULL; 92 + #if IS_ENABLED(CONFIG_UNIX) 93 + fpl->inflight = false; 94 + fpl->edges = NULL; 95 + INIT_LIST_HEAD(&fpl->vertices); 96 + #endif 92 97 } 93 98 fpp = &fpl->fp[fpl->count]; 94 99 ··· 381 376 if (new_fpl) { 382 377 for (i = 0; i < fpl->count; i++) 383 378 get_file(fpl->fp[i]); 379 + 384 380 new_fpl->max = new_fpl->count; 385 381 new_fpl->user = get_uid(fpl->user); 382 + #if IS_ENABLED(CONFIG_UNIX) 383 + new_fpl->inflight = false; 384 + new_fpl->edges = NULL; 385 + INIT_LIST_HEAD(&new_fpl->vertices); 386 + #endif 386 387 } 387 388 return new_fpl; 388 389 }
+14 -13
net/unix/af_unix.c
··· 979 979 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; 980 980 sk->sk_destruct = unix_sock_destructor; 981 981 u = unix_sk(sk); 982 - u->inflight = 0; 982 + u->listener = NULL; 983 + u->vertex = NULL; 983 984 u->path.dentry = NULL; 984 985 u->path.mnt = NULL; 985 986 spin_lock_init(&u->lock); 986 - INIT_LIST_HEAD(&u->link); 987 987 mutex_init(&u->iolock); /* single task reading lock */ 988 988 mutex_init(&u->bindlock); /* single task binding lock */ 989 989 init_waitqueue_head(&u->peer_wait); ··· 1597 1597 newsk->sk_type = sk->sk_type; 1598 1598 init_peercred(newsk); 1599 1599 newu = unix_sk(newsk); 1600 + newu->listener = other; 1600 1601 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); 1601 1602 otheru = unix_sk(other); 1602 1603 ··· 1693 1692 bool kern) 1694 1693 { 1695 1694 struct sock *sk = sock->sk; 1696 - struct sock *tsk; 1697 1695 struct sk_buff *skb; 1696 + struct sock *tsk; 1698 1697 int err; 1699 1698 1700 1699 err = -EOPNOTSUPP; ··· 1719 1718 } 1720 1719 1721 1720 tsk = skb->sk; 1721 + unix_update_edges(unix_sk(tsk)); 1722 1722 skb_free_datagram(sk, skb); 1723 1723 wake_up_interruptible(&unix_sk(sk)->peer_wait); 1724 1724 ··· 1791 1789 1792 1790 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1793 1791 { 1794 - int i; 1795 - 1796 1792 if (too_many_unix_fds(current)) 1797 1793 return -ETOOMANYREFS; 1798 1794 ··· 1802 1802 if (!UNIXCB(skb).fp) 1803 1803 return -ENOMEM; 1804 1804 1805 - for (i = scm->fp->count - 1; i >= 0; i--) 1806 - unix_inflight(scm->fp->user, scm->fp->fp[i]); 1805 + if (unix_prepare_fpl(UNIXCB(skb).fp)) 1806 + return -ENOMEM; 1807 1807 1808 1808 return 0; 1809 1809 } 1810 1810 1811 1811 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) 1812 1812 { 1813 - int i; 1814 - 1815 1813 scm->fp = UNIXCB(skb).fp; 1816 1814 UNIXCB(skb).fp = NULL; 1817 1815 1818 - for (i = scm->fp->count - 1; i >= 0; i--) 1819 - unix_notinflight(scm->fp->user, scm->fp->fp[i]); 1816 + 
unix_destroy_fpl(scm->fp); 1820 1817 } 1821 1818 1822 1819 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) ··· 1934 1937 struct scm_fp_list *fp = UNIXCB(skb).fp; 1935 1938 struct unix_sock *u = unix_sk(sk); 1936 1939 1937 - if (unlikely(fp && fp->count)) 1940 + if (unlikely(fp && fp->count)) { 1938 1941 atomic_add(fp->count, &u->scm_stat.nr_fds); 1942 + unix_add_edges(fp, u); 1943 + } 1939 1944 } 1940 1945 1941 1946 static void scm_stat_del(struct sock *sk, struct sk_buff *skb) ··· 1945 1946 struct scm_fp_list *fp = UNIXCB(skb).fp; 1946 1947 struct unix_sock *u = unix_sk(sk); 1947 1948 1948 - if (unlikely(fp && fp->count)) 1949 + if (unlikely(fp && fp->count)) { 1949 1950 atomic_sub(fp->count, &u->scm_stat.nr_fds); 1951 + unix_del_edges(fp); 1952 + } 1950 1953 } 1951 1954 1952 1955 /*
+397 -190
net/unix/garbage.c
··· 101 101 return NULL; 102 102 } 103 103 104 + static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) 105 + { 106 + /* If an embryo socket has a fd, 107 + * the listener indirectly holds the fd's refcnt. 108 + */ 109 + if (edge->successor->listener) 110 + return unix_sk(edge->successor->listener)->vertex; 111 + 112 + return edge->successor->vertex; 113 + } 114 + 115 + static bool unix_graph_maybe_cyclic; 116 + static bool unix_graph_grouped; 117 + 118 + static void unix_update_graph(struct unix_vertex *vertex) 119 + { 120 + /* If the receiver socket is not inflight, no cyclic 121 + * reference could be formed. 122 + */ 123 + if (!vertex) 124 + return; 125 + 126 + unix_graph_maybe_cyclic = true; 127 + unix_graph_grouped = false; 128 + } 129 + 130 + static LIST_HEAD(unix_unvisited_vertices); 131 + 132 + enum unix_vertex_index { 133 + UNIX_VERTEX_INDEX_MARK1, 134 + UNIX_VERTEX_INDEX_MARK2, 135 + UNIX_VERTEX_INDEX_START, 136 + }; 137 + 138 + static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; 139 + 140 + static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) 141 + { 142 + struct unix_vertex *vertex = edge->predecessor->vertex; 143 + 144 + if (!vertex) { 145 + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); 146 + vertex->index = unix_vertex_unvisited_index; 147 + vertex->out_degree = 0; 148 + INIT_LIST_HEAD(&vertex->edges); 149 + INIT_LIST_HEAD(&vertex->scc_entry); 150 + 151 + list_move_tail(&vertex->entry, &unix_unvisited_vertices); 152 + edge->predecessor->vertex = vertex; 153 + } 154 + 155 + vertex->out_degree++; 156 + list_add_tail(&edge->vertex_entry, &vertex->edges); 157 + 158 + unix_update_graph(unix_edge_successor(edge)); 159 + } 160 + 161 + static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) 162 + { 163 + struct unix_vertex *vertex = edge->predecessor->vertex; 164 + 165 + unix_update_graph(unix_edge_successor(edge)); 166 + 167 + list_del(&edge->vertex_entry); 
168 + vertex->out_degree--; 169 + 170 + if (!vertex->out_degree) { 171 + edge->predecessor->vertex = NULL; 172 + list_move_tail(&vertex->entry, &fpl->vertices); 173 + } 174 + } 175 + 176 + static void unix_free_vertices(struct scm_fp_list *fpl) 177 + { 178 + struct unix_vertex *vertex, *next_vertex; 179 + 180 + list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { 181 + list_del(&vertex->entry); 182 + kfree(vertex); 183 + } 184 + } 185 + 104 186 DEFINE_SPINLOCK(unix_gc_lock); 105 187 unsigned int unix_tot_inflight; 106 - static LIST_HEAD(gc_candidates); 107 - static LIST_HEAD(gc_inflight_list); 108 188 109 - /* Keep the number of times in flight count for the file 110 - * descriptor if it is for an AF_UNIX socket. 111 - */ 112 - void unix_inflight(struct user_struct *user, struct file *filp) 189 + void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) 113 190 { 114 - struct unix_sock *u = unix_get_socket(filp); 191 + int i = 0, j = 0; 115 192 116 193 spin_lock(&unix_gc_lock); 117 194 118 - if (u) { 119 - if (!u->inflight) { 120 - WARN_ON_ONCE(!list_empty(&u->link)); 121 - list_add_tail(&u->link, &gc_inflight_list); 122 - } else { 123 - WARN_ON_ONCE(list_empty(&u->link)); 124 - } 125 - u->inflight++; 195 + if (!fpl->count_unix) 196 + goto out; 126 197 127 - /* Paired with READ_ONCE() in wait_for_unix_gc() */ 128 - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); 129 - } 198 + do { 199 + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); 200 + struct unix_edge *edge; 130 201 131 - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); 202 + if (!inflight) 203 + continue; 204 + 205 + edge = fpl->edges + i++; 206 + edge->predecessor = inflight; 207 + edge->successor = receiver; 208 + 209 + unix_add_edge(fpl, edge); 210 + } while (i < fpl->count_unix); 211 + 212 + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); 213 + out: 214 + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + 
fpl->count); 132 215 133 216 spin_unlock(&unix_gc_lock); 217 + 218 + fpl->inflight = true; 219 + 220 + unix_free_vertices(fpl); 134 221 } 135 222 136 - void unix_notinflight(struct user_struct *user, struct file *filp) 223 + void unix_del_edges(struct scm_fp_list *fpl) 137 224 { 138 - struct unix_sock *u = unix_get_socket(filp); 225 + int i = 0; 139 226 140 227 spin_lock(&unix_gc_lock); 141 228 142 - if (u) { 143 - WARN_ON_ONCE(!u->inflight); 144 - WARN_ON_ONCE(list_empty(&u->link)); 229 + if (!fpl->count_unix) 230 + goto out; 145 231 146 - u->inflight--; 147 - if (!u->inflight) 148 - list_del_init(&u->link); 232 + do { 233 + struct unix_edge *edge = fpl->edges + i++; 149 234 150 - /* Paired with READ_ONCE() in wait_for_unix_gc() */ 151 - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); 152 - } 235 + unix_del_edge(fpl, edge); 236 + } while (i < fpl->count_unix); 153 237 154 - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); 238 + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); 239 + out: 240 + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); 155 241 242 + spin_unlock(&unix_gc_lock); 243 + 244 + fpl->inflight = false; 245 + } 246 + 247 + void unix_update_edges(struct unix_sock *receiver) 248 + { 249 + spin_lock(&unix_gc_lock); 250 + unix_update_graph(unix_sk(receiver->listener)->vertex); 251 + receiver->listener = NULL; 156 252 spin_unlock(&unix_gc_lock); 157 253 } 158 254 159 - static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), 160 - struct sk_buff_head *hitlist) 255 + int unix_prepare_fpl(struct scm_fp_list *fpl) 161 256 { 162 - struct sk_buff *skb; 163 - struct sk_buff *next; 257 + struct unix_vertex *vertex; 258 + int i; 164 259 165 - spin_lock(&x->sk_receive_queue.lock); 166 - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { 167 - /* Do we have file descriptors ? 
*/ 168 - if (UNIXCB(skb).fp) { 169 - bool hit = false; 170 - /* Process the descriptors of this socket */ 171 - int nfd = UNIXCB(skb).fp->count; 172 - struct file **fp = UNIXCB(skb).fp->fp; 260 + if (!fpl->count_unix) 261 + return 0; 173 262 174 - while (nfd--) { 175 - /* Get the socket the fd matches if it indeed does so */ 176 - struct unix_sock *u = unix_get_socket(*fp++); 263 + for (i = 0; i < fpl->count_unix; i++) { 264 + vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); 265 + if (!vertex) 266 + goto err; 177 267 178 - /* Ignore non-candidates, they could have been added 179 - * to the queues after starting the garbage collection 180 - */ 181 - if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { 182 - hit = true; 183 - 184 - func(u); 185 - } 186 - } 187 - if (hit && hitlist != NULL) { 188 - __skb_unlink(skb, &x->sk_receive_queue); 189 - __skb_queue_tail(hitlist, skb); 190 - } 191 - } 268 + list_add(&vertex->entry, &fpl->vertices); 192 269 } 193 - spin_unlock(&x->sk_receive_queue.lock); 270 + 271 + fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), 272 + GFP_KERNEL_ACCOUNT); 273 + if (!fpl->edges) 274 + goto err; 275 + 276 + return 0; 277 + 278 + err: 279 + unix_free_vertices(fpl); 280 + return -ENOMEM; 194 281 } 195 282 196 - static void scan_children(struct sock *x, void (*func)(struct unix_sock *), 197 - struct sk_buff_head *hitlist) 283 + void unix_destroy_fpl(struct scm_fp_list *fpl) 198 284 { 199 - if (x->sk_state != TCP_LISTEN) { 200 - scan_inflight(x, func, hitlist); 201 - } else { 202 - struct sk_buff *skb; 203 - struct sk_buff *next; 204 - struct unix_sock *u; 205 - LIST_HEAD(embryos); 285 + if (fpl->inflight) 286 + unix_del_edges(fpl); 206 287 207 - /* For a listening socket collect the queued embryos 208 - * and perform a scan on them as well. 
288 + kvfree(fpl->edges); 289 + unix_free_vertices(fpl); 290 + } 291 + 292 + static bool unix_vertex_dead(struct unix_vertex *vertex) 293 + { 294 + struct unix_edge *edge; 295 + struct unix_sock *u; 296 + long total_ref; 297 + 298 + list_for_each_entry(edge, &vertex->edges, vertex_entry) { 299 + struct unix_vertex *next_vertex = unix_edge_successor(edge); 300 + 301 + /* The vertex's fd can be received by a non-inflight socket. */ 302 + if (!next_vertex) 303 + return false; 304 + 305 + /* The vertex's fd can be received by an inflight socket in 306 + * another SCC. 209 307 */ 210 - spin_lock(&x->sk_receive_queue.lock); 211 - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { 212 - u = unix_sk(skb->sk); 308 + if (next_vertex->scc_index != vertex->scc_index) 309 + return false; 310 + } 213 311 214 - /* An embryo cannot be in-flight, so it's safe 215 - * to use the list link. 216 - */ 217 - WARN_ON_ONCE(!list_empty(&u->link)); 218 - list_add_tail(&u->link, &embryos); 219 - } 220 - spin_unlock(&x->sk_receive_queue.lock); 312 + /* No receiver exists out of the same SCC. */ 221 313 222 - while (!list_empty(&embryos)) { 223 - u = list_entry(embryos.next, struct unix_sock, link); 224 - scan_inflight(&u->sk, func, hitlist); 225 - list_del_init(&u->link); 314 + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); 315 + u = edge->predecessor; 316 + total_ref = file_count(u->sk.sk_socket->file); 317 + 318 + /* If not close()d, total_ref > out_degree. 
*/ 319 + if (total_ref != vertex->out_degree) 320 + return false; 321 + 322 + return true; 323 + } 324 + 325 + enum unix_recv_queue_lock_class { 326 + U_RECVQ_LOCK_NORMAL, 327 + U_RECVQ_LOCK_EMBRYO, 328 + }; 329 + 330 + static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) 331 + { 332 + struct unix_vertex *vertex; 333 + 334 + list_for_each_entry_reverse(vertex, scc, scc_entry) { 335 + struct sk_buff_head *queue; 336 + struct unix_edge *edge; 337 + struct unix_sock *u; 338 + 339 + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); 340 + u = edge->predecessor; 341 + queue = &u->sk.sk_receive_queue; 342 + 343 + spin_lock(&queue->lock); 344 + 345 + if (u->sk.sk_state == TCP_LISTEN) { 346 + struct sk_buff *skb; 347 + 348 + skb_queue_walk(queue, skb) { 349 + struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; 350 + 351 + /* listener -> embryo order, the inversion never happens. */ 352 + spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); 353 + skb_queue_splice_init(embryo_queue, hitlist); 354 + spin_unlock(&embryo_queue->lock); 355 + } 356 + } else { 357 + skb_queue_splice_init(queue, hitlist); 358 + 359 + #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 360 + if (u->oob_skb) { 361 + kfree_skb(u->oob_skb); 362 + u->oob_skb = NULL; 363 + } 364 + #endif 226 365 } 366 + 367 + spin_unlock(&queue->lock); 227 368 } 228 369 } 229 370 230 - static void dec_inflight(struct unix_sock *usk) 371 + static bool unix_scc_cyclic(struct list_head *scc) 231 372 { 232 - usk->inflight--; 373 + struct unix_vertex *vertex; 374 + struct unix_edge *edge; 375 + 376 + /* SCC containing multiple vertices ? */ 377 + if (!list_is_singular(scc)) 378 + return true; 379 + 380 + vertex = list_first_entry(scc, typeof(*vertex), scc_entry); 381 + 382 + /* Self-reference or a embryo-listener circle ? 
*/ 383 + list_for_each_entry(edge, &vertex->edges, vertex_entry) { 384 + if (unix_edge_successor(edge) == vertex) 385 + return true; 386 + } 387 + 388 + return false; 233 389 } 234 390 235 - static void inc_inflight(struct unix_sock *usk) 236 - { 237 - usk->inflight++; 238 - } 391 + static LIST_HEAD(unix_visited_vertices); 392 + static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; 239 393 240 - static void inc_inflight_move_tail(struct unix_sock *u) 394 + static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, 395 + struct sk_buff_head *hitlist) 241 396 { 242 - u->inflight++; 397 + LIST_HEAD(vertex_stack); 398 + struct unix_edge *edge; 399 + LIST_HEAD(edge_stack); 243 400 244 - /* If this still might be part of a cycle, move it to the end 245 - * of the list, so that it's checked even if it was already 246 - * passed over 401 + next_vertex: 402 + /* Push vertex to vertex_stack and mark it as on-stack 403 + * (index >= UNIX_VERTEX_INDEX_START). 404 + * The vertex will be popped when finalising SCC later. 247 405 */ 248 - if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) 249 - list_move_tail(&u->link, &gc_candidates); 406 + list_add(&vertex->scc_entry, &vertex_stack); 407 + 408 + vertex->index = *last_index; 409 + vertex->scc_index = *last_index; 410 + (*last_index)++; 411 + 412 + /* Explore neighbour vertices (receivers of the current vertex's fd). */ 413 + list_for_each_entry(edge, &vertex->edges, vertex_entry) { 414 + struct unix_vertex *next_vertex = unix_edge_successor(edge); 415 + 416 + if (!next_vertex) 417 + continue; 418 + 419 + if (next_vertex->index == unix_vertex_unvisited_index) { 420 + /* Iterative deepening depth first search 421 + * 422 + * 1. Push a forward edge to edge_stack and set 423 + * the successor to vertex for the next iteration. 424 + */ 425 + list_add(&edge->stack_entry, &edge_stack); 426 + 427 + vertex = next_vertex; 428 + goto next_vertex; 429 + 430 + /* 2. 
Pop the edge directed to the current vertex 431 + * and restore the ancestor for backtracking. 432 + */ 433 + prev_vertex: 434 + edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); 435 + list_del_init(&edge->stack_entry); 436 + 437 + next_vertex = vertex; 438 + vertex = edge->predecessor->vertex; 439 + 440 + /* If the successor has a smaller scc_index, two vertices 441 + * are in the same SCC, so propagate the smaller scc_index 442 + * to skip SCC finalisation. 443 + */ 444 + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); 445 + } else if (next_vertex->index != unix_vertex_grouped_index) { 446 + /* Loop detected by a back/cross edge. 447 + * 448 + * The successor is on vertex_stack, so two vertices are in 449 + * the same SCC. If the successor has a smaller *scc_index*, 450 + * propagate it to skip SCC finalisation. 451 + */ 452 + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); 453 + } else { 454 + /* The successor was already grouped as another SCC */ 455 + } 456 + } 457 + 458 + if (vertex->index == vertex->scc_index) { 459 + struct list_head scc; 460 + bool scc_dead = true; 461 + 462 + /* SCC finalised. 463 + * 464 + * If the scc_index was not updated, all the vertices above on 465 + * vertex_stack are in the same SCC. Group them using scc_entry. 466 + */ 467 + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); 468 + 469 + list_for_each_entry_reverse(vertex, &scc, scc_entry) { 470 + /* Don't restart DFS from this vertex in unix_walk_scc(). */ 471 + list_move_tail(&vertex->entry, &unix_visited_vertices); 472 + 473 + /* Mark vertex as off-stack. */ 474 + vertex->index = unix_vertex_grouped_index; 475 + 476 + if (scc_dead) 477 + scc_dead = unix_vertex_dead(vertex); 478 + } 479 + 480 + if (scc_dead) 481 + unix_collect_skb(&scc, hitlist); 482 + else if (!unix_graph_maybe_cyclic) 483 + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); 484 + 485 + list_del(&scc); 486 + } 487 + 488 + /* Need backtracking ? 
*/ 489 + if (!list_empty(&edge_stack)) 490 + goto prev_vertex; 491 + } 492 + 493 + static void unix_walk_scc(struct sk_buff_head *hitlist) 494 + { 495 + unsigned long last_index = UNIX_VERTEX_INDEX_START; 496 + 497 + unix_graph_maybe_cyclic = false; 498 + 499 + /* Visit every vertex exactly once. 500 + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. 501 + */ 502 + while (!list_empty(&unix_unvisited_vertices)) { 503 + struct unix_vertex *vertex; 504 + 505 + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); 506 + __unix_walk_scc(vertex, &last_index, hitlist); 507 + } 508 + 509 + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); 510 + swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); 511 + 512 + unix_graph_grouped = true; 513 + } 514 + 515 + static void unix_walk_scc_fast(struct sk_buff_head *hitlist) 516 + { 517 + while (!list_empty(&unix_unvisited_vertices)) { 518 + struct unix_vertex *vertex; 519 + struct list_head scc; 520 + bool scc_dead = true; 521 + 522 + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); 523 + list_add(&scc, &vertex->scc_entry); 524 + 525 + list_for_each_entry_reverse(vertex, &scc, scc_entry) { 526 + list_move_tail(&vertex->entry, &unix_visited_vertices); 527 + 528 + if (scc_dead) 529 + scc_dead = unix_vertex_dead(vertex); 530 + } 531 + 532 + if (scc_dead) 533 + unix_collect_skb(&scc, hitlist); 534 + 535 + list_del(&scc); 536 + } 537 + 538 + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); 250 539 } 251 540 252 541 static bool gc_in_progress; ··· 543 254 static void __unix_gc(struct work_struct *work) 544 255 { 545 256 struct sk_buff_head hitlist; 546 - struct unix_sock *u, *next; 547 - LIST_HEAD(not_cycle_list); 548 - struct list_head cursor; 549 257 550 258 spin_lock(&unix_gc_lock); 551 259 552 - /* First, select candidates for garbage collection. 
Only 553 - * in-flight sockets are considered, and from those only ones 554 - * which don't have any external reference. 555 - * 556 - * Holding unix_gc_lock will protect these candidates from 557 - * being detached, and hence from gaining an external 558 - * reference. Since there are no possible receivers, all 559 - * buffers currently on the candidates' queues stay there 560 - * during the garbage collection. 561 - * 562 - * We also know that no new candidate can be added onto the 563 - * receive queues. Other, non candidate sockets _can_ be 564 - * added to queue, so we must make sure only to touch 565 - * candidates. 566 - */ 567 - list_for_each_entry_safe(u, next, &gc_inflight_list, link) { 568 - long total_refs; 569 - 570 - total_refs = file_count(u->sk.sk_socket->file); 571 - 572 - WARN_ON_ONCE(!u->inflight); 573 - WARN_ON_ONCE(total_refs < u->inflight); 574 - if (total_refs == u->inflight) { 575 - list_move_tail(&u->link, &gc_candidates); 576 - __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); 577 - __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); 578 - } 260 + if (!unix_graph_maybe_cyclic) { 261 + spin_unlock(&unix_gc_lock); 262 + goto skip_gc; 579 263 } 580 264 581 - /* Now remove all internal in-flight reference to children of 582 - * the candidates. 583 - */ 584 - list_for_each_entry(u, &gc_candidates, link) 585 - scan_children(&u->sk, dec_inflight, NULL); 265 + __skb_queue_head_init(&hitlist); 586 266 587 - /* Restore the references for children of all candidates, 588 - * which have remaining references. Do this recursively, so 589 - * only those remain, which form cyclic references. 590 - * 591 - * Use a "cursor" link, to make the list traversal safe, even 592 - * though elements might be moved about. 593 - */ 594 - list_add(&cursor, &gc_candidates); 595 - while (cursor.next != &gc_candidates) { 596 - u = list_entry(cursor.next, struct unix_sock, link); 597 - 598 - /* Move cursor to after the current position. 
*/ 599 - list_move(&cursor, &u->link); 600 - 601 - if (u->inflight) { 602 - list_move_tail(&u->link, &not_cycle_list); 603 - __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); 604 - scan_children(&u->sk, inc_inflight_move_tail, NULL); 605 - } 606 - } 607 - list_del(&cursor); 608 - 609 - /* Now gc_candidates contains only garbage. Restore original 610 - * inflight counters for these as well, and remove the skbuffs 611 - * which are creating the cycle(s). 612 - */ 613 - skb_queue_head_init(&hitlist); 614 - list_for_each_entry(u, &gc_candidates, link) { 615 - scan_children(&u->sk, inc_inflight, &hitlist); 616 - 617 - #if IS_ENABLED(CONFIG_AF_UNIX_OOB) 618 - if (u->oob_skb) { 619 - kfree_skb(u->oob_skb); 620 - u->oob_skb = NULL; 621 - } 622 - #endif 623 - } 624 - 625 - /* not_cycle_list contains those sockets which do not make up a 626 - * cycle. Restore these to the inflight list. 627 - */ 628 - while (!list_empty(&not_cycle_list)) { 629 - u = list_entry(not_cycle_list.next, struct unix_sock, link); 630 - __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); 631 - list_move_tail(&u->link, &gc_inflight_list); 632 - } 267 + if (unix_graph_grouped) 268 + unix_walk_scc_fast(&hitlist); 269 + else 270 + unix_walk_scc(&hitlist); 633 271 634 272 spin_unlock(&unix_gc_lock); 635 273 636 - /* Here we are. Hitlist is filled. Die. */ 637 274 __skb_queue_purge(&hitlist); 638 - 639 - spin_lock(&unix_gc_lock); 640 - 641 - /* All candidates should have been detached by now. */ 642 - WARN_ON_ONCE(!list_empty(&gc_candidates)); 643 - 644 - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ 275 + skip_gc: 645 276 WRITE_ONCE(gc_in_progress, false); 646 - 647 - spin_unlock(&unix_gc_lock); 648 277 } 649 278 650 279 static DECLARE_WORK(unix_gc_work, __unix_gc);
+1
tools/testing/selftests/net/.gitignore
··· 31 31 rxtimestamp 32 32 sctp_hello 33 33 scm_pidfd 34 + scm_rights 34 35 sk_bind_sendto_listen 35 36 sk_connect_zero_addr 36 37 socket
+1 -1
tools/testing/selftests/net/af_unix/Makefile
··· 1 1 CFLAGS += $(KHDR_INCLUDES) 2 - TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd 2 + TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd scm_rights 3 3 4 4 include ../../lib.mk
+286
tools/testing/selftests/net/af_unix/scm_rights.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright Amazon.com Inc. or its affiliates. */ 3 + #define _GNU_SOURCE 4 + #include <sched.h> 5 + 6 + #include <stdio.h> 7 + #include <string.h> 8 + #include <unistd.h> 9 + #include <sys/types.h> 10 + #include <sys/socket.h> 11 + #include <sys/un.h> 12 + 13 + #include "../../kselftest_harness.h" 14 + 15 + FIXTURE(scm_rights) 16 + { 17 + int fd[16]; 18 + }; 19 + 20 + FIXTURE_VARIANT(scm_rights) 21 + { 22 + char name[16]; 23 + int type; 24 + int flags; 25 + bool test_listener; 26 + }; 27 + 28 + FIXTURE_VARIANT_ADD(scm_rights, dgram) 29 + { 30 + .name = "UNIX ", 31 + .type = SOCK_DGRAM, 32 + .flags = 0, 33 + .test_listener = false, 34 + }; 35 + 36 + FIXTURE_VARIANT_ADD(scm_rights, stream) 37 + { 38 + .name = "UNIX-STREAM ", 39 + .type = SOCK_STREAM, 40 + .flags = 0, 41 + .test_listener = false, 42 + }; 43 + 44 + FIXTURE_VARIANT_ADD(scm_rights, stream_oob) 45 + { 46 + .name = "UNIX-STREAM ", 47 + .type = SOCK_STREAM, 48 + .flags = MSG_OOB, 49 + .test_listener = false, 50 + }; 51 + 52 + FIXTURE_VARIANT_ADD(scm_rights, stream_listener) 53 + { 54 + .name = "UNIX-STREAM ", 55 + .type = SOCK_STREAM, 56 + .flags = 0, 57 + .test_listener = true, 58 + }; 59 + 60 + FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob) 61 + { 62 + .name = "UNIX-STREAM ", 63 + .type = SOCK_STREAM, 64 + .flags = MSG_OOB, 65 + .test_listener = true, 66 + }; 67 + 68 + static int count_sockets(struct __test_metadata *_metadata, 69 + const FIXTURE_VARIANT(scm_rights) *variant) 70 + { 71 + int sockets = -1, len, ret; 72 + char *line = NULL; 73 + size_t unused; 74 + FILE *f; 75 + 76 + f = fopen("/proc/net/protocols", "r"); 77 + ASSERT_NE(NULL, f); 78 + 79 + len = strlen(variant->name); 80 + 81 + while (getline(&line, &unused, f) != -1) { 82 + int unused2; 83 + 84 + if (strncmp(line, variant->name, len)) 85 + continue; 86 + 87 + ret = sscanf(line + len, "%d %d", &unused2, &sockets); 88 + ASSERT_EQ(2, ret); 89 + 90 + break; 91 + } 92 + 93 + free(line); 94 
+ 95 + ret = fclose(f); 96 + ASSERT_EQ(0, ret); 97 + 98 + return sockets; 99 + } 100 + 101 + FIXTURE_SETUP(scm_rights) 102 + { 103 + int ret; 104 + 105 + ret = unshare(CLONE_NEWNET); 106 + ASSERT_EQ(0, ret); 107 + 108 + ret = count_sockets(_metadata, variant); 109 + ASSERT_EQ(0, ret); 110 + } 111 + 112 + FIXTURE_TEARDOWN(scm_rights) 113 + { 114 + int ret; 115 + 116 + sleep(1); 117 + 118 + ret = count_sockets(_metadata, variant); 119 + ASSERT_EQ(0, ret); 120 + } 121 + 122 + static void create_listeners(struct __test_metadata *_metadata, 123 + FIXTURE_DATA(scm_rights) *self, 124 + int n) 125 + { 126 + struct sockaddr_un addr = { 127 + .sun_family = AF_UNIX, 128 + }; 129 + socklen_t addrlen; 130 + int i, ret; 131 + 132 + for (i = 0; i < n * 2; i += 2) { 133 + self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0); 134 + ASSERT_LE(0, self->fd[i]); 135 + 136 + addrlen = sizeof(addr.sun_family); 137 + ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen); 138 + ASSERT_EQ(0, ret); 139 + 140 + ret = listen(self->fd[i], -1); 141 + ASSERT_EQ(0, ret); 142 + 143 + addrlen = sizeof(addr); 144 + ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen); 145 + ASSERT_EQ(0, ret); 146 + 147 + self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0); 148 + ASSERT_LE(0, self->fd[i + 1]); 149 + 150 + ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen); 151 + ASSERT_EQ(0, ret); 152 + } 153 + } 154 + 155 + static void create_socketpairs(struct __test_metadata *_metadata, 156 + FIXTURE_DATA(scm_rights) *self, 157 + const FIXTURE_VARIANT(scm_rights) *variant, 158 + int n) 159 + { 160 + int i, ret; 161 + 162 + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); 163 + 164 + for (i = 0; i < n * 2; i += 2) { 165 + ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i); 166 + ASSERT_EQ(0, ret); 167 + } 168 + } 169 + 170 + static void __create_sockets(struct __test_metadata *_metadata, 171 + FIXTURE_DATA(scm_rights) *self, 172 + const FIXTURE_VARIANT(scm_rights) *variant, 173 + int n) 
174 + { 175 + if (variant->test_listener) 176 + create_listeners(_metadata, self, n); 177 + else 178 + create_socketpairs(_metadata, self, variant, n); 179 + } 180 + 181 + static void __close_sockets(struct __test_metadata *_metadata, 182 + FIXTURE_DATA(scm_rights) *self, 183 + int n) 184 + { 185 + int i, ret; 186 + 187 + ASSERT_GE(sizeof(self->fd) / sizeof(int), n); 188 + 189 + for (i = 0; i < n * 2; i++) { 190 + ret = close(self->fd[i]); 191 + ASSERT_EQ(0, ret); 192 + } 193 + } 194 + 195 + void __send_fd(struct __test_metadata *_metadata, 196 + const FIXTURE_DATA(scm_rights) *self, 197 + const FIXTURE_VARIANT(scm_rights) *variant, 198 + int inflight, int receiver) 199 + { 200 + #define MSG "nop" 201 + #define MSGLEN 3 202 + struct { 203 + struct cmsghdr cmsghdr; 204 + int fd[2]; 205 + } cmsg = { 206 + .cmsghdr = { 207 + .cmsg_len = CMSG_LEN(sizeof(cmsg.fd)), 208 + .cmsg_level = SOL_SOCKET, 209 + .cmsg_type = SCM_RIGHTS, 210 + }, 211 + .fd = { 212 + self->fd[inflight * 2], 213 + self->fd[inflight * 2], 214 + }, 215 + }; 216 + struct iovec iov = { 217 + .iov_base = MSG, 218 + .iov_len = MSGLEN, 219 + }; 220 + struct msghdr msg = { 221 + .msg_name = NULL, 222 + .msg_namelen = 0, 223 + .msg_iov = &iov, 224 + .msg_iovlen = 1, 225 + .msg_control = &cmsg, 226 + .msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)), 227 + }; 228 + int ret; 229 + 230 + ret = sendmsg(self->fd[receiver * 2 + 1], &msg, variant->flags); 231 + ASSERT_EQ(MSGLEN, ret); 232 + } 233 + 234 + #define create_sockets(n) \ 235 + __create_sockets(_metadata, self, variant, n) 236 + #define close_sockets(n) \ 237 + __close_sockets(_metadata, self, n) 238 + #define send_fd(inflight, receiver) \ 239 + __send_fd(_metadata, self, variant, inflight, receiver) 240 + 241 + TEST_F(scm_rights, self_ref) 242 + { 243 + create_sockets(2); 244 + 245 + send_fd(0, 0); 246 + 247 + send_fd(1, 1); 248 + 249 + close_sockets(2); 250 + } 251 + 252 + TEST_F(scm_rights, triangle) 253 + { 254 + create_sockets(6); 255 + 256 + send_fd(0, 
1); 257 + send_fd(1, 2); 258 + send_fd(2, 0); 259 + 260 + send_fd(3, 4); 261 + send_fd(4, 5); 262 + send_fd(5, 3); 263 + 264 + close_sockets(6); 265 + } 266 + 267 + TEST_F(scm_rights, cross_edge) 268 + { 269 + create_sockets(8); 270 + 271 + send_fd(0, 1); 272 + send_fd(1, 2); 273 + send_fd(2, 0); 274 + send_fd(1, 3); 275 + send_fd(3, 2); 276 + 277 + send_fd(4, 5); 278 + send_fd(5, 6); 279 + send_fd(6, 4); 280 + send_fd(5, 7); 281 + send_fd(7, 6); 282 + 283 + close_sockets(8); 284 + } 285 + 286 + TEST_HARNESS_MAIN