Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

packet: Enhance AF_PACKET implementation to not require high order contiguous memory allocation (v4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

Version 4 of this patch.

Change notes:
1) Removed extra memset. Didn't think kcalloc added a GFP_ZERO the way kzalloc did :)

Summary:
It was shown to me recently that systems under high load were driven very deep
into swap when tcpdump was run. The reason this happened was because the
AF_PACKET protocol has a SET_RINGBUFFER socket option that allows the user space
application to specify how many entries an AF_PACKET socket will have and how
large each entry will be. It seems the default setting for tcpdump is to set
the ring buffer to 32 entries of 64 KB each, which implies 32 order-5
allocations. That's difficult under good circumstances, and horrid under memory
pressure.

I thought it would be good to make that a bit more usable. I was going to do a
simple conversion of the ring buffer from contiguous pages to iovecs, but
unfortunately, the metadata which AF_PACKET places in these buffers can easily
span a page boundary, and given that these buffers get mapped into user space,
and the data layout doesn't easily allow for a change to padding between frames
to avoid that, a simple iovec change is just going to break user space ABI
consistency.

So I've done this, I've added a three tiered mechanism to the af_packet set_ring
socket option. It attempts to allocate memory in the following order:

1) Using __get_free_pages with GFP_NORETRY set, so as to fail quickly without
digging into swap

2) Using vmalloc

3) Using __get_free_pages with GFP_NORETRY clear, causing us to try as hard as
needed to get the memory

The effect is that we don't disturb the system as much when we're under load,
while still being able to conduct tcpdumps effectively.

Tested successfully by me.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Reported-by: Maciej Żenczykowski <zenczykowski@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Neil Horman and committed by
David S. Miller
0e3125c7 020f01eb

+69 -16
+69 -16
net/packet/af_packet.c
··· 61 61 #include <linux/kernel.h> 62 62 #include <linux/kmod.h> 63 63 #include <linux/slab.h> 64 + #include <linux/vmalloc.h> 64 65 #include <net/net_namespace.h> 65 66 #include <net/ip.h> 66 67 #include <net/protocol.h> ··· 164 163 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, 165 164 int closing, int tx_ring); 166 165 166 + #define PGV_FROM_VMALLOC 1 167 + struct pgv { 168 + char *buffer; 169 + unsigned char flags; 170 + }; 171 + 167 172 struct packet_ring_buffer { 168 - char **pg_vec; 173 + struct pgv *pg_vec; 169 174 unsigned int head; 170 175 unsigned int frames_per_block; 171 176 unsigned int frame_size; ··· 290 283 pg_vec_pos = position / rb->frames_per_block; 291 284 frame_offset = position % rb->frames_per_block; 292 285 293 - h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size); 286 + h.raw = rb->pg_vec[pg_vec_pos].buffer + 287 + (frame_offset * rb->frame_size); 294 288 295 289 if (status != __packet_get_status(po, h.raw)) 296 290 return NULL; ··· 2333 2325 .close = packet_mm_close, 2334 2326 }; 2335 2327 2336 - static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len) 2328 + static void free_pg_vec(struct pgv *pg_vec, unsigned int order, 2329 + unsigned int len) 2337 2330 { 2338 2331 int i; 2339 2332 2340 2333 for (i = 0; i < len; i++) { 2341 - if (likely(pg_vec[i])) 2342 - free_pages((unsigned long) pg_vec[i], order); 2334 + if (likely(pg_vec[i].buffer)) { 2335 + if (pg_vec[i].flags & PGV_FROM_VMALLOC) 2336 + vfree(pg_vec[i].buffer); 2337 + else 2338 + free_pages((unsigned long)pg_vec[i].buffer, 2339 + order); 2340 + pg_vec[i].buffer = NULL; 2341 + } 2343 2342 } 2344 2343 kfree(pg_vec); 2345 2344 } 2346 2345 2347 - static inline char *alloc_one_pg_vec_page(unsigned long order) 2346 + static inline char *alloc_one_pg_vec_page(unsigned long order, 2347 + unsigned char *flags) 2348 2348 { 2349 - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN; 2349 + char *buffer = NULL; 2350 + 
gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | 2351 + __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; 2350 2352 2351 - return (char *) __get_free_pages(gfp_flags, order); 2353 + buffer = (char *) __get_free_pages(gfp_flags, order); 2354 + 2355 + if (buffer) 2356 + return buffer; 2357 + 2358 + /* 2359 + * __get_free_pages failed, fall back to vmalloc 2360 + */ 2361 + *flags |= PGV_FROM_VMALLOC; 2362 + buffer = vmalloc((1 << order) * PAGE_SIZE); 2363 + 2364 + if (buffer) 2365 + return buffer; 2366 + 2367 + /* 2368 + * vmalloc failed, lets dig into swap here 2369 + */ 2370 + *flags = 0; 2371 + gfp_flags &= ~__GFP_NORETRY; 2372 + buffer = (char *)__get_free_pages(gfp_flags, order); 2373 + if (buffer) 2374 + return buffer; 2375 + 2376 + /* 2377 + * complete and utter failure 2378 + */ 2379 + return NULL; 2352 2380 } 2353 2381 2354 - static char **alloc_pg_vec(struct tpacket_req *req, int order) 2382 + static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) 2355 2383 { 2356 2384 unsigned int block_nr = req->tp_block_nr; 2357 - char **pg_vec; 2385 + struct pgv *pg_vec; 2358 2386 int i; 2359 2387 2360 - pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL); 2388 + pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL); 2361 2389 if (unlikely(!pg_vec)) 2362 2390 goto out; 2363 2391 2364 2392 for (i = 0; i < block_nr; i++) { 2365 - pg_vec[i] = alloc_one_pg_vec_page(order); 2366 - if (unlikely(!pg_vec[i])) 2393 + pg_vec[i].buffer = alloc_one_pg_vec_page(order, 2394 + &pg_vec[i].flags); 2395 + if (unlikely(!pg_vec[i].buffer)) 2367 2396 goto out_free_pgvec; 2368 2397 } 2369 2398 ··· 2409 2364 2410 2365 out_free_pgvec: 2411 2366 free_pg_vec(pg_vec, order, block_nr); 2367 + kfree(pg_vec); 2412 2368 pg_vec = NULL; 2413 2369 goto out; 2414 2370 } ··· 2417 2371 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, 2418 2372 int closing, int tx_ring) 2419 2373 { 2420 - char **pg_vec = NULL; 2374 + struct pgv *pg_vec = NULL; 2421 2375 struct packet_sock *po = 
pkt_sk(sk); 2422 2376 int was_running, order = 0; 2423 2377 struct packet_ring_buffer *rb; ··· 2579 2533 continue; 2580 2534 2581 2535 for (i = 0; i < rb->pg_vec_len; i++) { 2582 - struct page *page = virt_to_page(rb->pg_vec[i]); 2536 + struct page *page; 2537 + void *kaddr = rb->pg_vec[i].buffer; 2583 2538 int pg_num; 2584 2539 2585 2540 for (pg_num = 0; pg_num < rb->pg_vec_pages; 2586 - pg_num++, page++) { 2541 + pg_num++) { 2542 + if (rb->pg_vec[i].flags & PGV_FROM_VMALLOC) 2543 + page = vmalloc_to_page(kaddr); 2544 + else 2545 + page = virt_to_page(kaddr); 2546 + 2587 2547 err = vm_insert_page(vma, start, page); 2588 2548 if (unlikely(err)) 2589 2549 goto out; 2590 2550 start += PAGE_SIZE; 2551 + kaddr += PAGE_SIZE; 2591 2552 } 2592 2553 } 2593 2554 }