Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

net: fix unsafe set_memory_rw from softirq

On an x86 system with net.core.bpf_jit_enable = 1

sudo tcpdump -i eth1 'tcp port 22'

causes the warning:
[ 56.766097] Possible unsafe locking scenario:
[ 56.766097]
[ 56.780146] CPU0
[ 56.786807] ----
[ 56.793188] lock(&(&vb->lock)->rlock);
[ 56.799593] <Interrupt>
[ 56.805889] lock(&(&vb->lock)->rlock);
[ 56.812266]
[ 56.812266] *** DEADLOCK ***
[ 56.812266]
[ 56.830670] 1 lock held by ksoftirqd/1/13:
[ 56.836838] #0: (rcu_read_lock){.+.+..}, at: [<ffffffff8118f44c>] vm_unmap_aliases+0x8c/0x380
[ 56.849757]
[ 56.849757] stack backtrace:
[ 56.862194] CPU: 1 PID: 13 Comm: ksoftirqd/1 Not tainted 3.12.0-rc3+ #45
[ 56.868721] Hardware name: System manufacturer System Product Name/P8Z77 WS, BIOS 3007 07/26/2012
[ 56.882004] ffffffff821944c0 ffff88080bbdb8c8 ffffffff8175a145 0000000000000007
[ 56.895630] ffff88080bbd5f40 ffff88080bbdb928 ffffffff81755b14 0000000000000001
[ 56.909313] ffff880800000001 ffff880800000000 ffffffff8101178f 0000000000000001
[ 56.923006] Call Trace:
[ 56.929532] [<ffffffff8175a145>] dump_stack+0x55/0x76
[ 56.936067] [<ffffffff81755b14>] print_usage_bug+0x1f7/0x208
[ 56.942445] [<ffffffff8101178f>] ? save_stack_trace+0x2f/0x50
[ 56.948932] [<ffffffff810cc0a0>] ? check_usage_backwards+0x150/0x150
[ 56.955470] [<ffffffff810ccb52>] mark_lock+0x282/0x2c0
[ 56.961945] [<ffffffff810ccfed>] __lock_acquire+0x45d/0x1d50
[ 56.968474] [<ffffffff810cce6e>] ? __lock_acquire+0x2de/0x1d50
[ 56.975140] [<ffffffff81393bf5>] ? cpumask_next_and+0x55/0x90
[ 56.981942] [<ffffffff810cef72>] lock_acquire+0x92/0x1d0
[ 56.988745] [<ffffffff8118f52a>] ? vm_unmap_aliases+0x16a/0x380
[ 56.995619] [<ffffffff817628f1>] _raw_spin_lock+0x41/0x50
[ 57.002493] [<ffffffff8118f52a>] ? vm_unmap_aliases+0x16a/0x380
[ 57.009447] [<ffffffff8118f52a>] vm_unmap_aliases+0x16a/0x380
[ 57.016477] [<ffffffff8118f44c>] ? vm_unmap_aliases+0x8c/0x380
[ 57.023607] [<ffffffff810436b0>] change_page_attr_set_clr+0xc0/0x460
[ 57.030818] [<ffffffff810cfb8d>] ? trace_hardirqs_on+0xd/0x10
[ 57.037896] [<ffffffff811a8330>] ? kmem_cache_free+0xb0/0x2b0
[ 57.044789] [<ffffffff811b59c3>] ? free_object_rcu+0x93/0xa0
[ 57.051720] [<ffffffff81043d9f>] set_memory_rw+0x2f/0x40
[ 57.058727] [<ffffffff8104e17c>] bpf_jit_free+0x2c/0x40
[ 57.065577] [<ffffffff81642cba>] sk_filter_release_rcu+0x1a/0x30
[ 57.072338] [<ffffffff811108e2>] rcu_process_callbacks+0x202/0x7c0
[ 57.078962] [<ffffffff81057f17>] __do_softirq+0xf7/0x3f0
[ 57.085373] [<ffffffff81058245>] run_ksoftirqd+0x35/0x70

cannot reuse the JITed filter memory, since it's read-only,
so use the original bpf insns memory to hold the work_struct

defer kfree of sk_filter until jit completed freeing

tested on x86_64 and i386

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Alexei Starovoitov and committed by David S. Miller
d45ed4a4 582442d6

+36 -18
+1
arch/arm/net/bpf_jit_32.c
··· 930 930 { 931 931 if (fp->bpf_func != sk_run_filter) 932 932 module_free(NULL, fp->bpf_func); 933 + kfree(fp); 933 934 }
+1
arch/powerpc/net/bpf_jit_comp.c
··· 691 691 { 692 692 if (fp->bpf_func != sk_run_filter) 693 693 module_free(NULL, fp->bpf_func); 694 + kfree(fp); 694 695 }
+3 -1
arch/s390/net/bpf_jit_comp.c
··· 881 881 struct bpf_binary_header *header = (void *)addr; 882 882 883 883 if (fp->bpf_func == sk_run_filter) 884 - return; 884 + goto free_filter; 885 885 set_memory_rw(addr, header->pages); 886 886 module_free(NULL, header); 887 + free_filter: 888 + kfree(fp); 887 889 }
+1
arch/sparc/net/bpf_jit_comp.c
··· 808 808 { 809 809 if (fp->bpf_func != sk_run_filter) 810 810 module_free(NULL, fp->bpf_func); 811 + kfree(fp); 811 812 }
+13 -5
arch/x86/net/bpf_jit_comp.c
··· 772 772 return; 773 773 } 774 774 775 + static void bpf_jit_free_deferred(struct work_struct *work) 776 + { 777 + struct sk_filter *fp = container_of(work, struct sk_filter, work); 778 + unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; 779 + struct bpf_binary_header *header = (void *)addr; 780 + 781 + set_memory_rw(addr, header->pages); 782 + module_free(NULL, header); 783 + kfree(fp); 784 + } 785 + 775 786 void bpf_jit_free(struct sk_filter *fp) 776 787 { 777 788 if (fp->bpf_func != sk_run_filter) { 778 - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; 779 - struct bpf_binary_header *header = (void *)addr; 780 - 781 - set_memory_rw(addr, header->pages); 782 - module_free(NULL, header); 789 + INIT_WORK(&fp->work, bpf_jit_free_deferred); 790 + schedule_work(&fp->work); 783 791 } 784 792 }
+11 -4
include/linux/filter.h
··· 6 6 7 7 #include <linux/atomic.h> 8 8 #include <linux/compat.h> 9 + #include <linux/workqueue.h> 9 10 #include <uapi/linux/filter.h> 10 11 11 12 #ifdef CONFIG_COMPAT ··· 26 25 { 27 26 atomic_t refcnt; 28 27 unsigned int len; /* Number of filter blocks */ 28 + struct rcu_head rcu; 29 29 unsigned int (*bpf_func)(const struct sk_buff *skb, 30 30 const struct sock_filter *filter); 31 - struct rcu_head rcu; 32 - struct sock_filter insns[0]; 31 + union { 32 + struct sock_filter insns[0]; 33 + struct work_struct work; 34 + }; 33 35 }; 34 36 35 - static inline unsigned int sk_filter_len(const struct sk_filter *fp) 37 + static inline unsigned int sk_filter_size(unsigned int proglen) 36 38 { 37 - return fp->len * sizeof(struct sock_filter) + sizeof(*fp); 39 + return max(sizeof(struct sk_filter), 40 + offsetof(struct sk_filter, insns[proglen])); 38 41 } 39 42 40 43 extern int sk_filter(struct sock *sk, struct sk_buff *skb); ··· 72 67 } 73 68 #define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns) 74 69 #else 70 + #include <linux/slab.h> 75 71 static inline void bpf_jit_compile(struct sk_filter *fp) 76 72 { 77 73 } 78 74 static inline void bpf_jit_free(struct sk_filter *fp) 79 75 { 76 + kfree(fp); 80 77 } 81 78 #define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns) 82 79 #endif
+2 -4
include/net/sock.h
··· 1630 1630 1631 1631 static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) 1632 1632 { 1633 - unsigned int size = sk_filter_len(fp); 1634 - 1635 - atomic_sub(size, &sk->sk_omem_alloc); 1633 + atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc); 1636 1634 sk_filter_release(fp); 1637 1635 } 1638 1636 1639 1637 static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) 1640 1638 { 1641 1639 atomic_inc(&fp->refcnt); 1642 - atomic_add(sk_filter_len(fp), &sk->sk_omem_alloc); 1640 + atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); 1643 1641 } 1644 1642 1645 1643 /*
+4 -4
net/core/filter.c
··· 644 644 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); 645 645 646 646 bpf_jit_free(fp); 647 - kfree(fp); 648 647 } 649 648 EXPORT_SYMBOL(sk_filter_release_rcu); 650 649 ··· 682 683 if (fprog->filter == NULL) 683 684 return -EINVAL; 684 685 685 - fp = kmalloc(fsize + sizeof(*fp), GFP_KERNEL); 686 + fp = kmalloc(sk_filter_size(fprog->len), GFP_KERNEL); 686 687 if (!fp) 687 688 return -ENOMEM; 688 689 memcpy(fp->insns, fprog->filter, fsize); ··· 722 723 { 723 724 struct sk_filter *fp, *old_fp; 724 725 unsigned int fsize = sizeof(struct sock_filter) * fprog->len; 726 + unsigned int sk_fsize = sk_filter_size(fprog->len); 725 727 int err; 726 728 727 729 if (sock_flag(sk, SOCK_FILTER_LOCKED)) ··· 732 732 if (fprog->filter == NULL) 733 733 return -EINVAL; 734 734 735 - fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); 735 + fp = sock_kmalloc(sk, sk_fsize, GFP_KERNEL); 736 736 if (!fp) 737 737 return -ENOMEM; 738 738 if (copy_from_user(fp->insns, fprog->filter, fsize)) { 739 - sock_kfree_s(sk, fp, fsize+sizeof(*fp)); 739 + sock_kfree_s(sk, fp, sk_fsize); 740 740 return -EFAULT; 741 741 } 742 742