Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

neigh: new unresolved queue limits

Le mercredi 09 novembre 2011 à 16:21 -0500, David Miller a écrit :
> From: David Miller <davem@davemloft.net>
> Date: Wed, 09 Nov 2011 16:16:44 -0500 (EST)
>
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Wed, 09 Nov 2011 12:14:09 +0100
> >
> >> unres_qlen is the number of frames we are able to queue per unresolved
> >> neighbour. Its default value (3) was never changed and is responsible
> >> for strange drops, especially if IP fragments are used, or multiple
> >> sessions start in parallel. Even a single tcp flow can hit this limit.
> > ...
> >
> > Ok, I've applied this, let's see what happens :-)
>
> Early answer, build fails.
>
> Please test build this patch with DECNET enabled and resubmit. The
> decnet neigh layer still refers to the removed ->queue_len member.
>
> Thanks.

Ouch, this was fixed on one machine yesterday, but not the other one I
used this morning, sorry.

[PATCH V5 net-next] neigh: new unresolved queue limits

unres_qlen is the number of frames we are able to queue per unresolved
neighbour. Its default value (3) was never changed and is responsible
for strange drops, especially if IP fragments are used, or multiple
sessions start in parallel. Even a single tcp flow can hit this limit.

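$ arp -d 192.168.20.108 ; ping -c 2 -s 8000 192.168.20.108
PING 192.168.20.108 (192.168.20.108) 8000(8028) bytes of data.
8008 bytes from 192.168.20.108: icmp_seq=2 ttl=64 time=0.322 ms
(Only icmp_seq=2 is answered: assuming a standard 1500-byte MTU, the first 8000-byte request is split into more IP fragments than the 3-frame queue can hold while the address is still unresolved, so it is lost.)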

Signed-off-by: David S. Miller <davem@davemloft.net>

authored by Eric Dumazet and committed by David S. Miller
8b5c171b 292d1398

+128 -56
+10
Documentation/networking/ip-sysctl.txt
··· 31 31 when using large numbers of interfaces and when communicating 32 32 with large numbers of directly-connected peers. 33 33 34 + neigh/default/unres_qlen_bytes - INTEGER 35 + The maximum number of bytes which may be used by packets 36 + queued for each unresolved address by other network layers. 37 + (added in linux 3.3) 38 + 39 + neigh/default/unres_qlen - INTEGER 40 + The maximum number of packets which may be queued for each 41 + unresolved address by other network layers. 42 + (deprecated in linux 3.3) : use unres_qlen_bytes instead. 43 + 34 44 mtu_expires - INTEGER 35 45 Time, in seconds, that cached PMTU information is kept. 36 46
+1
include/linux/neighbour.h
··· 116 116 NDTPA_PROXY_DELAY, /* u64, msecs */ 117 117 NDTPA_PROXY_QLEN, /* u32 */ 118 118 NDTPA_LOCKTIME, /* u64, msecs */ 119 + NDTPA_QUEUE_LENBYTES, /* u32 */ 119 120 __NDTPA_MAX 120 121 }; 121 122 #define NDTPA_MAX (__NDTPA_MAX - 1)
+2 -1
include/net/neighbour.h
··· 59 59 int reachable_time; 60 60 int delay_probe_time; 61 61 62 - int queue_len; 62 + int queue_len_bytes; 63 63 int ucast_probes; 64 64 int app_probes; 65 65 int mcast_probes; ··· 99 99 rwlock_t lock; 100 100 atomic_t refcnt; 101 101 struct sk_buff_head arp_queue; 102 + unsigned int arp_queue_len_bytes; 102 103 struct timer_list timer; 103 104 unsigned long used; 104 105 atomic_t probes;
+1 -1
net/atm/clip.c
··· 329 329 .gc_staletime = 60 * HZ, 330 330 .reachable_time = 30 * HZ, 331 331 .delay_probe_time = 5 * HZ, 332 - .queue_len = 3, 332 + .queue_len_bytes = 64 * 1024, 333 333 .ucast_probes = 3, 334 334 .mcast_probes = 3, 335 335 .anycast_delay = 1 * HZ,
+111 -51
net/core/neighbour.c
··· 238 238 it to safe state. 239 239 */ 240 240 skb_queue_purge(&n->arp_queue); 241 + n->arp_queue_len_bytes = 0; 241 242 n->output = neigh_blackhole; 242 243 if (n->nud_state & NUD_VALID) 243 244 n->nud_state = NUD_NOARP; ··· 703 702 printk(KERN_WARNING "Impossible event.\n"); 704 703 705 704 skb_queue_purge(&neigh->arp_queue); 705 + neigh->arp_queue_len_bytes = 0; 706 706 707 707 dev_put(neigh->dev); 708 708 neigh_parms_put(neigh->parms); ··· 844 842 write_lock(&neigh->lock); 845 843 } 846 844 skb_queue_purge(&neigh->arp_queue); 845 + neigh->arp_queue_len_bytes = 0; 847 846 } 848 847 849 848 static void neigh_probe(struct neighbour *neigh) ··· 983 980 984 981 if (neigh->nud_state == NUD_INCOMPLETE) { 985 982 if (skb) { 986 - if (skb_queue_len(&neigh->arp_queue) >= 987 - neigh->parms->queue_len) { 983 + while (neigh->arp_queue_len_bytes + skb->truesize > 984 + neigh->parms->queue_len_bytes) { 988 985 struct sk_buff *buff; 986 + 989 987 buff = __skb_dequeue(&neigh->arp_queue); 988 + if (!buff) 989 + break; 990 + neigh->arp_queue_len_bytes -= buff->truesize; 990 991 kfree_skb(buff); 991 992 NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards); 992 993 } 993 994 skb_dst_force(skb); 994 995 __skb_queue_tail(&neigh->arp_queue, skb); 996 + neigh->arp_queue_len_bytes += skb->truesize; 995 997 } 996 998 rc = 1; 997 999 } ··· 1183 1175 write_lock_bh(&neigh->lock); 1184 1176 } 1185 1177 skb_queue_purge(&neigh->arp_queue); 1178 + neigh->arp_queue_len_bytes = 0; 1186 1179 } 1187 1180 out: 1188 1181 if (update_isrouter) { ··· 1756 1747 NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex); 1757 1748 1758 1749 NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)); 1759 - NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len); 1750 + NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes); 1751 + /* approximative value for deprecated QUEUE_LEN (in packets) */ 1752 + NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, 1753 + DIV_ROUND_UP(parms->queue_len_bytes, 1754 + 
SKB_TRUESIZE(ETH_FRAME_LEN))); 1760 1755 NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen); 1761 1756 NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes); 1762 1757 NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes); ··· 1987 1974 1988 1975 switch (i) { 1989 1976 case NDTPA_QUEUE_LEN: 1990 - p->queue_len = nla_get_u32(tbp[i]); 1977 + p->queue_len_bytes = nla_get_u32(tbp[i]) * 1978 + SKB_TRUESIZE(ETH_FRAME_LEN); 1979 + break; 1980 + case NDTPA_QUEUE_LENBYTES: 1981 + p->queue_len_bytes = nla_get_u32(tbp[i]); 1991 1982 break; 1992 1983 case NDTPA_PROXY_QLEN: 1993 1984 p->proxy_qlen = nla_get_u32(tbp[i]); ··· 2652 2635 2653 2636 #ifdef CONFIG_SYSCTL 2654 2637 2655 - #define NEIGH_VARS_MAX 19 2638 + static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer, 2639 + size_t *lenp, loff_t *ppos) 2640 + { 2641 + int size, ret; 2642 + ctl_table tmp = *ctl; 2643 + 2644 + tmp.data = &size; 2645 + size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN)); 2646 + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); 2647 + if (write && !ret) 2648 + *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN); 2649 + return ret; 2650 + } 2651 + 2652 + enum { 2653 + NEIGH_VAR_MCAST_PROBE, 2654 + NEIGH_VAR_UCAST_PROBE, 2655 + NEIGH_VAR_APP_PROBE, 2656 + NEIGH_VAR_RETRANS_TIME, 2657 + NEIGH_VAR_BASE_REACHABLE_TIME, 2658 + NEIGH_VAR_DELAY_PROBE_TIME, 2659 + NEIGH_VAR_GC_STALETIME, 2660 + NEIGH_VAR_QUEUE_LEN, 2661 + NEIGH_VAR_QUEUE_LEN_BYTES, 2662 + NEIGH_VAR_PROXY_QLEN, 2663 + NEIGH_VAR_ANYCAST_DELAY, 2664 + NEIGH_VAR_PROXY_DELAY, 2665 + NEIGH_VAR_LOCKTIME, 2666 + NEIGH_VAR_RETRANS_TIME_MS, 2667 + NEIGH_VAR_BASE_REACHABLE_TIME_MS, 2668 + NEIGH_VAR_GC_INTERVAL, 2669 + NEIGH_VAR_GC_THRESH1, 2670 + NEIGH_VAR_GC_THRESH2, 2671 + NEIGH_VAR_GC_THRESH3, 2672 + NEIGH_VAR_MAX 2673 + }; 2656 2674 2657 2675 static struct neigh_sysctl_table { 2658 2676 struct ctl_table_header *sysctl_header; 2659 - struct ctl_table neigh_vars[NEIGH_VARS_MAX]; 2677 + struct 
ctl_table neigh_vars[NEIGH_VAR_MAX + 1]; 2660 2678 char *dev_name; 2661 2679 } neigh_sysctl_template __read_mostly = { 2662 2680 .neigh_vars = { 2663 - { 2681 + [NEIGH_VAR_MCAST_PROBE] = { 2664 2682 .procname = "mcast_solicit", 2665 2683 .maxlen = sizeof(int), 2666 2684 .mode = 0644, 2667 2685 .proc_handler = proc_dointvec, 2668 2686 }, 2669 - { 2687 + [NEIGH_VAR_UCAST_PROBE] = { 2670 2688 .procname = "ucast_solicit", 2671 2689 .maxlen = sizeof(int), 2672 2690 .mode = 0644, 2673 2691 .proc_handler = proc_dointvec, 2674 2692 }, 2675 - { 2693 + [NEIGH_VAR_APP_PROBE] = { 2676 2694 .procname = "app_solicit", 2677 2695 .maxlen = sizeof(int), 2678 2696 .mode = 0644, 2679 2697 .proc_handler = proc_dointvec, 2680 2698 }, 2681 - { 2699 + [NEIGH_VAR_RETRANS_TIME] = { 2682 2700 .procname = "retrans_time", 2683 2701 .maxlen = sizeof(int), 2684 2702 .mode = 0644, 2685 2703 .proc_handler = proc_dointvec_userhz_jiffies, 2686 2704 }, 2687 - { 2705 + [NEIGH_VAR_BASE_REACHABLE_TIME] = { 2688 2706 .procname = "base_reachable_time", 2689 2707 .maxlen = sizeof(int), 2690 2708 .mode = 0644, 2691 2709 .proc_handler = proc_dointvec_jiffies, 2692 2710 }, 2693 - { 2711 + [NEIGH_VAR_DELAY_PROBE_TIME] = { 2694 2712 .procname = "delay_first_probe_time", 2695 2713 .maxlen = sizeof(int), 2696 2714 .mode = 0644, 2697 2715 .proc_handler = proc_dointvec_jiffies, 2698 2716 }, 2699 - { 2717 + [NEIGH_VAR_GC_STALETIME] = { 2700 2718 .procname = "gc_stale_time", 2701 2719 .maxlen = sizeof(int), 2702 2720 .mode = 0644, 2703 2721 .proc_handler = proc_dointvec_jiffies, 2704 2722 }, 2705 - { 2723 + [NEIGH_VAR_QUEUE_LEN] = { 2706 2724 .procname = "unres_qlen", 2725 + .maxlen = sizeof(int), 2726 + .mode = 0644, 2727 + .proc_handler = proc_unres_qlen, 2728 + }, 2729 + [NEIGH_VAR_QUEUE_LEN_BYTES] = { 2730 + .procname = "unres_qlen_bytes", 2707 2731 .maxlen = sizeof(int), 2708 2732 .mode = 0644, 2709 2733 .proc_handler = proc_dointvec, 2710 2734 }, 2711 - { 2735 + [NEIGH_VAR_PROXY_QLEN] = { 2712 2736 .procname = 
"proxy_qlen", 2713 2737 .maxlen = sizeof(int), 2714 2738 .mode = 0644, 2715 2739 .proc_handler = proc_dointvec, 2716 2740 }, 2717 - { 2741 + [NEIGH_VAR_ANYCAST_DELAY] = { 2718 2742 .procname = "anycast_delay", 2719 2743 .maxlen = sizeof(int), 2720 2744 .mode = 0644, 2721 2745 .proc_handler = proc_dointvec_userhz_jiffies, 2722 2746 }, 2723 - { 2747 + [NEIGH_VAR_PROXY_DELAY] = { 2724 2748 .procname = "proxy_delay", 2725 2749 .maxlen = sizeof(int), 2726 2750 .mode = 0644, 2727 2751 .proc_handler = proc_dointvec_userhz_jiffies, 2728 2752 }, 2729 - { 2753 + [NEIGH_VAR_LOCKTIME] = { 2730 2754 .procname = "locktime", 2731 2755 .maxlen = sizeof(int), 2732 2756 .mode = 0644, 2733 2757 .proc_handler = proc_dointvec_userhz_jiffies, 2734 2758 }, 2735 - { 2759 + [NEIGH_VAR_RETRANS_TIME_MS] = { 2736 2760 .procname = "retrans_time_ms", 2737 2761 .maxlen = sizeof(int), 2738 2762 .mode = 0644, 2739 2763 .proc_handler = proc_dointvec_ms_jiffies, 2740 2764 }, 2741 - { 2765 + [NEIGH_VAR_BASE_REACHABLE_TIME_MS] = { 2742 2766 .procname = "base_reachable_time_ms", 2743 2767 .maxlen = sizeof(int), 2744 2768 .mode = 0644, 2745 2769 .proc_handler = proc_dointvec_ms_jiffies, 2746 2770 }, 2747 - { 2771 + [NEIGH_VAR_GC_INTERVAL] = { 2748 2772 .procname = "gc_interval", 2749 2773 .maxlen = sizeof(int), 2750 2774 .mode = 0644, 2751 2775 .proc_handler = proc_dointvec_jiffies, 2752 2776 }, 2753 - { 2777 + [NEIGH_VAR_GC_THRESH1] = { 2754 2778 .procname = "gc_thresh1", 2755 2779 .maxlen = sizeof(int), 2756 2780 .mode = 0644, 2757 2781 .proc_handler = proc_dointvec, 2758 2782 }, 2759 - { 2783 + [NEIGH_VAR_GC_THRESH2] = { 2760 2784 .procname = "gc_thresh2", 2761 2785 .maxlen = sizeof(int), 2762 2786 .mode = 0644, 2763 2787 .proc_handler = proc_dointvec, 2764 2788 }, 2765 - { 2789 + [NEIGH_VAR_GC_THRESH3] = { 2766 2790 .procname = "gc_thresh3", 2767 2791 .maxlen = sizeof(int), 2768 2792 .mode = 0644, ··· 2836 2778 if (!t) 2837 2779 goto err; 2838 2780 2839 - t->neigh_vars[0].data = &p->mcast_probes; 
2840 - t->neigh_vars[1].data = &p->ucast_probes; 2841 - t->neigh_vars[2].data = &p->app_probes; 2842 - t->neigh_vars[3].data = &p->retrans_time; 2843 - t->neigh_vars[4].data = &p->base_reachable_time; 2844 - t->neigh_vars[5].data = &p->delay_probe_time; 2845 - t->neigh_vars[6].data = &p->gc_staletime; 2846 - t->neigh_vars[7].data = &p->queue_len; 2847 - t->neigh_vars[8].data = &p->proxy_qlen; 2848 - t->neigh_vars[9].data = &p->anycast_delay; 2849 - t->neigh_vars[10].data = &p->proxy_delay; 2850 - t->neigh_vars[11].data = &p->locktime; 2851 - t->neigh_vars[12].data = &p->retrans_time; 2852 - t->neigh_vars[13].data = &p->base_reachable_time; 2781 + t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes; 2782 + t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes; 2783 + t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes; 2784 + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time; 2785 + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time; 2786 + t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time; 2787 + t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime; 2788 + t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes; 2789 + t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes; 2790 + t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen; 2791 + t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay; 2792 + t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay; 2793 + t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime; 2794 + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time; 2795 + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time; 2853 2796 2854 2797 if (dev) { 2855 2798 dev_name_source = dev->name; 2856 2799 /* Terminate the table early */ 2857 - memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14])); 2800 + memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0, 2801 + 
sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL])); 2858 2802 } else { 2859 2803 dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname; 2860 - t->neigh_vars[14].data = (int *)(p + 1); 2861 - t->neigh_vars[15].data = (int *)(p + 1) + 1; 2862 - t->neigh_vars[16].data = (int *)(p + 1) + 2; 2863 - t->neigh_vars[17].data = (int *)(p + 1) + 3; 2804 + t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1); 2805 + t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1; 2806 + t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2; 2807 + t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3; 2864 2808 } 2865 2809 2866 2810 2867 2811 if (handler) { 2868 2812 /* RetransTime */ 2869 - t->neigh_vars[3].proc_handler = handler; 2870 - t->neigh_vars[3].extra1 = dev; 2813 + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler; 2814 + t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev; 2871 2815 /* ReachableTime */ 2872 - t->neigh_vars[4].proc_handler = handler; 2873 - t->neigh_vars[4].extra1 = dev; 2816 + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler; 2817 + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev; 2874 2818 /* RetransTime (in milliseconds)*/ 2875 - t->neigh_vars[12].proc_handler = handler; 2876 - t->neigh_vars[12].extra1 = dev; 2819 + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler; 2820 + t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev; 2877 2821 /* ReachableTime (in milliseconds) */ 2878 - t->neigh_vars[13].proc_handler = handler; 2879 - t->neigh_vars[13].extra1 = dev; 2822 + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler; 2823 + t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev; 2880 2824 } 2881 2825 2882 2826 t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
+1 -1
net/decnet/dn_neigh.c
··· 107 107 .gc_staletime = 60 * HZ, 108 108 .reachable_time = 30 * HZ, 109 109 .delay_probe_time = 5 * HZ, 110 - .queue_len = 3, 110 + .queue_len_bytes = 64*1024, 111 111 .ucast_probes = 0, 112 112 .app_probes = 0, 113 113 .mcast_probes = 0,
+1 -1
net/ipv4/arp.c
··· 177 177 .gc_staletime = 60 * HZ, 178 178 .reachable_time = 30 * HZ, 179 179 .delay_probe_time = 5 * HZ, 180 - .queue_len = 3, 180 + .queue_len_bytes = 64*1024, 181 181 .ucast_probes = 3, 182 182 .mcast_probes = 3, 183 183 .anycast_delay = 1 * HZ,
+1 -1
net/ipv6/ndisc.c
··· 141 141 .gc_staletime = 60 * HZ, 142 142 .reachable_time = ND_REACHABLE_TIME, 143 143 .delay_probe_time = 5 * HZ, 144 - .queue_len = 3, 144 + .queue_len_bytes = 64*1024, 145 145 .ucast_probes = 3, 146 146 .mcast_probes = 3, 147 147 .anycast_delay = 1 * HZ,