Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net_sched: sch_fq: add horizon attribute

QUIC servers would like to use SO_TXTIME, without having CAP_NET_ADMIN,
to efficiently pace UDP packets.

As far as sch_fq is concerned, we need to add safety checks, so
that a buggy application does not fill the qdisc with packets
having delivery time far in the future.

This patch adds a configurable horizon (default: 10 seconds),
and a configurable policy when a packet is beyond the horizon
at enqueue() time:
- either drop the packet (default policy)
- or cap its delivery time to the horizon.

$ tc -s -d qd sh dev eth0
qdisc fq 8022: root refcnt 257 limit 10000p flow_limit 100p buckets 1024
orphan_mask 1023 quantum 10Kb initial_quantum 51160b low_rate_threshold 550Kbit
refill_delay 40.0ms timer_slack 10.000us horizon 10.000s
Sent 1234215879 bytes 837099 pkt (dropped 21, overlimits 0 requeues 6)
backlog 0b 0p requeues 6
flows 1191 (inactive 1177 throttled 0)
gc 0 highprio 0 throttled 692 latency 11.480us
pkts_too_long 0 alloc_errors 0 horizon_drops 21 horizon_caps 0

v2: fixed an overflow on 32bit kernels in fq_init(), reported
by kbuild test robot <lkp@intel.com>

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Eric Dumazet; committed by David S. Miller.
39d01050 bf6dba76

+60 -5
+6
include/uapi/linux/pkt_sched.h
··· 913 913 914 914 TCA_FQ_TIMER_SLACK, /* timer slack */ 915 915 916 + TCA_FQ_HORIZON, /* time horizon in us */ 917 + 918 + TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */ 919 + 916 920 __TCA_FQ_MAX 917 921 }; 918 922 ··· 936 932 __u32 throttled_flows; 937 933 __u32 unthrottle_latency_ns; 938 934 __u64 ce_mark; /* packets above ce_threshold */ 935 + __u64 horizon_drops; 936 + __u64 horizon_caps; 939 937 }; 940 938 941 939 /* Heavy-Hitter Filter */
+54 -5
net/sched/sch_fq.c
··· 100 100 101 101 struct rb_root delayed; /* for rate limited flows */ 102 102 u64 time_next_delayed_flow; 103 + u64 ktime_cache; /* copy of last ktime_get_ns() */ 103 104 unsigned long unthrottle_latency_ns; 104 105 105 106 struct fq_flow internal; /* for non classified or high prio packets */ ··· 110 109 u32 flow_plimit; /* max packets per flow */ 111 110 unsigned long flow_max_rate; /* optional max rate per flow */ 112 111 u64 ce_threshold; 112 + u64 horizon; /* horizon in ns */ 113 113 u32 orphan_mask; /* mask for orphaned skb */ 114 114 u32 low_rate_threshold; 115 115 struct rb_root *fq_root; 116 116 u8 rate_enable; 117 117 u8 fq_trees_log; 118 - 118 + u8 horizon_drop; 119 119 u32 flows; 120 120 u32 inactive_flows; 121 121 u32 throttled_flows; ··· 125 123 u64 stat_internal_packets; 126 124 u64 stat_throttled; 127 125 u64 stat_ce_mark; 126 + u64 stat_horizon_drops; 127 + u64 stat_horizon_caps; 128 128 u64 stat_flows_plimit; 129 129 u64 stat_pkts_too_long; 130 130 u64 stat_allocation_errors; ··· 406 402 struct rb_node **p, *parent; 407 403 struct sk_buff *head, *aux; 408 404 409 - fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns(); 410 - 411 405 head = flow->head; 412 406 if (!head || 413 407 fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { ··· 433 431 rb_insert_color(&skb->rbnode, &flow->t_root); 434 432 } 435 433 434 + static bool fq_packet_beyond_horizon(const struct sk_buff *skb, 435 + const struct fq_sched_data *q) 436 + { 437 + return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); 438 + } 439 + 436 440 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, 437 441 struct sk_buff **to_free) 438 442 { ··· 447 439 448 440 if (unlikely(sch->q.qlen >= sch->limit)) 449 441 return qdisc_drop(skb, sch, to_free); 442 + 443 + if (!skb->tstamp) { 444 + fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); 445 + } else { 446 + /* Check if packet timestamp is too far in the future. 
447 + * Try first if our cached value, to avoid ktime_get_ns() 448 + * cost in most cases. 449 + */ 450 + if (fq_packet_beyond_horizon(skb, q)) { 451 + /* Refresh our cache and check another time */ 452 + q->ktime_cache = ktime_get_ns(); 453 + if (fq_packet_beyond_horizon(skb, q)) { 454 + if (q->horizon_drop) { 455 + q->stat_horizon_drops++; 456 + return qdisc_drop(skb, sch, to_free); 457 + } 458 + q->stat_horizon_caps++; 459 + skb->tstamp = q->ktime_cache + q->horizon; 460 + } 461 + } 462 + fq_skb_cb(skb)->time_to_send = skb->tstamp; 463 + } 450 464 451 465 f = fq_classify(skb, q); 452 466 if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { ··· 542 512 goto out; 543 513 } 544 514 545 - now = ktime_get_ns(); 515 + q->ktime_cache = now = ktime_get_ns(); 546 516 fq_check_throttled(q, now); 547 517 begin: 548 518 head = &q->new_flows; ··· 795 765 [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, 796 766 [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, 797 767 [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, 768 + [TCA_FQ_HORIZON] = { .type = NLA_U32 }, 769 + [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, 798 770 }; 799 771 800 772 static int fq_change(struct Qdisc *sch, struct nlattr *opt, ··· 886 854 if (tb[TCA_FQ_TIMER_SLACK]) 887 855 q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); 888 856 857 + if (tb[TCA_FQ_HORIZON]) 858 + q->horizon = (u64)NSEC_PER_USEC * 859 + nla_get_u32(tb[TCA_FQ_HORIZON]); 860 + 861 + if (tb[TCA_FQ_HORIZON_DROP]) 862 + q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); 863 + 889 864 if (!err) { 865 + 890 866 sch_tree_unlock(sch); 891 867 err = fq_resize(sch, fq_log); 892 868 sch_tree_lock(sch); ··· 947 907 948 908 q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ 949 909 910 + q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ 911 + q->horizon_drop = 1; /* by default, drop packets beyond horizon */ 912 + 950 913 /* Default ce_threshold of 4294 seconds */ 951 914 q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; 952 915 
··· 967 924 { 968 925 struct fq_sched_data *q = qdisc_priv(sch); 969 926 u64 ce_threshold = q->ce_threshold; 927 + u64 horizon = q->horizon; 970 928 struct nlattr *opts; 971 929 972 930 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); ··· 977 933 /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ 978 934 979 935 do_div(ce_threshold, NSEC_PER_USEC); 936 + do_div(horizon, NSEC_PER_USEC); 980 937 981 938 if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || 982 939 nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || ··· 993 948 q->low_rate_threshold) || 994 949 nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || 995 950 nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || 996 - nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack)) 951 + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || 952 + nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || 953 + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) 997 954 goto nla_put_failure; 998 955 999 956 return nla_nest_end(skb, opts); ··· 1026 979 st.unthrottle_latency_ns = min_t(unsigned long, 1027 980 q->unthrottle_latency_ns, ~0U); 1028 981 st.ce_mark = q->stat_ce_mark; 982 + st.horizon_drops = q->stat_horizon_drops; 983 + st.horizon_caps = q->stat_horizon_caps; 1029 984 sch_tree_unlock(sch); 1030 985 1031 986 return gnet_stats_copy_app(d, &st, sizeof(st));