net, sched: fix panic when updating miniq {b,q}stats

While working on fixing another bug, I ran into the following panic
on arm64 by simply attaching clsact qdisc, adding a filter and running
traffic on ingress to it:

[...]
[ 178.188591] Unable to handle kernel read from unreadable memory at virtual address 810fb501f000
[ 178.197314] Mem abort info:
[ 178.200121] ESR = 0x96000004
[ 178.203168] Exception class = DABT (current EL), IL = 32 bits
[ 178.209095] SET = 0, FnV = 0
[ 178.212157] EA = 0, S1PTW = 0
[ 178.215288] Data abort info:
[ 178.218175] ISV = 0, ISS = 0x00000004
[ 178.222019] CM = 0, WnR = 0
[ 178.224997] user pgtable: 4k pages, 48-bit VAs, pgd = 0000000023cb3f33
[ 178.231531] [0000810fb501f000] *pgd=0000000000000000
[ 178.236508] Internal error: Oops: 96000004 [#1] SMP
[...]
[ 178.311855] CPU: 73 PID: 2497 Comm: ping Tainted: G W 4.15.0-rc7+ #5
[ 178.319413] Hardware name: FOXCONN R2-1221R-A4/C2U4N_MB, BIOS G31FB18A 03/31/2017
[ 178.326887] pstate: 60400005 (nZCv daif +PAN -UAO)
[ 178.331685] pc : __netif_receive_skb_core+0x49c/0xac8
[ 178.336728] lr : __netif_receive_skb+0x28/0x78
[ 178.341161] sp : ffff00002344b750
[ 178.344465] x29: ffff00002344b750 x28: ffff810fbdfd0580
[ 178.349769] x27: 0000000000000000 x26: ffff000009378000
[...]
[ 178.418715] x1 : 0000000000000054 x0 : 0000000000000000
[ 178.424020] Process ping (pid: 2497, stack limit = 0x000000009f0a3ff4)
[ 178.430537] Call trace:
[ 178.432976] __netif_receive_skb_core+0x49c/0xac8
[ 178.437670] __netif_receive_skb+0x28/0x78
[ 178.441757] process_backlog+0x9c/0x160
[ 178.445584] net_rx_action+0x2f8/0x3f0
[...]

Reason is that sch_ingress and sch_clsact are doing mini_qdisc_pair_init()
which sets up miniq pointers to cpu_{b,q}stats from the underlying qdisc.
Problem is that this cannot work since they are actually set up right after
the qdisc ->init() callback in qdisc_create(), so the first packet going into
sch_handle_ingress() tries to call mini_qdisc_bstats_cpu_update() and we
therefore panic.

In order to fix this, allocation of {b,q}stats needs to happen before we
call into ->init(). In net-next, there's already such option through commit
d59f5ffa59d8 ("net: sched: a dflt qdisc may be used with per cpu stats").
However, the bug still needs to be fixed in net for 4.15. Thus, include
these bits to reduce any merge churn and reuse the static_flags field to
set TCQ_F_CPUSTATS, and remove the allocation from qdisc_create() since
there is no other user left. Prashant Bhole ran into the same issue but
for net-next, thus adding him below as well as co-author. Same issue was
also reported by Sandipan Das when using bcc.

Fixes: 46209401f8f6 ("net: core: introduce mini_Qdisc and eliminate usage of tp->q for clsact fastpath")
Reference: https://lists.iovisor.org/pipermail/iovisor-dev/2018-January/001190.html
Reported-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Co-authored-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Co-authored-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>

+24 -30
+2
include/net/sch_generic.h
··· 179 const struct Qdisc_class_ops *cl_ops; 180 char id[IFNAMSIZ]; 181 int priv_size; 182 183 int (*enqueue)(struct sk_buff *skb, 184 struct Qdisc *sch, ··· 445 unsigned int len); 446 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, 447 const struct Qdisc_ops *ops); 448 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, 449 const struct Qdisc_ops *ops, u32 parentid); 450 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
··· 179 const struct Qdisc_class_ops *cl_ops; 180 char id[IFNAMSIZ]; 181 int priv_size; 182 + unsigned int static_flags; 183 184 int (*enqueue)(struct sk_buff *skb, 185 struct Qdisc *sch, ··· 444 unsigned int len); 445 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, 446 const struct Qdisc_ops *ops); 447 + void qdisc_free(struct Qdisc *qdisc); 448 struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, 449 const struct Qdisc_ops *ops, u32 parentid); 450 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
+1 -14
net/sched/sch_api.c
··· 1063 } 1064 1065 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { 1066 - if (qdisc_is_percpu_stats(sch)) { 1067 - sch->cpu_bstats = 1068 - netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); 1069 - if (!sch->cpu_bstats) 1070 - goto err_out4; 1071 - 1072 - sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); 1073 - if (!sch->cpu_qstats) 1074 - goto err_out4; 1075 - } 1076 - 1077 if (tca[TCA_STAB]) { 1078 stab = qdisc_get_stab(tca[TCA_STAB]); 1079 if (IS_ERR(stab)) { ··· 1104 ops->destroy(sch); 1105 err_out3: 1106 dev_put(dev); 1107 - kfree((char *) sch - sch->padded); 1108 err_out2: 1109 module_put(ops->owner); 1110 err_out: ··· 1112 return NULL; 1113 1114 err_out4: 1115 - free_percpu(sch->cpu_bstats); 1116 - free_percpu(sch->cpu_qstats); 1117 /* 1118 * Any broken qdiscs that would require a ops->reset() here? 1119 * The qdisc was never in action so it shouldn't be necessary.
··· 1063 } 1064 1065 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) { 1066 if (tca[TCA_STAB]) { 1067 stab = qdisc_get_stab(tca[TCA_STAB]); 1068 if (IS_ERR(stab)) { ··· 1115 ops->destroy(sch); 1116 err_out3: 1117 dev_put(dev); 1118 + qdisc_free(sch); 1119 err_out2: 1120 module_put(ops->owner); 1121 err_out: ··· 1123 return NULL; 1124 1125 err_out4: 1126 /* 1127 * Any broken qdiscs that would require a ops->reset() here? 1128 * The qdisc was never in action so it shouldn't be necessary.
+17 -1
net/sched/sch_generic.c
··· 633 qdisc_skb_head_init(&sch->q); 634 spin_lock_init(&sch->q.lock); 635 636 spin_lock_init(&sch->busylock); 637 lockdep_set_class(&sch->busylock, 638 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); ··· 655 dev->qdisc_running_key ?: &qdisc_running_key); 656 657 sch->ops = ops; 658 sch->enqueue = ops->enqueue; 659 sch->dequeue = ops->dequeue; 660 sch->dev_queue = dev_queue; ··· 663 refcount_set(&sch->refcnt, 1); 664 665 return sch; 666 errout: 667 return ERR_PTR(err); 668 } ··· 714 } 715 EXPORT_SYMBOL(qdisc_reset); 716 717 - static void qdisc_free(struct Qdisc *qdisc) 718 { 719 if (qdisc_is_percpu_stats(qdisc)) { 720 free_percpu(qdisc->cpu_bstats);
··· 633 qdisc_skb_head_init(&sch->q); 634 spin_lock_init(&sch->q.lock); 635 636 + if (ops->static_flags & TCQ_F_CPUSTATS) { 637 + sch->cpu_bstats = 638 + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); 639 + if (!sch->cpu_bstats) 640 + goto errout1; 641 + 642 + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); 643 + if (!sch->cpu_qstats) { 644 + free_percpu(sch->cpu_bstats); 645 + goto errout1; 646 + } 647 + } 648 + 649 spin_lock_init(&sch->busylock); 650 lockdep_set_class(&sch->busylock, 651 dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); ··· 642 dev->qdisc_running_key ?: &qdisc_running_key); 643 644 sch->ops = ops; 645 + sch->flags = ops->static_flags; 646 sch->enqueue = ops->enqueue; 647 sch->dequeue = ops->dequeue; 648 sch->dev_queue = dev_queue; ··· 649 refcount_set(&sch->refcnt, 1); 650 651 return sch; 652 + errout1: 653 + kfree(p); 654 errout: 655 return ERR_PTR(err); 656 } ··· 698 } 699 EXPORT_SYMBOL(qdisc_reset); 700 701 + void qdisc_free(struct Qdisc *qdisc) 702 { 703 if (qdisc_is_percpu_stats(qdisc)) { 704 free_percpu(qdisc->cpu_bstats);
+4 -15
net/sched/sch_ingress.c
··· 66 { 67 struct ingress_sched_data *q = qdisc_priv(sch); 68 struct net_device *dev = qdisc_dev(sch); 69 - int err; 70 71 net_inc_ingress_queue(); 72 ··· 75 q->block_info.chain_head_change = clsact_chain_head_change; 76 q->block_info.chain_head_change_priv = &q->miniqp; 77 78 - err = tcf_block_get_ext(&q->block, sch, &q->block_info); 79 - if (err) 80 - return err; 81 - 82 - sch->flags |= TCQ_F_CPUSTATS; 83 - 84 - return 0; 85 } 86 87 static void ingress_destroy(struct Qdisc *sch) ··· 114 .cl_ops = &ingress_class_ops, 115 .id = "ingress", 116 .priv_size = sizeof(struct ingress_sched_data), 117 .init = ingress_init, 118 .destroy = ingress_destroy, 119 .dump = ingress_dump, ··· 186 q->egress_block_info.chain_head_change = clsact_chain_head_change; 187 q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; 188 189 - err = tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); 190 - if (err) 191 - return err; 192 - 193 - sch->flags |= TCQ_F_CPUSTATS; 194 - 195 - return 0; 196 } 197 198 static void clsact_destroy(struct Qdisc *sch) ··· 213 .cl_ops = &clsact_class_ops, 214 .id = "clsact", 215 .priv_size = sizeof(struct clsact_sched_data), 216 .init = clsact_init, 217 .destroy = clsact_destroy, 218 .dump = ingress_dump,
··· 66 { 67 struct ingress_sched_data *q = qdisc_priv(sch); 68 struct net_device *dev = qdisc_dev(sch); 69 70 net_inc_ingress_queue(); 71 ··· 76 q->block_info.chain_head_change = clsact_chain_head_change; 77 q->block_info.chain_head_change_priv = &q->miniqp; 78 79 + return tcf_block_get_ext(&q->block, sch, &q->block_info); 80 } 81 82 static void ingress_destroy(struct Qdisc *sch) ··· 121 .cl_ops = &ingress_class_ops, 122 .id = "ingress", 123 .priv_size = sizeof(struct ingress_sched_data), 124 + .static_flags = TCQ_F_CPUSTATS, 125 .init = ingress_init, 126 .destroy = ingress_destroy, 127 .dump = ingress_dump, ··· 192 q->egress_block_info.chain_head_change = clsact_chain_head_change; 193 q->egress_block_info.chain_head_change_priv = &q->miniqp_egress; 194 195 + return tcf_block_get_ext(&q->egress_block, sch, &q->egress_block_info); 196 } 197 198 static void clsact_destroy(struct Qdisc *sch) ··· 225 .cl_ops = &clsact_class_ops, 226 .id = "clsact", 227 .priv_size = sizeof(struct clsact_sched_data), 228 + .static_flags = TCQ_F_CPUSTATS, 229 .init = clsact_init, 230 .destroy = clsact_destroy, 231 .dump = ingress_dump,