1// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
2/* Copyright (C) 2024 Nokia
3 *
4 * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
5 * Author: Olga Albisser <olga@albisser.org>
6 * Author: Henrik Steen <henrist@henrist.net>
7 * Author: Olivier Tilmans <olivier.tilmans@nokia.com>
8 * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
9 *
10 * DualPI Improved with a Square (dualpi2):
11 * - Supports congestion controls that comply with the Prague requirements
12 * in RFC9331 (e.g. TCP-Prague)
13 * - Supports coupled dual-queue with PI2 as defined in RFC9332
14 * - Supports ECN L4S-identifier (IP.ECN==0b*1)
15 *
16 * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
17 * they do not meet the 'Prague L4S Requirements' listed in RFC 9331
18 * Section 4, so they can only be used with DualPI2 in a datacenter
19 * context.
20 *
21 * References:
22 * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332
23 * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and
24 * scalable TCP." in proc. ACM CoNEXT'16, 2016.
25 */
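
/* Informal overview (illustrative; the symbols p', p_C, p_L and k are just
 * notation, not code identifiers): the coupling between both queues follows
 * RFC9332. With p' the internal PI2 probability maintained by this qdisc:
 *
 *	p_C = p' * p'			(C-queue, two independent rolls)
 *	p_L = min(k * p', 100%)		(L-queue, k == coupling_factor)
 *
 * For example, with the default k = 2 and p' = 10%, classic traffic sees a
 * ~1% drop/mark probability while L4S traffic sees a ~20% CE-mark
 * probability. See dualpi2_classic_marking() and dualpi2_scalable_marking()
 * for the actual implementation.
 */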
26
27#include <linux/errno.h>
28#include <linux/hrtimer.h>
29#include <linux/if_vlan.h>
30#include <linux/kernel.h>
31#include <linux/limits.h>
32#include <linux/module.h>
33#include <linux/skbuff.h>
34#include <linux/types.h>
35
36#include <net/gso.h>
37#include <net/inet_ecn.h>
38#include <net/pkt_cls.h>
39#include <net/pkt_sched.h>
40
/* Using 32b probabilities supports flows with windows up to ~8.6 * 1e9
 * packets, i.e., twice the maximal snd_cwnd.
 * MAX_PROB must be consistent with the RNG in dualpi2_roll().
 */
45#define MAX_PROB U32_MAX
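
/* Informal example: probabilities are 32b fixed-point fractions of MAX_PROB,
 * e.g., a 25% marking probability is roughly MAX_PROB / 4. The resulting
 * granularity of about 1/2^32 corresponds to roughly one mark per ~4.3e9
 * packets; dualpi2_roll() compares such a value directly against
 * get_random_u32().
 */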
46
47/* alpha/beta values exchanged over netlink are in units of 256ns */
48#define ALPHA_BETA_SHIFT 8
49
/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
 * computations. Consequently (see dualpi2_scale_alpha_beta()), their
 * netlink-provided values can use at most 31b, i.e., be at most (2^31)-1
 * (~8.4MHz as those are given in 1/256th). This makes it possible to tune
 * alpha/beta to control flows whose maximal RTTs range from a few usec up to
 * a few secs.
 */
56#define ALPHA_BETA_MAX ((1U << 31) - 1)
57
/* Internal alpha/beta are in units of 64ns.
 * This makes it possible to use all alpha/beta values in the allowed range
 * without loss of precision due to rounding when scaling them internally,
 * e.g., dualpi2_scale_alpha_beta(1) will not round down to 0.
 */
63#define ALPHA_BETA_GRANULARITY 6
64
65#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
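
/* Worked example (approximate values, for illustration only): the default
 * alpha of 41 (~0.16Hz * 256, see dualpi2_reset_default()) is scaled as
 *
 *	dualpi2_scale_alpha_beta(41)
 *		= ((41 * MAX_PROB) >> ALPHA_BETA_SCALING) / NSEC_PER_SEC
 *		~= 44
 *
 * while the default beta of 819 (~3.2Hz * 256) scales to ~879. The netlink
 * form is recovered by dualpi2_unscale_alpha_beta().
 */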
66
67/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
68#define MAX_WC 100
69
70struct dualpi2_sched_data {
71 struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */
72 struct Qdisc *sch; /* The Classic queue (C-queue) */
73
74 /* Registered tc filters */
75 struct tcf_proto __rcu *tcf_filters;
76 struct tcf_block *tcf_block;
77
78 /* PI2 parameters */
79 u64 pi2_target; /* Target delay in nanoseconds */
	u32	pi2_tupdate;	/* Timer period in nanoseconds */
81 u32 pi2_prob; /* Base PI probability */
82 u32 pi2_alpha; /* Gain factor for the integral rate response */
83 u32 pi2_beta; /* Gain factor for the proportional response */
84 struct hrtimer pi2_timer; /* prob update timer */
85
86 /* Step AQM (L-queue only) parameters */
87 u32 step_thresh; /* Step threshold */
88 bool step_in_packets; /* Step thresh in packets (1) or time (0) */
89
90 /* C-queue starvation protection */
91 s32 c_protection_credit; /* Credit (sign indicates which queue) */
92 s32 c_protection_init; /* Reset value of the credit */
93 u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */
94 u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */
95
96 /* General dualQ parameters */
97 u32 memory_limit; /* Memory limit of both queues */
98 u8 coupling_factor;/* Coupling factor (k) between both queues */
99 u8 ecn_mask; /* Mask to match packets into L-queue */
100 u32 min_qlen_step; /* Minimum queue length to apply step thresh */
101 bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */
102 bool drop_overload; /* Drop (1) on overload, or overflow (0) */
103 bool split_gso; /* Split aggregated skb (1) or leave as is (0) */
104
105 /* Statistics */
106 u64 c_head_ts; /* Enqueue timestamp of the C-queue head */
107 u64 l_head_ts; /* Enqueue timestamp of the L-queue head */
108 u64 last_qdelay; /* Q delay val at the last probability update */
109 u32 packets_in_c; /* Enqueue packet counter of the C-queue */
110 u32 packets_in_l; /* Enqueue packet counter of the L-queue */
111 u32 maxq; /* Maximum queue size of the C-queue */
112 u32 ecn_mark; /* ECN mark pkt counter due to PI probability */
113 u32 step_marks; /* ECN mark pkt counter due to step AQM */
114 u32 memory_used; /* Memory used of both queues */
115 u32 max_memory_used;/* Maximum used memory */
116
117 /* Deferred drop statistics */
118 u32 deferred_drops_cnt; /* Packets dropped */
119 u32 deferred_drops_len; /* Bytes dropped */
120};
121
122struct dualpi2_skb_cb {
123 u64 ts; /* Timestamp at enqueue */
124 u8 apply_step:1, /* Can we apply the step threshold */
125 classified:2, /* Packet classification results */
126 ect:2; /* Packet ECT codepoint */
127};
128
129enum dualpi2_classification_results {
130 DUALPI2_C_CLASSIC = 0, /* C-queue */
131 DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */
132 DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */
133 __DUALPI2_C_MAX /* Keep last*/
134};
135
136static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb)
137{
138 qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb));
139 return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data;
140}
141
142static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference)
143{
144 return reference - dualpi2_skb_cb(skb)->ts;
145}
146
147static u64 head_enqueue_time(struct Qdisc *q)
148{
149 struct sk_buff *skb = qdisc_peek_head(q);
150
151 return skb ? dualpi2_skb_cb(skb)->ts : 0;
152}
153
154static u32 dualpi2_scale_alpha_beta(u32 param)
155{
156 u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
157
158 do_div(tmp, NSEC_PER_SEC);
159 return tmp;
160}
161
162static u32 dualpi2_unscale_alpha_beta(u32 param)
163{
164 u64 tmp = ((u64)param * NSEC_PER_SEC << ALPHA_BETA_SCALING);
165
166 do_div(tmp, MAX_PROB);
167 return tmp;
168}
169
170static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
171{
172 return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
173}
174
175static bool skb_is_l4s(struct sk_buff *skb)
176{
177 return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S;
178}
179
180static bool skb_in_l_queue(struct sk_buff *skb)
181{
182 return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC;
183}
184
185static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q)
186{
187 return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step;
188}
189
190static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb)
191{
192 if (INET_ECN_set_ce(skb)) {
193 q->ecn_mark++;
194 return true;
195 }
196 return false;
197}
198
199static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
200{
201 q->c_protection_credit = q->c_protection_init;
202}
203
204/* This computes the initial credit value and WRR weight for the L queue (wl)
205 * from the weight of the C queue (wc).
206 * If wl > wc, the scheduler will start with the L queue when reset.
207 */
208static void dualpi2_calculate_c_protection(struct Qdisc *sch,
209 struct dualpi2_sched_data *q, u32 wc)
210{
211 q->c_protection_wc = wc;
212 q->c_protection_wl = MAX_WC - wc;
213 q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
214 ((int)q->c_protection_wc - (int)q->c_protection_wl);
215 dualpi2_reset_c_protection(q);
216}
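
/* Worked example (illustrative numbers only): with the default wc = 10 and a
 * psched MTU of 1500B, wl = 90 and the credit is reset to
 *
 *	c_protection_init = 1500 * (10 - 90) = -120000
 *
 * which is negative, so dequeue_packet() will start with the L-queue after a
 * reset, as noted in the comment above (wl > wc).
 */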
217
218static bool dualpi2_roll(u32 prob)
219{
220 return get_random_u32() <= prob;
221}
222
/* Packets in the C-queue are subject to a marking probability pC, which is
 * the square of the internal PI probability (i.e., they see an overall lower
 * mark/drop probability). If the qdisc is overloaded, ignore ECT values and
 * only drop.
 *
 * Note that this marking scheme is also applied to L4S packets during
 * overload.
 * Returns true if the packet must be dropped from the C-queue.
 */
230static bool dualpi2_classic_marking(struct dualpi2_sched_data *q,
231 struct sk_buff *skb, u32 prob,
232 bool overload)
233{
234 if (dualpi2_roll(prob) && dualpi2_roll(prob)) {
235 if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
236 return true;
237 dualpi2_mark(q, skb);
238 }
239 return false;
240}
241
242/* Packets in the L-queue are subject to a marking probability pL given by the
243 * internal PI probability scaled by the coupling factor.
244 *
245 * On overload (i.e., @local_l_prob is >= 100%):
246 * - if the qdisc is configured to trade losses to preserve latency (i.e.,
247 * @q->drop_overload), apply classic drops first before marking.
248 * - otherwise, preserve the "no loss" property of ECN at the cost of queueing
249 * delay, eventually resulting in taildrop behavior once sch->limit is
250 * reached.
 * Returns true if the packet must be dropped from the L-queue.
252 */
253static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q,
254 struct sk_buff *skb,
255 u64 local_l_prob, u32 prob,
256 bool overload)
257{
258 if (overload) {
259 /* Apply classic drop */
260 if (!q->drop_overload ||
261 !(dualpi2_roll(prob) && dualpi2_roll(prob)))
262 goto mark;
263 return true;
264 }
265
266 /* We can safely cut the upper 32b as overload==false */
267 if (dualpi2_roll(local_l_prob)) {
		/* Filters may have classified non-ECT packets as L4S. */
269 if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT)
270 return true;
271mark:
272 dualpi2_mark(q, skb);
273 }
274 return false;
275}
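
/* Informal note: "overload" above means that local_l_prob, i.e.,
 * prob * coupling_factor, exceeded MAX_PROB: with the default
 * coupling_factor of 2 the qdisc is considered overloaded once the internal
 * PI2 probability exceeds 50%. At that point, either classic
 * (squared-probability) drops are applied first (q->drop_overload), or
 * L-queue packets simply keep being CE-marked at the cost of queuing delay.
 */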
276
277/* Decide whether a given packet must be dropped (or marked if ECT), according
278 * to the PI2 probability.
279 *
280 * Never mark/drop if we have a standing queue of less than 2 MTUs.
281 */
282static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q,
283 struct sk_buff *skb)
284{
285 u64 local_l_prob;
286 bool overload;
287 u32 prob;
288
289 if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch)))
290 return false;
291
292 prob = READ_ONCE(q->pi2_prob);
293 local_l_prob = (u64)prob * q->coupling_factor;
294 overload = local_l_prob > MAX_PROB;
295
296 switch (dualpi2_skb_cb(skb)->classified) {
297 case DUALPI2_C_CLASSIC:
298 return dualpi2_classic_marking(q, skb, prob, overload);
299 case DUALPI2_C_L4S:
300 return dualpi2_scalable_marking(q, skb, local_l_prob, prob,
301 overload);
302 default: /* DUALPI2_C_LLLL */
303 return false;
304 }
305}
306
307static void dualpi2_read_ect(struct sk_buff *skb)
308{
309 struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
310 int wlen = skb_network_offset(skb);
311
312 switch (skb_protocol(skb, true)) {
313 case htons(ETH_P_IP):
314 wlen += sizeof(struct iphdr);
315 if (!pskb_may_pull(skb, wlen) ||
316 skb_try_make_writable(skb, wlen))
317 goto not_ecn;
318
319 cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK;
320 break;
321 case htons(ETH_P_IPV6):
322 wlen += sizeof(struct ipv6hdr);
323 if (!pskb_may_pull(skb, wlen) ||
324 skb_try_make_writable(skb, wlen))
325 goto not_ecn;
326
327 cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK;
328 break;
329 default:
330 goto not_ecn;
331 }
332 return;
333
334not_ecn:
	/* Non-pullable/writable packets can only be dropped, hence they are
	 * classified as not ECT.
	 */
338 cb->ect = INET_ECN_NOT_ECT;
339}
340
341static int dualpi2_skb_classify(struct dualpi2_sched_data *q,
342 struct sk_buff *skb)
343{
344 struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb);
345 struct tcf_result res;
346 struct tcf_proto *fl;
347 int result;
348
349 dualpi2_read_ect(skb);
350 if (cb->ect & q->ecn_mask) {
351 cb->classified = DUALPI2_C_L4S;
352 return NET_XMIT_SUCCESS;
353 }
354
355 if (TC_H_MAJ(skb->priority) == q->sch->handle &&
356 TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) {
357 cb->classified = TC_H_MIN(skb->priority);
358 return NET_XMIT_SUCCESS;
359 }
360
361 fl = rcu_dereference_bh(q->tcf_filters);
362 if (!fl) {
363 cb->classified = DUALPI2_C_CLASSIC;
364 return NET_XMIT_SUCCESS;
365 }
366
367 result = tcf_classify(skb, NULL, fl, &res, false);
368 if (result >= 0) {
369#ifdef CONFIG_NET_CLS_ACT
370 switch (result) {
371 case TC_ACT_STOLEN:
372 case TC_ACT_QUEUED:
373 case TC_ACT_TRAP:
374 return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
375 case TC_ACT_SHOT:
376 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
377 }
378#endif
379 cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ?
380 TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC;
381 }
382 return NET_XMIT_SUCCESS;
383}
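
/* Informal example of the default classification: with ecn_mask set to
 * TC_DUALPI2_ECN_MASK_L4S_ECT (i.e., INET_ECN_ECT_1), ECT(1) and CE packets
 * match the mask and always go to the L-queue, while Not-ECT and ECT(0)
 * packets go to the C-queue unless skb->priority or a tc filter steers them
 * into another queue.
 */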
384
385static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch,
386 struct sk_buff **to_free)
387{
388 struct dualpi2_sched_data *q = qdisc_priv(sch);
389 struct dualpi2_skb_cb *cb;
390
391 if (unlikely(qdisc_qlen(sch) >= sch->limit) ||
392 unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) {
393 qdisc_qstats_overlimit(sch);
394 if (skb_in_l_queue(skb))
395 qdisc_qstats_overlimit(q->l_queue);
396 return qdisc_drop_reason(skb, sch, to_free,
397 SKB_DROP_REASON_QDISC_OVERLIMIT);
398 }
399
400 if (q->drop_early && must_drop(sch, q, skb)) {
401 qdisc_drop_reason(skb, sch, to_free,
402 SKB_DROP_REASON_QDISC_CONGESTED);
403 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
404 }
405
406 cb = dualpi2_skb_cb(skb);
407 cb->ts = ktime_get_ns();
408 q->memory_used += skb->truesize;
409 if (q->memory_used > q->max_memory_used)
410 q->max_memory_used = q->memory_used;
411
412 if (qdisc_qlen(sch) > q->maxq)
413 q->maxq = qdisc_qlen(sch);
414
415 if (skb_in_l_queue(skb)) {
416 /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */
417 dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q);
418
419 /* Keep the overall qdisc stats consistent */
420 ++sch->q.qlen;
421 qdisc_qstats_backlog_inc(sch, skb);
422 ++q->packets_in_l;
423 if (!q->l_head_ts)
424 q->l_head_ts = cb->ts;
425 return qdisc_enqueue_tail(skb, q->l_queue);
426 }
427 ++q->packets_in_c;
428 if (!q->c_head_ts)
429 q->c_head_ts = cb->ts;
430 return qdisc_enqueue_tail(skb, sch);
431}
432
433/* By default, dualpi2 will split GSO skbs into independent skbs and enqueue
434 * each of those individually. This yields the following benefits, at the
435 * expense of CPU usage:
436 * - Finer-grained AQM actions as the sub-packets of a burst no longer share the
437 * same fate (e.g., the random mark/drop probability is applied individually)
438 * - Improved precision of the starvation protection/WRR scheduler at dequeue,
439 * as the size of the dequeued packets will be smaller.
440 */
441static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
442 struct sk_buff **to_free)
443{
444 struct dualpi2_sched_data *q = qdisc_priv(sch);
445 int err;
446
447 err = dualpi2_skb_classify(q, skb);
448 if (err != NET_XMIT_SUCCESS) {
449 if (err & __NET_XMIT_BYPASS)
450 qdisc_qstats_drop(sch);
451 __qdisc_drop(skb, to_free);
452 return err;
453 }
454
455 if (q->split_gso && skb_is_gso(skb)) {
456 netdev_features_t features;
457 struct sk_buff *nskb, *next;
458 int cnt, byte_len, orig_len;
459 int err;
460
461 features = netif_skb_features(skb);
462 nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
463 if (IS_ERR_OR_NULL(nskb))
464 return qdisc_drop(skb, sch, to_free);
465
466 cnt = 1;
467 byte_len = 0;
468 orig_len = qdisc_pkt_len(skb);
469 skb_list_walk_safe(nskb, nskb, next) {
470 skb_mark_not_on_list(nskb);
471
472 /* Iterate through GSO fragments of an skb:
473 * (1) Set pkt_len from the single GSO fragments
474 * (2) Copy classified and ect values of an skb
475 * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb
476 */
477 qdisc_skb_cb(nskb)->pkt_len = nskb->len;
478 qdisc_skb_cb(nskb)->pkt_segs = 1;
479 dualpi2_skb_cb(nskb)->classified =
480 dualpi2_skb_cb(skb)->classified;
481 dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect;
482 err = dualpi2_enqueue_skb(nskb, sch, to_free);
483
484 if (err == NET_XMIT_SUCCESS) {
485 /* Compute the backlog adjustment that needs
486 * to be propagated in the qdisc tree to reflect
487 * all new skbs successfully enqueued.
488 */
489 ++cnt;
490 byte_len += nskb->len;
491 }
492 }
493 if (cnt > 1) {
494 /* The caller will add the original skb stats to its
495 * backlog, compensate this if any nskb is enqueued.
496 */
497 --cnt;
498 byte_len -= orig_len;
499 }
500 qdisc_tree_reduce_backlog(sch, -cnt, -byte_len);
501 consume_skb(skb);
502 return err;
503 }
504 return dualpi2_enqueue_skb(skb, sch, to_free);
505}
506
507/* Select the queue from which the next packet can be dequeued, ensuring that
508 * neither queue can starve the other with a WRR scheduler.
509 *
510 * The sign of the WRR credit determines the next queue, while the size of
511 * the dequeued packet determines the magnitude of the WRR credit change. If
512 * either queue is empty, the WRR credit is kept unchanged.
513 *
514 * As the dequeued packet can be dropped later, the caller has to perform the
515 * qdisc_bstats_update() calls.
516 */
517static struct sk_buff *dequeue_packet(struct Qdisc *sch,
518 struct dualpi2_sched_data *q,
519 int *credit_change,
520 u64 now)
521{
522 struct sk_buff *skb = NULL;
523 int c_len;
524
525 *credit_change = 0;
526 c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue);
527 if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) {
528 skb = __qdisc_dequeue_head(&q->l_queue->q);
529 WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue));
530 if (c_len)
531 *credit_change = q->c_protection_wc;
532 qdisc_qstats_backlog_dec(q->l_queue, skb);
533
534 /* Keep the global queue size consistent */
535 --sch->q.qlen;
536 q->memory_used -= skb->truesize;
537 } else if (c_len) {
538 skb = __qdisc_dequeue_head(&sch->q);
539 WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch));
540 if (qdisc_qlen(q->l_queue))
541 *credit_change = ~((s32)q->c_protection_wl) + 1;
542 q->memory_used -= skb->truesize;
543 } else {
544 dualpi2_reset_c_protection(q);
545 return NULL;
546 }
547 *credit_change *= qdisc_pkt_len(skb);
548 qdisc_qstats_backlog_dec(sch, skb);
549 return skb;
550}
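
/* Informal example of the WRR credit (default wc = 10, wl = 90): dequeuing a
 * 1500B packet from the L-queue adds 10 * 1500 = 15000 to the credit, while
 * dequeuing the same size from the C-queue subtracts 90 * 1500 = 135000 (the
 * ~((s32)wl) + 1 above is simply the two's-complement negation of wl). When
 * both queues stay backlogged, this guarantees roughly wc / (wc + wl), i.e.,
 * ~10% of the bytes, to the C-queue.
 */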
551
552static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb,
553 u64 now)
554{
555 u64 qdelay = 0;
556
557 if (q->step_in_packets)
558 qdelay = qdisc_qlen(q->l_queue);
559 else
560 qdelay = dualpi2_sojourn_time(skb, now);
561
562 if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) {
563 if (!dualpi2_skb_cb(skb)->ect) {
564 /* Drop this non-ECT packet */
565 return 1;
566 }
567
568 if (dualpi2_mark(q, skb))
569 ++q->step_marks;
570 }
571 qdisc_bstats_update(q->l_queue, skb);
572 return 0;
573}
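
/* Informal example: with the default time-based step_thresh of 1ms, an L4S
 * packet whose sojourn time exceeds 1ms is CE-marked immediately (counted in
 * step_marks), independently of the PI2 probability, while a non-ECT packet
 * that was steered into the L-queue is dropped instead. With a packet-based
 * threshold, the L-queue length is compared against the threshold rather
 * than the sojourn time.
 */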
574
575static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb,
576 struct Qdisc *sch, enum skb_drop_reason reason)
577{
578 ++q->deferred_drops_cnt;
579 q->deferred_drops_len += qdisc_pkt_len(skb);
580 kfree_skb_reason(skb, reason);
581 qdisc_qstats_drop(sch);
582}
583
584static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch)
585{
586 struct dualpi2_sched_data *q = qdisc_priv(sch);
587 struct sk_buff *skb;
588 int credit_change;
589 u64 now;
590
591 now = ktime_get_ns();
592
593 while ((skb = dequeue_packet(sch, q, &credit_change, now))) {
594 if (!q->drop_early && must_drop(sch, q, skb)) {
595 drop_and_retry(q, skb, sch,
596 SKB_DROP_REASON_QDISC_CONGESTED);
597 continue;
598 }
599
600 if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) {
601 qdisc_qstats_drop(q->l_queue);
602 drop_and_retry(q, skb, sch,
603 SKB_DROP_REASON_DUALPI2_STEP_DROP);
604 continue;
605 }
606
607 q->c_protection_credit += credit_change;
608 qdisc_bstats_update(sch, skb);
609 break;
610 }
611
612 if (q->deferred_drops_cnt) {
613 qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt,
614 q->deferred_drops_len);
615 q->deferred_drops_cnt = 0;
616 q->deferred_drops_len = 0;
617 }
618 return skb;
619}
620
621static s64 __scale_delta(u64 diff)
622{
623 do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
624 return diff;
625}
626
627static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
628 u64 *qdelay_l)
629{
630 u64 now, qc, ql;
631
632 now = ktime_get_ns();
633 qc = READ_ONCE(q->c_head_ts);
634 ql = READ_ONCE(q->l_head_ts);
635
636 *qdelay_c = qc ? now - qc : 0;
637 *qdelay_l = ql ? now - ql : 0;
638}
639
640static u32 calculate_probability(struct Qdisc *sch)
641{
642 struct dualpi2_sched_data *q = qdisc_priv(sch);
643 u32 new_prob;
644 u64 qdelay_c;
645 u64 qdelay_l;
646 u64 qdelay;
647 s64 delta;
648
649 get_queue_delays(q, &qdelay_c, &qdelay_l);
650 qdelay = max(qdelay_l, qdelay_c);
651
	/* Alpha and beta use at most 32b, i.e., the products below would
	 * overflow for queuing delay differences > ~4.2sec.
	 */
655 delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha;
656 delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta;
657 q->last_qdelay = qdelay;
658
659 /* Bound new_prob between 0 and MAX_PROB */
660 if (delta > 0) {
661 new_prob = __scale_delta(delta) + q->pi2_prob;
662 if (new_prob < q->pi2_prob)
663 new_prob = MAX_PROB;
664 } else {
665 new_prob = q->pi2_prob - __scale_delta(~delta + 1);
666 if (new_prob > q->pi2_prob)
667 new_prob = 0;
668 }
669
670 /* If we do not drop on overload, ensure we cap the L4S probability to
671 * 100% to keep window fairness when overflowing.
672 */
673 if (!q->drop_overload)
674 return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
675 return new_prob;
676}
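
/* Worked example (rough numbers, for illustration only): with the defaults
 * target = 15ms, alpha ~= 0.16Hz and beta ~= 3.2Hz, a measured qdelay of 20ms
 * following a previous qdelay of 18ms gives
 *
 *	delta ~= 0.16 * (20ms - 15ms) + 3.2 * (20ms - 18ms)
 *	      ~= 0.0008 + 0.0064 = 0.0072
 *
 * i.e., the internal probability rises by ~0.7% of MAX_PROB during that
 * tupdate interval. must_drop() then applies the squared probability to the
 * C-queue and the coupled probability to the L-queue.
 */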
677
678static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
679{
	/* Apply a rule of thumb, i.e., double the packet length, to further
	 * account for per-packet overhead in memory_limit.
	 */
683 u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));
684
685 if (upper_32_bits(memlim))
686 return U32_MAX;
687 else
688 return lower_32_bits(memlim);
689}
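
/* Informal example: with the default limit of 10000 packets and a psched MTU
 * of 1500B, the derived memory_limit is about 10000 * 2 * 1500 = 30MB of skb
 * truesize, the doubling being the rule of thumb mentioned above.
 */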
690
691static u32 convert_us_to_nsec(u32 us)
692{
693 u64 ns = mul_u32_u32(us, NSEC_PER_USEC);
694
695 if (upper_32_bits(ns))
696 return U32_MAX;
697
698 return lower_32_bits(ns);
699}
700
701static u32 convert_ns_to_usec(u64 ns)
702{
703 do_div(ns, NSEC_PER_USEC);
704 if (upper_32_bits(ns))
705 return U32_MAX;
706
707 return lower_32_bits(ns);
708}
709
710static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
711{
712 struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer);
713 struct Qdisc *sch = q->sch;
714 spinlock_t *root_lock; /* to lock qdisc for probability calculations */
715
716 rcu_read_lock();
717 root_lock = qdisc_lock(qdisc_root_sleeping(sch));
718 spin_lock(root_lock);
719
720 WRITE_ONCE(q->pi2_prob, calculate_probability(sch));
721 hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
722
723 spin_unlock(root_lock);
724 rcu_read_unlock();
725 return HRTIMER_RESTART;
726}
727
728static struct netlink_range_validation dualpi2_alpha_beta_range = {
729 .min = 1,
730 .max = ALPHA_BETA_MAX,
731};
732
733static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
734 [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
735 [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
736 [TCA_DUALPI2_TARGET] = { .type = NLA_U32 },
737 [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1),
738 [TCA_DUALPI2_ALPHA] =
739 NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
740 [TCA_DUALPI2_BETA] =
741 NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
742 [TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 },
743 [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 },
744 [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 },
745 [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1),
746 [TCA_DUALPI2_DROP_OVERLOAD] =
747 NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX),
748 [TCA_DUALPI2_DROP_EARLY] =
749 NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX),
750 [TCA_DUALPI2_C_PROTECTION] =
751 NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC),
752 [TCA_DUALPI2_ECN_MASK] =
753 NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT,
754 TCA_DUALPI2_ECN_MASK_MAX),
755 [TCA_DUALPI2_SPLIT_GSO] =
756 NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX),
757};
758
759static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
760 struct netlink_ext_ack *extack)
761{
762 struct nlattr *tb[TCA_DUALPI2_MAX + 1];
763 struct dualpi2_sched_data *q;
764 int old_backlog;
765 int old_qlen;
766 int err;
767
768 if (!opt || !nla_len(opt)) {
769 NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required");
770 return -EINVAL;
771 }
772 err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
773 extack);
774 if (err < 0)
775 return err;
776 if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) {
777 NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes");
778 return -EINVAL;
779 }
780
781 q = qdisc_priv(sch);
782 sch_tree_lock(sch);
783
784 if (tb[TCA_DUALPI2_LIMIT]) {
785 u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
786
787 WRITE_ONCE(sch->limit, limit);
788 WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
789 }
790
791 if (tb[TCA_DUALPI2_MEMORY_LIMIT])
792 WRITE_ONCE(q->memory_limit,
793 nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
794
795 if (tb[TCA_DUALPI2_TARGET]) {
796 u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
797
798 WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
799 }
800
801 if (tb[TCA_DUALPI2_TUPDATE]) {
802 u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
803
804 WRITE_ONCE(q->pi2_tupdate, convert_us_to_nsec(tupdate));
805 }
806
807 if (tb[TCA_DUALPI2_ALPHA]) {
808 u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
809
810 WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
811 }
812
813 if (tb[TCA_DUALPI2_BETA]) {
814 u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
815
816 WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
817 }
818
819 if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) {
820 u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]);
821
822 WRITE_ONCE(q->step_in_packets, true);
823 WRITE_ONCE(q->step_thresh, step_th);
824 } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) {
825 u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]);
826
827 WRITE_ONCE(q->step_in_packets, false);
828 WRITE_ONCE(q->step_thresh, convert_us_to_nsec(step_th));
829 }
830
831 if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
832 WRITE_ONCE(q->min_qlen_step,
833 nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
834
835 if (tb[TCA_DUALPI2_COUPLING]) {
836 u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
837
838 WRITE_ONCE(q->coupling_factor, coupling);
839 }
840
841 if (tb[TCA_DUALPI2_DROP_OVERLOAD]) {
842 u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]);
843
844 WRITE_ONCE(q->drop_overload, (bool)drop_overload);
845 }
846
847 if (tb[TCA_DUALPI2_DROP_EARLY]) {
848 u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]);
849
850 WRITE_ONCE(q->drop_early, (bool)drop_early);
851 }
852
853 if (tb[TCA_DUALPI2_C_PROTECTION]) {
854 u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
855
856 dualpi2_calculate_c_protection(sch, q, wc);
857 }
858
859 if (tb[TCA_DUALPI2_ECN_MASK]) {
860 u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]);
861
862 WRITE_ONCE(q->ecn_mask, ecn_mask);
863 }
864
865 if (tb[TCA_DUALPI2_SPLIT_GSO]) {
866 u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]);
867
868 WRITE_ONCE(q->split_gso, (bool)split_gso);
869 }
870
871 old_qlen = qdisc_qlen(sch);
872 old_backlog = sch->qstats.backlog;
873 while (qdisc_qlen(sch) > sch->limit ||
874 q->memory_used > q->memory_limit) {
875 struct sk_buff *skb = qdisc_dequeue_internal(sch, true);
876
877 q->memory_used -= skb->truesize;
878 qdisc_qstats_backlog_dec(sch, skb);
879 rtnl_qdisc_drop(skb, sch);
880 }
881 qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
882 old_backlog - sch->qstats.backlog);
883
884 sch_tree_unlock(sch);
885 return 0;
886}
887
888/* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. */
889static void dualpi2_reset_default(struct Qdisc *sch)
890{
891 struct dualpi2_sched_data *q = qdisc_priv(sch);
892
893 q->sch->limit = 10000; /* Max 125ms at 1Gbps */
894 q->memory_limit = get_memory_limit(sch, q->sch->limit);
895
896 q->pi2_target = 15 * NSEC_PER_MSEC;
897 q->pi2_tupdate = 16 * NSEC_PER_MSEC;
898 q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */
899 q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */
900
901 q->step_thresh = 1 * NSEC_PER_MSEC;
902 q->step_in_packets = false;
903
904 dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */
905
906 q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */
907 q->min_qlen_step = 0; /* Always apply step mark in L-queue */
908 q->coupling_factor = 2; /* window fairness for equal RTTs */
909 q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */
910 q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */
911 q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */
912}
913
914static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt,
915 struct netlink_ext_ack *extack)
916{
917 struct dualpi2_sched_data *q = qdisc_priv(sch);
918 int err;
919
920 q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
921 TC_H_MAKE(sch->handle, 1), extack);
922 if (!q->l_queue)
923 return -ENOMEM;
924
925 err = tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack);
926 if (err)
927 return err;
928
929 q->sch = sch;
930 dualpi2_reset_default(sch);
931 hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC,
932 HRTIMER_MODE_ABS_PINNED_SOFT);
933
934 if (opt && nla_len(opt)) {
935 err = dualpi2_change(sch, opt, extack);
936
937 if (err)
938 return err;
939 }
940
941 hrtimer_start(&q->pi2_timer, next_pi2_timeout(q),
942 HRTIMER_MODE_ABS_PINNED_SOFT);
943 return 0;
944}
945
946static int dualpi2_dump(struct Qdisc *sch, struct sk_buff *skb)
947{
948 struct dualpi2_sched_data *q = qdisc_priv(sch);
949 struct nlattr *opts;
950 bool step_in_pkts;
951 u32 step_th;
952
953 step_in_pkts = READ_ONCE(q->step_in_packets);
954 step_th = READ_ONCE(q->step_thresh);
955
956 opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
957 if (!opts)
958 goto nla_put_failure;
959
960 if (step_in_pkts &&
961 (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
962 nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
963 READ_ONCE(q->memory_limit)) ||
964 nla_put_u32(skb, TCA_DUALPI2_TARGET,
965 convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
966 nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
967 convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
968 nla_put_u32(skb, TCA_DUALPI2_ALPHA,
969 dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
970 nla_put_u32(skb, TCA_DUALPI2_BETA,
971 dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
972 nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_PKTS, step_th) ||
973 nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
974 READ_ONCE(q->min_qlen_step)) ||
975 nla_put_u8(skb, TCA_DUALPI2_COUPLING,
976 READ_ONCE(q->coupling_factor)) ||
977 nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
978 READ_ONCE(q->drop_overload)) ||
979 nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
980 READ_ONCE(q->drop_early)) ||
981 nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
982 READ_ONCE(q->c_protection_wc)) ||
983 nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
984 nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
985 goto nla_put_failure;
986
987 if (!step_in_pkts &&
988 (nla_put_u32(skb, TCA_DUALPI2_LIMIT, READ_ONCE(sch->limit)) ||
989 nla_put_u32(skb, TCA_DUALPI2_MEMORY_LIMIT,
990 READ_ONCE(q->memory_limit)) ||
991 nla_put_u32(skb, TCA_DUALPI2_TARGET,
992 convert_ns_to_usec(READ_ONCE(q->pi2_target))) ||
993 nla_put_u32(skb, TCA_DUALPI2_TUPDATE,
994 convert_ns_to_usec(READ_ONCE(q->pi2_tupdate))) ||
995 nla_put_u32(skb, TCA_DUALPI2_ALPHA,
996 dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_alpha))) ||
997 nla_put_u32(skb, TCA_DUALPI2_BETA,
998 dualpi2_unscale_alpha_beta(READ_ONCE(q->pi2_beta))) ||
999 nla_put_u32(skb, TCA_DUALPI2_STEP_THRESH_US,
1000 convert_ns_to_usec(step_th)) ||
1001 nla_put_u32(skb, TCA_DUALPI2_MIN_QLEN_STEP,
1002 READ_ONCE(q->min_qlen_step)) ||
1003 nla_put_u8(skb, TCA_DUALPI2_COUPLING,
1004 READ_ONCE(q->coupling_factor)) ||
1005 nla_put_u8(skb, TCA_DUALPI2_DROP_OVERLOAD,
1006 READ_ONCE(q->drop_overload)) ||
1007 nla_put_u8(skb, TCA_DUALPI2_DROP_EARLY,
1008 READ_ONCE(q->drop_early)) ||
1009 nla_put_u8(skb, TCA_DUALPI2_C_PROTECTION,
1010 READ_ONCE(q->c_protection_wc)) ||
1011 nla_put_u8(skb, TCA_DUALPI2_ECN_MASK, READ_ONCE(q->ecn_mask)) ||
1012 nla_put_u8(skb, TCA_DUALPI2_SPLIT_GSO, READ_ONCE(q->split_gso))))
1013 goto nla_put_failure;
1014
1015 return nla_nest_end(skb, opts);
1016
1017nla_put_failure:
1018 nla_nest_cancel(skb, opts);
1019 return -1;
1020}
1021
1022static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1023{
1024 struct dualpi2_sched_data *q = qdisc_priv(sch);
1025 struct tc_dualpi2_xstats st = {
1026 .prob = READ_ONCE(q->pi2_prob),
1027 .packets_in_c = q->packets_in_c,
1028 .packets_in_l = q->packets_in_l,
1029 .maxq = q->maxq,
1030 .ecn_mark = q->ecn_mark,
1031 .credit = q->c_protection_credit,
1032 .step_marks = q->step_marks,
1033 .memory_used = q->memory_used,
1034 .max_memory_used = q->max_memory_used,
1035 .memory_limit = q->memory_limit,
1036 };
1037 u64 qc, ql;
1038
1039 get_queue_delays(q, &qc, &ql);
1040 st.delay_l = convert_ns_to_usec(ql);
1041 st.delay_c = convert_ns_to_usec(qc);
1042 return gnet_stats_copy_app(d, &st, sizeof(st));
1043}
1044
1045/* Reset both L-queue and C-queue, internal packet counters, PI probability,
1046 * C-queue protection credit, and timestamps, while preserving current
1047 * configuration of DUALPI2.
1048 */
1049static void dualpi2_reset(struct Qdisc *sch)
1050{
1051 struct dualpi2_sched_data *q = qdisc_priv(sch);
1052
1053 qdisc_reset_queue(sch);
1054 qdisc_reset_queue(q->l_queue);
1055 q->c_head_ts = 0;
1056 q->l_head_ts = 0;
1057 q->pi2_prob = 0;
1058 q->packets_in_c = 0;
1059 q->packets_in_l = 0;
1060 q->maxq = 0;
1061 q->ecn_mark = 0;
1062 q->step_marks = 0;
1063 q->memory_used = 0;
1064 q->max_memory_used = 0;
1065 dualpi2_reset_c_protection(q);
1066}
1067
1068static void dualpi2_destroy(struct Qdisc *sch)
1069{
1070 struct dualpi2_sched_data *q = qdisc_priv(sch);
1071
1072 q->pi2_tupdate = 0;
1073 hrtimer_cancel(&q->pi2_timer);
1074 if (q->l_queue)
1075 qdisc_put(q->l_queue);
1076 tcf_block_put(q->tcf_block);
1077}
1078
1079static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg)
1080{
1081 return NULL;
1082}
1083
1084static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid)
1085{
1086 return 0;
1087}
1088
1089static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent,
1090 u32 classid)
1091{
1092 return 0;
1093}
1094
1095static void dualpi2_unbind(struct Qdisc *q, unsigned long cl)
1096{
1097}
1098
1099static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, unsigned long cl,
1100 struct netlink_ext_ack *extack)
1101{
1102 struct dualpi2_sched_data *q = qdisc_priv(sch);
1103
1104 if (cl)
1105 return NULL;
1106 return q->tcf_block;
1107}
1108
1109static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1110{
1111 unsigned int i;
1112
1113 if (arg->stop)
1114 return;
1115
1116 /* We statically define only 2 queues */
1117 for (i = 0; i < 2; i++) {
1118 if (arg->count < arg->skip) {
1119 arg->count++;
1120 continue;
1121 }
1122 if (arg->fn(sch, i + 1, arg) < 0) {
1123 arg->stop = 1;
1124 break;
1125 }
1126 arg->count++;
1127 }
1128}
1129
1130/* Minimal class support to handle tc filters */
1131static const struct Qdisc_class_ops dualpi2_class_ops = {
1132 .leaf = dualpi2_leaf,
1133 .find = dualpi2_find,
1134 .tcf_block = dualpi2_tcf_block,
1135 .bind_tcf = dualpi2_bind,
1136 .unbind_tcf = dualpi2_unbind,
1137 .walk = dualpi2_walk,
1138};
1139
1140static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = {
1141 .id = "dualpi2",
1142 .cl_ops = &dualpi2_class_ops,
1143 .priv_size = sizeof(struct dualpi2_sched_data),
1144 .enqueue = dualpi2_qdisc_enqueue,
1145 .dequeue = dualpi2_qdisc_dequeue,
1146 .peek = qdisc_peek_dequeued,
1147 .init = dualpi2_init,
1148 .destroy = dualpi2_destroy,
1149 .reset = dualpi2_reset,
1150 .change = dualpi2_change,
1151 .dump = dualpi2_dump,
1152 .dump_stats = dualpi2_dump_stats,
1153 .owner = THIS_MODULE,
1154};
1155
1156static int __init dualpi2_module_init(void)
1157{
1158 return register_qdisc(&dualpi2_qdisc_ops);
1159}
1160
1161static void __exit dualpi2_module_exit(void)
1162{
1163 unregister_qdisc(&dualpi2_qdisc_ops);
1164}
1165
1166module_init(dualpi2_module_init);
1167module_exit(dualpi2_module_exit);
1168
1169MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler");
1170MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>");
1171MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>");
1172MODULE_AUTHOR("Olga Albisser <olga@albisser.org>");
1173MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>");
1174MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>");
1175
1176MODULE_LICENSE("Dual BSD/GPL");
1177MODULE_VERSION("1.0");