net/sched/sch_cake.c at master · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / net / sched / sch_cake.c
at master 83 kB view raw
   1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2
   3/* COMMON Applications Kept Enhanced (CAKE) discipline
   4 *
   5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
   6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
   7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
   8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
   9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
  10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
  11 *
  12 * The CAKE Principles:
  13 *		   (or, how to have your cake and eat it too)
  14 *
  15 * This is a combination of several shaping, AQM and FQ techniques into one
  16 * easy-to-use package:
  17 *
  18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
  19 *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
  20 *   eliminating the need for any sort of burst parameter (eg. token bucket
  21 *   depth).  Burst support is limited to that necessary to overcome scheduling
  22 *   latency.
  23 *
  24 * - A Diffserv-aware priority queue, giving more priority to certain classes,
  25 *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
  26 *   the priority is reduced to avoid starving other tins.
  27 *
  28 * - Each priority tin has a separate Flow Queue system, to isolate traffic
  29 *   flows from each other.  This prevents a burst on one flow from increasing
  30 *   the delay to another.  Flows are distributed to queues using a
  31 *   set-associative hash function.
  32 *
  33 * - Each queue is actively managed by Cobalt, which is a combination of the
  34 *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
  35 *   congestion early via ECN (if available) and/or packet drops, to keep
  36 *   latency low.  The codel parameters are auto-tuned based on the bandwidth
  37 *   setting, as is necessary at low bandwidths.
  38 *
  39 * The configuration parameters are kept deliberately simple for ease of use.
  40 * Everything has sane defaults.  Complete generality of configuration is *not*
  41 * a goal.
  42 *
  43 * The priority queue operates according to a weighted DRR scheme, combined with
  44 * a bandwidth tracker which reuses the shaper logic to detect which side of the
  45 * bandwidth sharing threshold the tin is operating.  This determines whether a
  46 * priority-based weight (high) or a bandwidth-based weight (low) is used for
  47 * that tin in the current pass.
  48 *
  49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
  50 * granted us permission to leverage.
  51 */
  52
  53#include <linux/module.h>
  54#include <linux/types.h>
  55#include <linux/kernel.h>
  56#include <linux/jiffies.h>
  57#include <linux/string.h>
  58#include <linux/in.h>
  59#include <linux/errno.h>
  60#include <linux/init.h>
  61#include <linux/skbuff.h>
  62#include <linux/jhash.h>
  63#include <linux/slab.h>
  64#include <linux/vmalloc.h>
  65#include <linux/reciprocal_div.h>
  66#include <net/netlink.h>
  67#include <linux/if_vlan.h>
  68#include <net/gso.h>
  69#include <net/pkt_sched.h>
  70#include <net/pkt_cls.h>
  71#include <net/tcp.h>
  72#include <net/flow_dissector.h>
  73
  74#if IS_ENABLED(CONFIG_NF_CONNTRACK)
  75#include <net/netfilter/nf_conntrack_core.h>
  76#endif
  77
  78#define CAKE_SET_WAYS (8)
  79#define CAKE_MAX_TINS (8)
  80#define CAKE_QUEUES (1024)
  81#define CAKE_FLOW_MASK 63
  82#define CAKE_FLOW_NAT_FLAG 64
  83
  84/* struct cobalt_params - contains codel and blue parameters
  85 * @interval:	codel initial drop rate
  86 * @target:     maximum persistent sojourn time & blue update rate
  87 * @mtu_time:   serialisation delay of maximum-size packet
  88 * @p_inc:      increment of blue drop probability (0.32 fxp)
  89 * @p_dec:      decrement of blue drop probability (0.32 fxp)
  90 */
  91struct cobalt_params {
  92	u64	interval;
  93	u64	target;
  94	u64	mtu_time;
  95	u32	p_inc;
  96	u32	p_dec;
  97};
  98
  99/* struct cobalt_vars - contains codel and blue variables
 100 * @count:		codel dropping frequency
 101 * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
 102 * @drop_next:		time to drop next packet, or when we dropped last
 103 * @blue_timer:		Blue time to next drop
 104 * @p_drop:		BLUE drop probability (0.32 fxp)
 105 * @dropping:		set if in dropping state
 106 * @ecn_marked:		set if marked
 107 */
 108struct cobalt_vars {
 109	u32	count;
 110	u32	rec_inv_sqrt;
 111	ktime_t	drop_next;
 112	ktime_t	blue_timer;
 113	u32     p_drop;
 114	bool	dropping;
 115	bool    ecn_marked;
 116};
 117
 118enum {
 119	CAKE_SET_NONE = 0,
 120	CAKE_SET_SPARSE,
 121	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
 122	CAKE_SET_BULK,
 123	CAKE_SET_DECAYING
 124};
 125
 126struct cake_flow {
 127	/* this stuff is all needed per-flow at dequeue time */
 128	struct sk_buff	  *head;
 129	struct sk_buff	  *tail;
 130	struct list_head  flowchain;
 131	s32		  deficit;
 132	u32		  dropped;
 133	struct cobalt_vars cvars;
 134	u16		  srchost; /* index into cake_host table */
 135	u16		  dsthost;
 136	u8		  set;
 137}; /* please try to keep this structure <= 64 bytes */
 138
 139struct cake_host {
 140	u32 srchost_tag;
 141	u32 dsthost_tag;
 142	u16 srchost_bulk_flow_count;
 143	u16 dsthost_bulk_flow_count;
 144};
 145
 146struct cake_heap_entry {
 147	u16 t:3, b:10;
 148};
 149
 150struct cake_tin_data {
 151	struct cake_flow flows[CAKE_QUEUES];
 152	u32	backlogs[CAKE_QUEUES];
 153	u32	tags[CAKE_QUEUES]; /* for set association */
 154	u16	overflow_idx[CAKE_QUEUES];
 155	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
 156	u16	flow_quantum;
 157
 158	struct cobalt_params cparams;
 159	u32	drop_overlimit;
 160	u16	bulk_flow_count;
 161	u16	sparse_flow_count;
 162	u16	decaying_flow_count;
 163	u16	unresponsive_flow_count;
 164
 165	u32	max_skblen;
 166
 167	struct list_head new_flows;
 168	struct list_head old_flows;
 169	struct list_head decaying_flows;
 170
 171	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 172	ktime_t	time_next_packet;
 173	u64	tin_rate_ns;
 174	u64	tin_rate_bps;
 175	u16	tin_rate_shft;
 176
 177	u16	tin_quantum;
 178	s32	tin_deficit;
 179	u32	tin_backlog;
 180	u32	tin_dropped;
 181	u32	tin_ecn_mark;
 182
 183	u32	packets;
 184	u64	bytes;
 185
 186	u32	ack_drops;
 187
 188	/* moving averages */
 189	u64 avge_delay;
 190	u64 peak_delay;
 191	u64 base_delay;
 192
 193	/* hash function stats */
 194	u32	way_directs;
 195	u32	way_hits;
 196	u32	way_misses;
 197	u32	way_collisions;
 198}; /* number of tins is small, so size of this struct doesn't matter much */
 199
 200struct cake_sched_data {
 201	struct tcf_proto __rcu *filter_list; /* optional external classifier */
 202	struct tcf_block *block;
 203	struct cake_tin_data *tins;
 204
 205	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
 206	u16		overflow_timeout;
 207
 208	u16		tin_cnt;
 209	u8		tin_mode;
 210	u8		flow_mode;
 211	u8		ack_filter;
 212	u8		atm_mode;
 213
 214	u32		fwmark_mask;
 215	u16		fwmark_shft;
 216
 217	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
 218	u16		rate_shft;
 219	ktime_t		time_next_packet;
 220	ktime_t		failsafe_next_packet;
 221	u64		rate_ns;
 222	u64		rate_bps;
 223	u16		rate_flags;
 224	s16		rate_overhead;
 225	u16		rate_mpu;
 226	u64		interval;
 227	u64		target;
 228
 229	/* resource tracking */
 230	u32		buffer_used;
 231	u32		buffer_max_used;
 232	u32		buffer_limit;
 233	u32		buffer_config_limit;
 234
 235	/* indices for dequeue */
 236	u16		cur_tin;
 237	u16		cur_flow;
 238
 239	struct qdisc_watchdog watchdog;
 240	const u8	*tin_index;
 241	const u8	*tin_order;
 242
 243	/* bandwidth capacity estimate */
 244	ktime_t		last_packet_time;
 245	ktime_t		avg_window_begin;
 246	u64		avg_packet_interval;
 247	u64		avg_window_bytes;
 248	u64		avg_peak_bandwidth;
 249	ktime_t		last_reconfig_time;
 250
 251	/* packet length stats */
 252	u32		avg_netoff;
 253	u16		max_netlen;
 254	u16		max_adjlen;
 255	u16		min_netlen;
 256	u16		min_adjlen;
 257};
 258
 259enum {
 260	CAKE_FLAG_OVERHEAD	   = BIT(0),
 261	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
 262	CAKE_FLAG_INGRESS	   = BIT(2),
 263	CAKE_FLAG_WASH		   = BIT(3),
 264	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
 265};
 266
 267/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 268 * obtain the best features of each.  Codel is excellent on flows which
 269 * respond to congestion signals in a TCP-like way.  BLUE is more effective on
 270 * unresponsive flows.
 271 */
 272
 273struct cobalt_skb_cb {
 274	ktime_t enqueue_time;
 275	u32     adjusted_len;
 276};
 277
 278static u64 us_to_ns(u64 us)
 279{
 280	return us * NSEC_PER_USEC;
 281}
 282
 283static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
 284{
 285	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
 286	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
 287}
 288
 289static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
 290{
 291	return get_cobalt_cb(skb)->enqueue_time;
 292}
 293
 294static void cobalt_set_enqueue_time(struct sk_buff *skb,
 295				    ktime_t now)
 296{
 297	get_cobalt_cb(skb)->enqueue_time = now;
 298}
 299
 300static u16 quantum_div[CAKE_QUEUES + 1] = {0};
 301
 302/* Diffserv lookup tables */
 303
 304static const u8 precedence[] = {
 305	0, 0, 0, 0, 0, 0, 0, 0,
 306	1, 1, 1, 1, 1, 1, 1, 1,
 307	2, 2, 2, 2, 2, 2, 2, 2,
 308	3, 3, 3, 3, 3, 3, 3, 3,
 309	4, 4, 4, 4, 4, 4, 4, 4,
 310	5, 5, 5, 5, 5, 5, 5, 5,
 311	6, 6, 6, 6, 6, 6, 6, 6,
 312	7, 7, 7, 7, 7, 7, 7, 7,
 313};
 314
 315static const u8 diffserv8[] = {
 316	2, 0, 1, 2, 4, 2, 2, 2,
 317	1, 2, 1, 2, 1, 2, 1, 2,
 318	5, 2, 4, 2, 4, 2, 4, 2,
 319	3, 2, 3, 2, 3, 2, 3, 2,
 320	6, 2, 3, 2, 3, 2, 3, 2,
 321	6, 2, 2, 2, 6, 2, 6, 2,
 322	7, 2, 2, 2, 2, 2, 2, 2,
 323	7, 2, 2, 2, 2, 2, 2, 2,
 324};
 325
 326static const u8 diffserv4[] = {
 327	0, 1, 0, 0, 2, 0, 0, 0,
 328	1, 0, 0, 0, 0, 0, 0, 0,
 329	2, 0, 2, 0, 2, 0, 2, 0,
 330	2, 0, 2, 0, 2, 0, 2, 0,
 331	3, 0, 2, 0, 2, 0, 2, 0,
 332	3, 0, 0, 0, 3, 0, 3, 0,
 333	3, 0, 0, 0, 0, 0, 0, 0,
 334	3, 0, 0, 0, 0, 0, 0, 0,
 335};
 336
 337static const u8 diffserv3[] = {
 338	0, 1, 0, 0, 2, 0, 0, 0,
 339	1, 0, 0, 0, 0, 0, 0, 0,
 340	0, 0, 0, 0, 0, 0, 0, 0,
 341	0, 0, 0, 0, 0, 0, 0, 0,
 342	0, 0, 0, 0, 0, 0, 0, 0,
 343	0, 0, 0, 0, 2, 0, 2, 0,
 344	2, 0, 0, 0, 0, 0, 0, 0,
 345	2, 0, 0, 0, 0, 0, 0, 0,
 346};
 347
 348static const u8 besteffort[] = {
 349	0, 0, 0, 0, 0, 0, 0, 0,
 350	0, 0, 0, 0, 0, 0, 0, 0,
 351	0, 0, 0, 0, 0, 0, 0, 0,
 352	0, 0, 0, 0, 0, 0, 0, 0,
 353	0, 0, 0, 0, 0, 0, 0, 0,
 354	0, 0, 0, 0, 0, 0, 0, 0,
 355	0, 0, 0, 0, 0, 0, 0, 0,
 356	0, 0, 0, 0, 0, 0, 0, 0,
 357};
 358
 359/* tin priority order for stats dumping */
 360
 361static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
 362static const u8 bulk_order[] = {1, 0, 2, 3};
 363
 364/* There is a big difference in timing between the accurate values placed in the
 365 * cache and the approximations given by a single Newton step for small count
 366 * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
 367 * these values are calculated using eight Newton steps, using the
 368 * implementation below. Above 16, a single Newton step gives sufficient
 369 * accuracy in either direction, given the precision stored.
 370 *
 371 * The magnitude of the error when stepping up to count 2 is such as to give the
 372 * value that *should* have been produced at count 4.
 373 */
 374
 375#define REC_INV_SQRT_CACHE (16)
 376static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
 377		~0,         ~0, 3037000500, 2479700525,
 378	2147483647, 1920767767, 1753413056, 1623345051,
 379	1518500250, 1431655765, 1358187914, 1294981364,
 380	1239850263, 1191209601, 1147878294, 1108955788
 381};
 382
 383/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 384 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 385 *
 386 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 387 */
 388
 389static void cobalt_newton_step(struct cobalt_vars *vars)
 390{
 391	u32 invsqrt, invsqrt2;
 392	u64 val;
 393
 394	invsqrt = vars->rec_inv_sqrt;
 395	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
 396	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
 397
 398	val >>= 2; /* avoid overflow in following multiply */
 399	val = (val * invsqrt) >> (32 - 2 + 1);
 400
 401	vars->rec_inv_sqrt = val;
 402}
 403
 404static void cobalt_invsqrt(struct cobalt_vars *vars)
 405{
 406	if (vars->count < REC_INV_SQRT_CACHE)
 407		vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
 408	else
 409		cobalt_newton_step(vars);
 410}
 411
 412static void cobalt_vars_init(struct cobalt_vars *vars)
 413{
 414	memset(vars, 0, sizeof(*vars));
 415}
 416
 417/* CoDel control_law is t + interval/sqrt(count)
 418 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 419 * both sqrt() and divide operation.
 420 */
 421static ktime_t cobalt_control(ktime_t t,
 422			      u64 interval,
 423			      u32 rec_inv_sqrt)
 424{
 425	return ktime_add_ns(t, reciprocal_scale(interval,
 426						rec_inv_sqrt));
 427}
 428
 429/* Call this when a packet had to be dropped due to queue overflow.  Returns
 430 * true if the BLUE state was quiescent before but active after this call.
 431 */
 432static bool cobalt_queue_full(struct cobalt_vars *vars,
 433			      struct cobalt_params *p,
 434			      ktime_t now)
 435{
 436	bool up = false;
 437
 438	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 439		up = !vars->p_drop;
 440		vars->p_drop += p->p_inc;
 441		if (vars->p_drop < p->p_inc)
 442			vars->p_drop = ~0;
 443		vars->blue_timer = now;
 444	}
 445	vars->dropping = true;
 446	vars->drop_next = now;
 447	if (!vars->count)
 448		vars->count = 1;
 449
 450	return up;
 451}
 452
 453/* Call this when the queue was serviced but turned out to be empty.  Returns
 454 * true if the BLUE state was active before but quiescent after this call.
 455 */
 456static bool cobalt_queue_empty(struct cobalt_vars *vars,
 457			       struct cobalt_params *p,
 458			       ktime_t now)
 459{
 460	bool down = false;
 461
 462	if (vars->p_drop &&
 463	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
 464		if (vars->p_drop < p->p_dec)
 465			vars->p_drop = 0;
 466		else
 467			vars->p_drop -= p->p_dec;
 468		vars->blue_timer = now;
 469		down = !vars->p_drop;
 470	}
 471	vars->dropping = false;
 472
 473	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
 474		vars->count--;
 475		cobalt_invsqrt(vars);
 476		vars->drop_next = cobalt_control(vars->drop_next,
 477						 p->interval,
 478						 vars->rec_inv_sqrt);
 479	}
 480
 481	return down;
 482}
 483
 484/* Call this with a freshly dequeued packet for possible congestion marking.
 485 * Returns true as an instruction to drop the packet, false for delivery.
 486 */
 487static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
 488					       struct cobalt_params *p,
 489					       ktime_t now,
 490					       struct sk_buff *skb,
 491					       u32 bulk_flows)
 492{
 493	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
 494	bool next_due, over_target;
 495	ktime_t schedule;
 496	u64 sojourn;
 497
 498/* The 'schedule' variable records, in its sign, whether 'now' is before or
 499 * after 'drop_next'.  This allows 'drop_next' to be updated before the next
 500 * scheduling decision is actually branched, without destroying that
 501 * information.  Similarly, the first 'schedule' value calculated is preserved
 502 * in the boolean 'next_due'.
 503 *
 504 * As for 'drop_next', we take advantage of the fact that 'interval' is both
 505 * the delay between first exceeding 'target' and the first signalling event,
 506 * *and* the scaling factor for the signalling frequency.  It's therefore very
 507 * natural to use a single mechanism for both purposes, and eliminates a
 508 * significant amount of reference Codel's spaghetti code.  To help with this,
 509 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
 510 * as possible to 1.0 in fixed-point.
 511 */
 512
 513	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
 514	schedule = ktime_sub(now, vars->drop_next);
 515	over_target = sojourn > p->target &&
 516		      sojourn > p->mtu_time * bulk_flows * 2 &&
 517		      sojourn > p->mtu_time * 4;
 518	next_due = vars->count && ktime_to_ns(schedule) >= 0;
 519
 520	vars->ecn_marked = false;
 521
 522	if (over_target) {
 523		if (!vars->dropping) {
 524			vars->dropping = true;
 525			vars->drop_next = cobalt_control(now,
 526							 p->interval,
 527							 vars->rec_inv_sqrt);
 528		}
 529		if (!vars->count)
 530			vars->count = 1;
 531	} else if (vars->dropping) {
 532		vars->dropping = false;
 533	}
 534
 535	if (next_due && vars->dropping) {
 536		/* Use ECN mark if possible, otherwise drop */
 537		if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
 538			reason = SKB_DROP_REASON_QDISC_CONGESTED;
 539
 540		vars->count++;
 541		if (!vars->count)
 542			vars->count--;
 543		cobalt_invsqrt(vars);
 544		vars->drop_next = cobalt_control(vars->drop_next,
 545						 p->interval,
 546						 vars->rec_inv_sqrt);
 547		schedule = ktime_sub(now, vars->drop_next);
 548	} else {
 549		while (next_due) {
 550			vars->count--;
 551			cobalt_invsqrt(vars);
 552			vars->drop_next = cobalt_control(vars->drop_next,
 553							 p->interval,
 554							 vars->rec_inv_sqrt);
 555			schedule = ktime_sub(now, vars->drop_next);
 556			next_due = vars->count && ktime_to_ns(schedule) >= 0;
 557		}
 558	}
 559
 560	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
 561	if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
 562	    get_random_u32() < vars->p_drop)
 563		reason = SKB_DROP_REASON_CAKE_FLOOD;
 564
 565	/* Overload the drop_next field as an activity timeout */
 566	if (!vars->count)
 567		vars->drop_next = ktime_add_ns(now, p->interval);
 568	else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
 569		vars->drop_next = now;
 570
 571	return reason;
 572}
 573
 574static bool cake_update_flowkeys(struct flow_keys *keys,
 575				 const struct sk_buff *skb)
 576{
 577#if IS_ENABLED(CONFIG_NF_CONNTRACK)
 578	struct nf_conntrack_tuple tuple = {};
 579	bool rev = !skb->_nfct, upd = false;
 580	__be32 ip;
 581
 582	if (skb_protocol(skb, true) != htons(ETH_P_IP))
 583		return false;
 584
 585	if (!nf_ct_get_tuple_skb(&tuple, skb))
 586		return false;
 587
 588	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
 589	if (ip != keys->addrs.v4addrs.src) {
 590		keys->addrs.v4addrs.src = ip;
 591		upd = true;
 592	}
 593	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
 594	if (ip != keys->addrs.v4addrs.dst) {
 595		keys->addrs.v4addrs.dst = ip;
 596		upd = true;
 597	}
 598
 599	if (keys->ports.ports) {
 600		__be16 port;
 601
 602		port = rev ? tuple.dst.u.all : tuple.src.u.all;
 603		if (port != keys->ports.src) {
 604			keys->ports.src = port;
 605			upd = true;
 606		}
 607		port = rev ? tuple.src.u.all : tuple.dst.u.all;
 608		if (port != keys->ports.dst) {
 609			port = keys->ports.dst;
 610			upd = true;
 611		}
 612	}
 613	return upd;
 614#else
 615	return false;
 616#endif
 617}
 618
 619/* Cake has several subtle multiple bit settings. In these cases you
 620 *  would be matching triple isolate mode as well.
 621 */
 622
 623static bool cake_dsrc(int flow_mode)
 624{
 625	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
 626}
 627
 628static bool cake_ddst(int flow_mode)
 629{
 630	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
 631}
 632
 633static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
 634					     struct cake_flow *flow,
 635					     int flow_mode)
 636{
 637	if (likely(cake_dsrc(flow_mode) &&
 638		   q->hosts[flow->srchost].srchost_bulk_flow_count))
 639		q->hosts[flow->srchost].srchost_bulk_flow_count--;
 640}
 641
 642static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
 643					     struct cake_flow *flow,
 644					     int flow_mode)
 645{
 646	if (likely(cake_dsrc(flow_mode) &&
 647		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
 648		q->hosts[flow->srchost].srchost_bulk_flow_count++;
 649}
 650
 651static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
 652					     struct cake_flow *flow,
 653					     int flow_mode)
 654{
 655	if (likely(cake_ddst(flow_mode) &&
 656		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
 657		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
 658}
 659
 660static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
 661					     struct cake_flow *flow,
 662					     int flow_mode)
 663{
 664	if (likely(cake_ddst(flow_mode) &&
 665		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
 666		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
 667}
 668
 669static u16 cake_get_flow_quantum(struct cake_tin_data *q,
 670				 struct cake_flow *flow,
 671				 int flow_mode)
 672{
 673	u16 host_load = 1;
 674
 675	if (cake_dsrc(flow_mode))
 676		host_load = max(host_load,
 677				q->hosts[flow->srchost].srchost_bulk_flow_count);
 678
 679	if (cake_ddst(flow_mode))
 680		host_load = max(host_load,
 681				q->hosts[flow->dsthost].dsthost_bulk_flow_count);
 682
 683	/* The get_random_u16() is a way to apply dithering to avoid
 684	 * accumulating roundoff errors
 685	 */
 686	return (q->flow_quantum * quantum_div[host_load] +
 687		get_random_u16()) >> 16;
 688}
 689
 690static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
 691		     int flow_mode, u16 flow_override, u16 host_override)
 692{
 693	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
 694	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
 695	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
 696	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
 697	u16 reduced_hash, srchost_idx, dsthost_idx;
 698	struct flow_keys keys, host_keys;
 699	bool use_skbhash = skb->l4_hash;
 700
 701	if (unlikely(flow_mode == CAKE_FLOW_NONE))
 702		return 0;
 703
 704	/* If both overrides are set, or we can use the SKB hash and nat mode is
 705	 * disabled, we can skip packet dissection entirely. If nat mode is
 706	 * enabled there's another check below after doing the conntrack lookup.
 707	 */
 708	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
 709		goto skip_hash;
 710
 711	skb_flow_dissect_flow_keys(skb, &keys,
 712				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
 713
 714	/* Don't use the SKB hash if we change the lookup keys from conntrack */
 715	if (nat_enabled && cake_update_flowkeys(&keys, skb))
 716		use_skbhash = false;
 717
 718	/* If we can still use the SKB hash and don't need the host hash, we can
 719	 * skip the rest of the hashing procedure
 720	 */
 721	if (use_skbhash && !hash_hosts)
 722		goto skip_hash;
 723
 724	/* flow_hash_from_keys() sorts the addresses by value, so we have
 725	 * to preserve their order in a separate data structure to treat
 726	 * src and dst host addresses as independently selectable.
 727	 */
 728	host_keys = keys;
 729	host_keys.ports.ports     = 0;
 730	host_keys.basic.ip_proto  = 0;
 731	host_keys.keyid.keyid     = 0;
 732	host_keys.tags.flow_label = 0;
 733
 734	switch (host_keys.control.addr_type) {
 735	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
 736		host_keys.addrs.v4addrs.src = 0;
 737		dsthost_hash = flow_hash_from_keys(&host_keys);
 738		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
 739		host_keys.addrs.v4addrs.dst = 0;
 740		srchost_hash = flow_hash_from_keys(&host_keys);
 741		break;
 742
 743	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
 744		memset(&host_keys.addrs.v6addrs.src, 0,
 745		       sizeof(host_keys.addrs.v6addrs.src));
 746		dsthost_hash = flow_hash_from_keys(&host_keys);
 747		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
 748		memset(&host_keys.addrs.v6addrs.dst, 0,
 749		       sizeof(host_keys.addrs.v6addrs.dst));
 750		srchost_hash = flow_hash_from_keys(&host_keys);
 751		break;
 752
 753	default:
 754		dsthost_hash = 0;
 755		srchost_hash = 0;
 756	}
 757
 758	/* This *must* be after the above switch, since as a
 759	 * side-effect it sorts the src and dst addresses.
 760	 */
 761	if (hash_flows && !use_skbhash)
 762		flow_hash = flow_hash_from_keys(&keys);
 763
 764skip_hash:
 765	if (flow_override)
 766		flow_hash = flow_override - 1;
 767	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
 768		flow_hash = skb->hash;
 769	if (host_override) {
 770		dsthost_hash = host_override - 1;
 771		srchost_hash = host_override - 1;
 772	}
 773
 774	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
 775		if (flow_mode & CAKE_FLOW_SRC_IP)
 776			flow_hash ^= srchost_hash;
 777
 778		if (flow_mode & CAKE_FLOW_DST_IP)
 779			flow_hash ^= dsthost_hash;
 780	}
 781
 782	reduced_hash = flow_hash % CAKE_QUEUES;
 783
 784	/* set-associative hashing */
 785	/* fast path if no hash collision (direct lookup succeeds) */
 786	if (likely(q->tags[reduced_hash] == flow_hash &&
 787		   q->flows[reduced_hash].set)) {
 788		q->way_directs++;
 789	} else {
 790		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
 791		u32 outer_hash = reduced_hash - inner_hash;
 792		bool allocate_src = false;
 793		bool allocate_dst = false;
 794		u32 i, k;
 795
 796		/* check if any active queue in the set is reserved for
 797		 * this flow.
 798		 */
 799		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 800		     i++, k = (k + 1) % CAKE_SET_WAYS) {
 801			if (q->tags[outer_hash + k] == flow_hash) {
 802				if (i)
 803					q->way_hits++;
 804
 805				if (!q->flows[outer_hash + k].set) {
 806					/* need to increment host refcnts */
 807					allocate_src = cake_dsrc(flow_mode);
 808					allocate_dst = cake_ddst(flow_mode);
 809				}
 810
 811				goto found;
 812			}
 813		}
 814
 815		/* no queue is reserved for this flow, look for an
 816		 * empty one.
 817		 */
 818		for (i = 0; i < CAKE_SET_WAYS;
 819			 i++, k = (k + 1) % CAKE_SET_WAYS) {
 820			if (!q->flows[outer_hash + k].set) {
 821				q->way_misses++;
 822				allocate_src = cake_dsrc(flow_mode);
 823				allocate_dst = cake_ddst(flow_mode);
 824				goto found;
 825			}
 826		}
 827
 828		/* With no empty queues, default to the original
 829		 * queue, accept the collision, update the host tags.
 830		 */
 831		q->way_collisions++;
 832		allocate_src = cake_dsrc(flow_mode);
 833		allocate_dst = cake_ddst(flow_mode);
 834
 835		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
 836			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
 837			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
 838		}
 839found:
 840		/* reserve queue for future packets in same flow */
 841		reduced_hash = outer_hash + k;
 842		q->tags[reduced_hash] = flow_hash;
 843
 844		if (allocate_src) {
 845			srchost_idx = srchost_hash % CAKE_QUEUES;
 846			inner_hash = srchost_idx % CAKE_SET_WAYS;
 847			outer_hash = srchost_idx - inner_hash;
 848			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 849				i++, k = (k + 1) % CAKE_SET_WAYS) {
 850				if (q->hosts[outer_hash + k].srchost_tag ==
 851				    srchost_hash)
 852					goto found_src;
 853			}
 854			for (i = 0; i < CAKE_SET_WAYS;
 855				i++, k = (k + 1) % CAKE_SET_WAYS) {
 856				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
 857					break;
 858			}
 859			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
 860found_src:
 861			srchost_idx = outer_hash + k;
 862			q->flows[reduced_hash].srchost = srchost_idx;
 863
 864			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
 865				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
 866		}
 867
 868		if (allocate_dst) {
 869			dsthost_idx = dsthost_hash % CAKE_QUEUES;
 870			inner_hash = dsthost_idx % CAKE_SET_WAYS;
 871			outer_hash = dsthost_idx - inner_hash;
 872			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
 873			     i++, k = (k + 1) % CAKE_SET_WAYS) {
 874				if (q->hosts[outer_hash + k].dsthost_tag ==
 875				    dsthost_hash)
 876					goto found_dst;
 877			}
 878			for (i = 0; i < CAKE_SET_WAYS;
 879			     i++, k = (k + 1) % CAKE_SET_WAYS) {
 880				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
 881					break;
 882			}
 883			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
 884found_dst:
 885			dsthost_idx = outer_hash + k;
 886			q->flows[reduced_hash].dsthost = dsthost_idx;
 887
 888			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
 889				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
 890		}
 891	}
 892
 893	return reduced_hash;
 894}
 895
 896/* helper functions : might be changed when/if skb use a standard list_head */
 897/* remove one skb from head of slot queue */
 898
 899static struct sk_buff *dequeue_head(struct cake_flow *flow)
 900{
 901	struct sk_buff *skb = flow->head;
 902
 903	if (skb) {
 904		flow->head = skb->next;
 905		skb_mark_not_on_list(skb);
 906	}
 907
 908	return skb;
 909}
 910
 911/* add skb to flow queue (tail add) */
 912
 913static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
 914{
 915	if (!flow->head)
 916		flow->head = skb;
 917	else
 918		flow->tail->next = skb;
 919	flow->tail = skb;
 920	skb->next = NULL;
 921}
 922
 923static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
 924				    struct ipv6hdr *buf)
 925{
 926	unsigned int offset = skb_network_offset(skb);
 927	struct iphdr *iph;
 928
 929	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
 930
 931	if (!iph)
 932		return NULL;
 933
 934	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
 935		return skb_header_pointer(skb, offset + iph->ihl * 4,
 936					  sizeof(struct ipv6hdr), buf);
 937
 938	else if (iph->version == 4)
 939		return iph;
 940
 941	else if (iph->version == 6)
 942		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
 943					  buf);
 944
 945	return NULL;
 946}
 947
 948static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
 949				      void *buf, unsigned int bufsize)
 950{
 951	unsigned int offset = skb_network_offset(skb);
 952	const struct ipv6hdr *ipv6h;
 953	const struct tcphdr *tcph;
 954	const struct iphdr *iph;
 955	struct ipv6hdr _ipv6h;
 956	struct tcphdr _tcph;
 957
 958	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
 959
 960	if (!ipv6h)
 961		return NULL;
 962
 963	if (ipv6h->version == 4) {
 964		iph = (struct iphdr *)ipv6h;
 965		offset += iph->ihl * 4;
 966
 967		/* special-case 6in4 tunnelling, as that is a common way to get
 968		 * v6 connectivity in the home
 969		 */
 970		if (iph->protocol == IPPROTO_IPV6) {
 971			ipv6h = skb_header_pointer(skb, offset,
 972						   sizeof(_ipv6h), &_ipv6h);
 973
 974			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
 975				return NULL;
 976
 977			offset += sizeof(struct ipv6hdr);
 978
 979		} else if (iph->protocol != IPPROTO_TCP) {
 980			return NULL;
 981		}
 982
 983	} else if (ipv6h->version == 6) {
 984		if (ipv6h->nexthdr != IPPROTO_TCP)
 985			return NULL;
 986
 987		offset += sizeof(struct ipv6hdr);
 988	} else {
 989		return NULL;
 990	}
 991
 992	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
 993	if (!tcph || tcph->doff < 5)
 994		return NULL;
 995
 996	return skb_header_pointer(skb, offset,
 997				  min(__tcp_hdrlen(tcph), bufsize), buf);
 998}
 999
1000static const void *cake_get_tcpopt(const struct tcphdr *tcph,
1001				   int code, int *oplen)
1002{
1003	/* inspired by tcp_parse_options in tcp_input.c */
1004	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1005	const u8 *ptr = (const u8 *)(tcph + 1);
1006
1007	while (length > 0) {
1008		int opcode = *ptr++;
1009		int opsize;
1010
1011		if (opcode == TCPOPT_EOL)
1012			break;
1013		if (opcode == TCPOPT_NOP) {
1014			length--;
1015			continue;
1016		}
1017		if (length < 2)
1018			break;
1019		opsize = *ptr++;
1020		if (opsize < 2 || opsize > length)
1021			break;
1022
1023		if (opcode == code) {
1024			*oplen = opsize;
1025			return ptr;
1026		}
1027
1028		ptr += opsize - 2;
1029		length -= opsize;
1030	}
1031
1032	return NULL;
1033}
1034
1035/* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1036 * bytes than the other. In the case where both sequences ACKs bytes that the
1037 * other doesn't, A is considered greater. DSACKs in A also makes A be
1038 * considered greater.
1039 *
1040 * @return -1, 0 or 1 as normal compare functions
1041 */
1042static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1043				  const struct tcphdr *tcph_b)
1044{
1045	const struct tcp_sack_block_wire *sack_a, *sack_b;
1046	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1047	u32 bytes_a = 0, bytes_b = 0;
1048	int oplen_a, oplen_b;
1049	bool first = true;
1050
1051	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1052	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1053
1054	/* pointers point to option contents */
1055	oplen_a -= TCPOLEN_SACK_BASE;
1056	oplen_b -= TCPOLEN_SACK_BASE;
1057
1058	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1059	    (!sack_b || oplen_b < sizeof(*sack_b)))
1060		return -1;
1061	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1062		 (!sack_a || oplen_a < sizeof(*sack_a)))
1063		return 1;
1064	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1065		 (!sack_b || oplen_b < sizeof(*sack_b)))
1066		return 0;
1067
1068	while (oplen_a >= sizeof(*sack_a)) {
1069		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1070		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1071		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1072		int oplen_tmp = oplen_b;
1073		bool found = false;
1074
1075		/* DSACK; always considered greater to prevent dropping */
1076		if (before(start_a, ack_seq_a))
1077			return -1;
1078
1079		bytes_a += end_a - start_a;
1080
1081		while (oplen_tmp >= sizeof(*sack_tmp)) {
1082			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1083			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1084
1085			/* first time through we count the total size */
1086			if (first)
1087				bytes_b += end_b - start_b;
1088
1089			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1090				found = true;
1091				if (!first)
1092					break;
1093			}
1094			oplen_tmp -= sizeof(*sack_tmp);
1095			sack_tmp++;
1096		}
1097
1098		if (!found)
1099			return -1;
1100
1101		oplen_a -= sizeof(*sack_a);
1102		sack_a++;
1103		first = false;
1104	}
1105
1106	/* If we made it this far, all ranges SACKed by A are covered by B, so
1107	 * either the SACKs are equal, or B SACKs more bytes.
1108	 */
1109	return bytes_b > bytes_a ? 1 : 0;
1110}
1111
1112static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1113				 u32 *tsval, u32 *tsecr)
1114{
1115	const u8 *ptr;
1116	int opsize;
1117
1118	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1119
1120	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1121		*tsval = get_unaligned_be32(ptr);
1122		*tsecr = get_unaligned_be32(ptr + 4);
1123	}
1124}
1125
1126static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1127			       u32 tstamp_new, u32 tsecr_new)
1128{
1129	/* inspired by tcp_parse_options in tcp_input.c */
1130	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1131	const u8 *ptr = (const u8 *)(tcph + 1);
1132	u32 tstamp, tsecr;
1133
1134	/* 3 reserved flags must be unset to avoid future breakage
1135	 * ACK must be set
1136	 * ECE/CWR are handled separately
1137	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1138	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1139	 * 0x00C00000 = CWR/ECE (handled separately)
1140	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1141	 */
1142	if (((tcp_flag_word(tcph) &
1143	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1144		return false;
1145
1146	while (length > 0) {
1147		int opcode = *ptr++;
1148		int opsize;
1149
1150		if (opcode == TCPOPT_EOL)
1151			break;
1152		if (opcode == TCPOPT_NOP) {
1153			length--;
1154			continue;
1155		}
1156		if (length < 2)
1157			break;
1158		opsize = *ptr++;
1159		if (opsize < 2 || opsize > length)
1160			break;
1161
1162		switch (opcode) {
1163		case TCPOPT_MD5SIG: /* doesn't influence state */
1164			break;
1165
1166		case TCPOPT_SACK: /* stricter checking performed later */
1167			if (opsize % 8 != 2)
1168				return false;
1169			break;
1170
1171		case TCPOPT_TIMESTAMP:
1172			/* only drop timestamps lower than new */
1173			if (opsize != TCPOLEN_TIMESTAMP)
1174				return false;
1175			tstamp = get_unaligned_be32(ptr);
1176			tsecr = get_unaligned_be32(ptr + 4);
1177			if (after(tstamp, tstamp_new) ||
1178			    after(tsecr, tsecr_new))
1179				return false;
1180			break;
1181
1182		case TCPOPT_MSS:  /* these should only be set on SYN */
1183		case TCPOPT_WINDOW:
1184		case TCPOPT_SACK_PERM:
1185		case TCPOPT_FASTOPEN:
1186		case TCPOPT_EXP:
1187		default: /* don't drop if any unknown options are present */
1188			return false;
1189		}
1190
1191		ptr += opsize - 2;
1192		length -= opsize;
1193	}
1194
1195	return true;
1196}
1197
1198static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1199				       struct cake_flow *flow)
1200{
1201	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1202	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1203	struct sk_buff *skb_check, *skb_prev = NULL;
1204	const struct ipv6hdr *ipv6h, *ipv6h_check;
1205	unsigned char _tcph[64], _tcph_check[64];
1206	const struct tcphdr *tcph, *tcph_check;
1207	const struct iphdr *iph, *iph_check;
1208	struct ipv6hdr _iph, _iph_check;
1209	const struct sk_buff *skb;
1210	int seglen, num_found = 0;
1211	u32 tstamp = 0, tsecr = 0;
1212	__be32 elig_flags = 0;
1213	int sack_comp;
1214
1215	/* no other possible ACKs to filter */
1216	if (flow->head == flow->tail)
1217		return NULL;
1218
1219	skb = flow->tail;
1220	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1221	iph = cake_get_iphdr(skb, &_iph);
1222	if (!tcph)
1223		return NULL;
1224
1225	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1226
1227	/* the 'triggering' packet need only have the ACK flag set.
1228	 * also check that SYN is not set, as there won't be any previous ACKs.
1229	 */
1230	if ((tcp_flag_word(tcph) &
1231	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1232		return NULL;
1233
1234	/* the 'triggering' ACK is at the tail of the queue, we have already
1235	 * returned if it is the only packet in the flow. loop through the rest
1236	 * of the queue looking for pure ACKs with the same 5-tuple as the
1237	 * triggering one.
1238	 */
1239	for (skb_check = flow->head;
1240	     skb_check && skb_check != skb;
1241	     skb_prev = skb_check, skb_check = skb_check->next) {
1242		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1243		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1244					     sizeof(_tcph_check));
1245
1246		/* only TCP packets with matching 5-tuple are eligible, and only
1247		 * drop safe headers
1248		 */
1249		if (!tcph_check || iph->version != iph_check->version ||
1250		    tcph_check->source != tcph->source ||
1251		    tcph_check->dest != tcph->dest)
1252			continue;
1253
1254		if (iph_check->version == 4) {
1255			if (iph_check->saddr != iph->saddr ||
1256			    iph_check->daddr != iph->daddr)
1257				continue;
1258
1259			seglen = iph_totlen(skb, iph_check) -
1260				       (4 * iph_check->ihl);
1261		} else if (iph_check->version == 6) {
1262			ipv6h = (struct ipv6hdr *)iph;
1263			ipv6h_check = (struct ipv6hdr *)iph_check;
1264
1265			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1266			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1267				continue;
1268
1269			seglen = ntohs(ipv6h_check->payload_len);
1270		} else {
1271			WARN_ON(1);  /* shouldn't happen */
1272			continue;
1273		}
1274
1275		/* If the ECE/CWR flags changed from the previous eligible
1276		 * packet in the same flow, we should no longer be dropping that
1277		 * previous packet as this would lose information.
1278		 */
1279		if (elig_ack && (tcp_flag_word(tcph_check) &
1280				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1281			elig_ack = NULL;
1282			elig_ack_prev = NULL;
1283			num_found--;
1284		}
1285
1286		/* Check TCP options and flags, don't drop ACKs with segment
1287		 * data, and don't drop ACKs with a higher cumulative ACK
1288		 * counter than the triggering packet. Check ACK seqno here to
1289		 * avoid parsing SACK options of packets we are going to exclude
1290		 * anyway.
1291		 */
1292		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1293		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1294		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1295			continue;
1296
1297		/* Check SACK options. The triggering packet must SACK more data
1298		 * than the ACK under consideration, or SACK the same range but
1299		 * have a larger cumulative ACK counter. The latter is a
1300		 * pathological case, but is contained in the following check
1301		 * anyway, just to be safe.
1302		 */
1303		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1304
1305		if (sack_comp < 0 ||
1306		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1307		     sack_comp == 0))
1308			continue;
1309
1310		/* At this point we have found an eligible pure ACK to drop; if
1311		 * we are in aggressive mode, we are done. Otherwise, keep
1312		 * searching unless this is the second eligible ACK we
1313		 * found.
1314		 *
1315		 * Since we want to drop ACK closest to the head of the queue,
1316		 * save the first eligible ACK we find, even if we need to loop
1317		 * again.
1318		 */
1319		if (!elig_ack) {
1320			elig_ack = skb_check;
1321			elig_ack_prev = skb_prev;
1322			elig_flags = (tcp_flag_word(tcph_check)
1323				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1324		}
1325
1326		if (num_found++ > 0)
1327			goto found;
1328	}
1329
1330	/* We made it through the queue without finding two eligible ACKs . If
1331	 * we found a single eligible ACK we can drop it in aggressive mode if
1332	 * we can guarantee that this does not interfere with ECN flag
1333	 * information. We ensure this by dropping it only if the enqueued
1334	 * packet is consecutive with the eligible ACK, and their flags match.
1335	 */
1336	if (elig_ack && aggressive && elig_ack->next == skb &&
1337	    (elig_flags == (tcp_flag_word(tcph) &
1338			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1339		goto found;
1340
1341	return NULL;
1342
1343found:
1344	if (elig_ack_prev)
1345		elig_ack_prev->next = elig_ack->next;
1346	else
1347		flow->head = elig_ack->next;
1348
1349	skb_mark_not_on_list(elig_ack);
1350
1351	return elig_ack;
1352}
1353
1354static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1355{
1356	avg -= avg >> shift;
1357	avg += sample >> shift;
1358	return avg;
1359}
1360
1361static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1362{
1363	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1364		len -= off;
1365
1366	if (q->max_netlen < len)
1367		q->max_netlen = len;
1368	if (q->min_netlen > len)
1369		q->min_netlen = len;
1370
1371	len += q->rate_overhead;
1372
1373	if (len < q->rate_mpu)
1374		len = q->rate_mpu;
1375
1376	if (q->atm_mode == CAKE_ATM_ATM) {
1377		len += 47;
1378		len /= 48;
1379		len *= 53;
1380	} else if (q->atm_mode == CAKE_ATM_PTM) {
1381		/* Add one byte per 64 bytes or part thereof.
1382		 * This is conservative and easier to calculate than the
1383		 * precise value.
1384		 */
1385		len += (len + 63) / 64;
1386	}
1387
1388	if (q->max_adjlen < len)
1389		q->max_adjlen = len;
1390	if (q->min_adjlen > len)
1391		q->min_adjlen = len;
1392
1393	return len;
1394}
1395
1396static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1397{
1398	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1399	unsigned int hdr_len, last_len = 0;
1400	u32 off = skb_network_offset(skb);
1401	u16 segs = qdisc_pkt_segs(skb);
1402	u32 len = qdisc_pkt_len(skb);
1403
1404	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1405
1406	if (segs == 1)
1407		return cake_calc_overhead(q, len, off);
1408
1409	/* borrowed from qdisc_pkt_len_segs_init() */
1410	if (!skb->encapsulation)
1411		hdr_len = skb_transport_offset(skb);
1412	else
1413		hdr_len = skb_inner_transport_offset(skb);
1414
1415	/* + transport layer */
1416	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1417						SKB_GSO_TCPV6))) {
1418		const struct tcphdr *th;
1419		struct tcphdr _tcphdr;
1420
1421		th = skb_header_pointer(skb, hdr_len,
1422					sizeof(_tcphdr), &_tcphdr);
1423		if (likely(th))
1424			hdr_len += __tcp_hdrlen(th);
1425	} else {
1426		struct udphdr _udphdr;
1427
1428		if (skb_header_pointer(skb, hdr_len,
1429				       sizeof(_udphdr), &_udphdr))
1430			hdr_len += sizeof(struct udphdr);
1431	}
1432
1433	len = shinfo->gso_size + hdr_len;
1434	last_len = skb->len - shinfo->gso_size * (segs - 1);
1435
1436	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1437		cake_calc_overhead(q, last_len, off));
1438}
1439
1440static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1441{
1442	struct cake_heap_entry ii = q->overflow_heap[i];
1443	struct cake_heap_entry jj = q->overflow_heap[j];
1444
1445	q->overflow_heap[i] = jj;
1446	q->overflow_heap[j] = ii;
1447
1448	q->tins[ii.t].overflow_idx[ii.b] = j;
1449	q->tins[jj.t].overflow_idx[jj.b] = i;
1450}
1451
1452static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1453{
1454	struct cake_heap_entry ii = q->overflow_heap[i];
1455
1456	return q->tins[ii.t].backlogs[ii.b];
1457}
1458
1459static void cake_heapify(struct cake_sched_data *q, u16 i)
1460{
1461	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1462	u32 mb = cake_heap_get_backlog(q, i);
1463	u32 m = i;
1464
1465	while (m < a) {
1466		u32 l = m + m + 1;
1467		u32 r = l + 1;
1468
1469		if (l < a) {
1470			u32 lb = cake_heap_get_backlog(q, l);
1471
1472			if (lb > mb) {
1473				m  = l;
1474				mb = lb;
1475			}
1476		}
1477
1478		if (r < a) {
1479			u32 rb = cake_heap_get_backlog(q, r);
1480
1481			if (rb > mb) {
1482				m  = r;
1483				mb = rb;
1484			}
1485		}
1486
1487		if (m != i) {
1488			cake_heap_swap(q, i, m);
1489			i = m;
1490		} else {
1491			break;
1492		}
1493	}
1494}
1495
1496static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1497{
1498	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1499		u16 p = (i - 1) >> 1;
1500		u32 ib = cake_heap_get_backlog(q, i);
1501		u32 pb = cake_heap_get_backlog(q, p);
1502
1503		if (ib > pb) {
1504			cake_heap_swap(q, i, p);
1505			i = p;
1506		} else {
1507			break;
1508		}
1509	}
1510}
1511
1512static int cake_advance_shaper(struct cake_sched_data *q,
1513			       struct cake_tin_data *b,
1514			       struct sk_buff *skb,
1515			       ktime_t now, bool drop)
1516{
1517	u32 len = get_cobalt_cb(skb)->adjusted_len;
1518
1519	/* charge packet bandwidth to this tin
1520	 * and to the global shaper.
1521	 */
1522	if (q->rate_ns) {
1523		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1524		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1525		u64 failsafe_dur = global_dur + (global_dur >> 1);
1526
1527		if (ktime_before(b->time_next_packet, now))
1528			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1529							   tin_dur);
1530
1531		else if (ktime_before(b->time_next_packet,
1532				      ktime_add_ns(now, tin_dur)))
1533			b->time_next_packet = ktime_add_ns(now, tin_dur);
1534
1535		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1536						   global_dur);
1537		if (!drop)
1538			q->failsafe_next_packet = \
1539				ktime_add_ns(q->failsafe_next_packet,
1540					     failsafe_dur);
1541	}
1542	return len;
1543}
1544
1545static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1546{
1547	struct cake_sched_data *q = qdisc_priv(sch);
1548	ktime_t now = ktime_get();
1549	u32 idx = 0, tin = 0, len;
1550	struct cake_heap_entry qq;
1551	struct cake_tin_data *b;
1552	struct cake_flow *flow;
1553	struct sk_buff *skb;
1554
1555	if (!q->overflow_timeout) {
1556		int i;
1557		/* Build fresh max-heap */
1558		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
1559			cake_heapify(q, i);
1560	}
1561	q->overflow_timeout = 65535;
1562
1563	/* select longest queue for pruning */
1564	qq  = q->overflow_heap[0];
1565	tin = qq.t;
1566	idx = qq.b;
1567
1568	b = &q->tins[tin];
1569	flow = &b->flows[idx];
1570	skb = dequeue_head(flow);
1571	if (unlikely(!skb)) {
1572		/* heap has gone wrong, rebuild it next time */
1573		q->overflow_timeout = 0;
1574		return idx + (tin << 16);
1575	}
1576
1577	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1578		b->unresponsive_flow_count++;
1579
1580	len = qdisc_pkt_len(skb);
1581	q->buffer_used      -= skb->truesize;
1582	b->backlogs[idx]    -= len;
1583	b->tin_backlog      -= len;
1584	sch->qstats.backlog -= len;
1585
1586	flow->dropped++;
1587	b->tin_dropped++;
1588
1589	if (q->rate_flags & CAKE_FLAG_INGRESS)
1590		cake_advance_shaper(q, b, skb, now, true);
1591
1592	qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
1593	sch->q.qlen--;
1594
1595	cake_heapify(q, 0);
1596
1597	return idx + (tin << 16);
1598}
1599
1600static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1601{
1602	const int offset = skb_network_offset(skb);
1603	u16 *buf, buf_;
1604	u8 dscp;
1605
1606	switch (skb_protocol(skb, true)) {
1607	case htons(ETH_P_IP):
1608		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1609		if (unlikely(!buf))
1610			return 0;
1611
1612		/* ToS is in the second byte of iphdr */
1613		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1614
1615		if (wash && dscp) {
1616			const int wlen = offset + sizeof(struct iphdr);
1617
1618			if (!pskb_may_pull(skb, wlen) ||
1619			    skb_try_make_writable(skb, wlen))
1620				return 0;
1621
1622			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1623		}
1624
1625		return dscp;
1626
1627	case htons(ETH_P_IPV6):
1628		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1629		if (unlikely(!buf))
1630			return 0;
1631
1632		/* Traffic class is in the first and second bytes of ipv6hdr */
1633		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1634
1635		if (wash && dscp) {
1636			const int wlen = offset + sizeof(struct ipv6hdr);
1637
1638			if (!pskb_may_pull(skb, wlen) ||
1639			    skb_try_make_writable(skb, wlen))
1640				return 0;
1641
1642			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1643		}
1644
1645		return dscp;
1646
1647	case htons(ETH_P_ARP):
1648		return 0x38;  /* CS7 - Net Control */
1649
1650	default:
1651		/* If there is no Diffserv field, treat as best-effort */
1652		return 0;
1653	}
1654}
1655
1656static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1657					     struct sk_buff *skb)
1658{
1659	struct cake_sched_data *q = qdisc_priv(sch);
1660	u32 tin, mark;
1661	bool wash;
1662	u8 dscp;
1663
1664	/* Tin selection: Default to diffserv-based selection, allow overriding
1665	 * using firewall marks or skb->priority. Call DSCP parsing early if
1666	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1667	 */
1668	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1669	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1670	if (wash)
1671		dscp = cake_handle_diffserv(skb, wash);
1672
1673	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1674		tin = 0;
1675
1676	else if (mark && mark <= q->tin_cnt)
1677		tin = q->tin_order[mark - 1];
1678
1679	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1680		 TC_H_MIN(skb->priority) > 0 &&
1681		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1682		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1683
1684	else {
1685		if (!wash)
1686			dscp = cake_handle_diffserv(skb, wash);
1687		tin = q->tin_index[dscp];
1688
1689		if (unlikely(tin >= q->tin_cnt))
1690			tin = 0;
1691	}
1692
1693	return &q->tins[tin];
1694}
1695
1696static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1697			 struct sk_buff *skb, int flow_mode, int *qerr)
1698{
1699	struct cake_sched_data *q = qdisc_priv(sch);
1700	struct tcf_proto *filter;
1701	struct tcf_result res;
1702	u16 flow = 0, host = 0;
1703	int result;
1704
1705	filter = rcu_dereference_bh(q->filter_list);
1706	if (!filter)
1707		goto hash;
1708
1709	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1710	result = tcf_classify(skb, NULL, filter, &res, false);
1711
1712	if (result >= 0) {
1713#ifdef CONFIG_NET_CLS_ACT
1714		switch (result) {
1715		case TC_ACT_STOLEN:
1716		case TC_ACT_QUEUED:
1717		case TC_ACT_TRAP:
1718			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1719			fallthrough;
1720		case TC_ACT_SHOT:
1721			return 0;
1722		}
1723#endif
1724		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1725			flow = TC_H_MIN(res.classid);
1726		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1727			host = TC_H_MAJ(res.classid) >> 16;
1728	}
1729hash:
1730	*t = cake_select_tin(sch, skb);
1731	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1732}
1733
1734static void cake_reconfigure(struct Qdisc *sch);
1735
1736static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1737			struct sk_buff **to_free)
1738{
1739	u32 idx, tin, prev_qlen, prev_backlog, drop_id;
1740	struct cake_sched_data *q = qdisc_priv(sch);
1741	int len = qdisc_pkt_len(skb), ret;
1742	struct sk_buff *ack = NULL;
1743	ktime_t now = ktime_get();
1744	struct cake_tin_data *b;
1745	struct cake_flow *flow;
1746	bool same_flow = false;
1747
1748	/* choose flow to insert into */
1749	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1750	if (idx == 0) {
1751		if (ret & __NET_XMIT_BYPASS)
1752			qdisc_qstats_drop(sch);
1753		__qdisc_drop(skb, to_free);
1754		return ret;
1755	}
1756	tin = (u32)(b - q->tins);
1757	idx--;
1758	flow = &b->flows[idx];
1759
1760	/* ensure shaper state isn't stale */
1761	if (!b->tin_backlog) {
1762		if (ktime_before(b->time_next_packet, now))
1763			b->time_next_packet = now;
1764
1765		if (!sch->q.qlen) {
1766			if (ktime_before(q->time_next_packet, now)) {
1767				q->failsafe_next_packet = now;
1768				q->time_next_packet = now;
1769			} else if (ktime_after(q->time_next_packet, now) &&
1770				   ktime_after(q->failsafe_next_packet, now)) {
1771				u64 next = \
1772					min(ktime_to_ns(q->time_next_packet),
1773					    ktime_to_ns(
1774						   q->failsafe_next_packet));
1775				sch->qstats.overlimits++;
1776				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1777			}
1778		}
1779	}
1780
1781	if (unlikely(len > b->max_skblen))
1782		b->max_skblen = len;
1783
1784	if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1785		struct sk_buff *segs, *nskb;
1786		netdev_features_t features = netif_skb_features(skb);
1787		unsigned int slen = 0, numsegs = 0;
1788
1789		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1790		if (IS_ERR_OR_NULL(segs))
1791			return qdisc_drop(skb, sch, to_free);
1792
1793		skb_list_walk_safe(segs, segs, nskb) {
1794			skb_mark_not_on_list(segs);
1795			qdisc_skb_cb(segs)->pkt_len = segs->len;
1796			qdisc_skb_cb(segs)->pkt_segs = 1;
1797			cobalt_set_enqueue_time(segs, now);
1798			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1799									  segs);
1800			flow_queue_add(flow, segs);
1801
1802			sch->q.qlen++;
1803			numsegs++;
1804			slen += segs->len;
1805			q->buffer_used += segs->truesize;
1806			b->packets++;
1807		}
1808
1809		/* stats */
1810		b->bytes	    += slen;
1811		b->backlogs[idx]    += slen;
1812		b->tin_backlog      += slen;
1813		sch->qstats.backlog += slen;
1814		q->avg_window_bytes += slen;
1815
1816		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1817		consume_skb(skb);
1818	} else {
1819		/* not splitting */
1820		int ack_pkt_len = 0;
1821
1822		cobalt_set_enqueue_time(skb, now);
1823		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1824		flow_queue_add(flow, skb);
1825
1826		if (q->ack_filter)
1827			ack = cake_ack_filter(q, flow);
1828
1829		if (ack) {
1830			b->ack_drops++;
1831			sch->qstats.drops++;
1832			ack_pkt_len = qdisc_pkt_len(ack);
1833			b->bytes += ack_pkt_len;
1834			q->buffer_used += skb->truesize - ack->truesize;
1835			if (q->rate_flags & CAKE_FLAG_INGRESS)
1836				cake_advance_shaper(q, b, ack, now, true);
1837
1838			qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len);
1839			consume_skb(ack);
1840		} else {
1841			sch->q.qlen++;
1842			q->buffer_used      += skb->truesize;
1843		}
1844
1845		/* stats */
1846		b->packets++;
1847		b->bytes	    += len - ack_pkt_len;
1848		b->backlogs[idx]    += len - ack_pkt_len;
1849		b->tin_backlog      += len - ack_pkt_len;
1850		sch->qstats.backlog += len - ack_pkt_len;
1851		q->avg_window_bytes += len - ack_pkt_len;
1852	}
1853
1854	if (q->overflow_timeout)
1855		cake_heapify_up(q, b->overflow_idx[idx]);
1856
1857	/* incoming bandwidth capacity estimate */
1858	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1859		u64 packet_interval = \
1860			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1861
1862		if (packet_interval > NSEC_PER_SEC)
1863			packet_interval = NSEC_PER_SEC;
1864
1865		/* filter out short-term bursts, eg. wifi aggregation */
1866		q->avg_packet_interval = \
1867			cake_ewma(q->avg_packet_interval,
1868				  packet_interval,
1869				  (packet_interval > q->avg_packet_interval ?
1870					  2 : 8));
1871
1872		q->last_packet_time = now;
1873
1874		if (packet_interval > q->avg_packet_interval) {
1875			u64 window_interval = \
1876				ktime_to_ns(ktime_sub(now,
1877						      q->avg_window_begin));
1878			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1879
1880			b = div64_u64(b, window_interval);
1881			q->avg_peak_bandwidth =
1882				cake_ewma(q->avg_peak_bandwidth, b,
1883					  b > q->avg_peak_bandwidth ? 2 : 8);
1884			q->avg_window_bytes = 0;
1885			q->avg_window_begin = now;
1886
1887			if (ktime_after(now,
1888					ktime_add_ms(q->last_reconfig_time,
1889						     250))) {
1890				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1891				cake_reconfigure(sch);
1892			}
1893		}
1894	} else {
1895		q->avg_window_bytes = 0;
1896		q->last_packet_time = now;
1897	}
1898
1899	/* flowchain */
1900	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1901		if (!flow->set) {
1902			list_add_tail(&flow->flowchain, &b->new_flows);
1903		} else {
1904			b->decaying_flow_count--;
1905			list_move_tail(&flow->flowchain, &b->new_flows);
1906		}
1907		flow->set = CAKE_SET_SPARSE;
1908		b->sparse_flow_count++;
1909
1910		flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
1911	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1912		/* this flow was empty, accounted as a sparse flow, but actually
1913		 * in the bulk rotation.
1914		 */
1915		flow->set = CAKE_SET_BULK;
1916		b->sparse_flow_count--;
1917		b->bulk_flow_count++;
1918
1919		cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
1920		cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
1921	}
1922
1923	if (q->buffer_used > q->buffer_max_used)
1924		q->buffer_max_used = q->buffer_used;
1925
1926	if (q->buffer_used <= q->buffer_limit)
1927		return NET_XMIT_SUCCESS;
1928
1929	prev_qlen = sch->q.qlen;
1930	prev_backlog = sch->qstats.backlog;
1931
1932	while (q->buffer_used > q->buffer_limit) {
1933		drop_id = cake_drop(sch, to_free);
1934		if ((drop_id >> 16) == tin &&
1935		    (drop_id & 0xFFFF) == idx)
1936			same_flow = true;
1937	}
1938
1939	prev_qlen -= sch->q.qlen;
1940	prev_backlog -= sch->qstats.backlog;
1941	b->drop_overlimit += prev_qlen;
1942
1943	if (same_flow) {
1944		qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
1945					  prev_backlog - len);
1946		return NET_XMIT_CN;
1947	}
1948	qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
1949	return NET_XMIT_SUCCESS;
1950}
1951
1952static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1953{
1954	struct cake_sched_data *q = qdisc_priv(sch);
1955	struct cake_tin_data *b = &q->tins[q->cur_tin];
1956	struct cake_flow *flow = &b->flows[q->cur_flow];
1957	struct sk_buff *skb = NULL;
1958	u32 len;
1959
1960	if (flow->head) {
1961		skb = dequeue_head(flow);
1962		len = qdisc_pkt_len(skb);
1963		b->backlogs[q->cur_flow] -= len;
1964		b->tin_backlog		 -= len;
1965		sch->qstats.backlog      -= len;
1966		q->buffer_used		 -= skb->truesize;
1967		sch->q.qlen--;
1968
1969		if (q->overflow_timeout)
1970			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1971	}
1972	return skb;
1973}
1974
1975/* Discard leftover packets from a tin no longer in use. */
1976static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1977{
1978	struct cake_sched_data *q = qdisc_priv(sch);
1979	struct sk_buff *skb;
1980
1981	q->cur_tin = tin;
1982	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1983		while (!!(skb = cake_dequeue_one(sch)))
1984			kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
1985}
1986
1987static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1988{
1989	struct cake_sched_data *q = qdisc_priv(sch);
1990	struct cake_tin_data *b = &q->tins[q->cur_tin];
1991	enum skb_drop_reason reason;
1992	ktime_t now = ktime_get();
1993	struct cake_flow *flow;
1994	struct list_head *head;
1995	bool first_flow = true;
1996	struct sk_buff *skb;
1997	u64 delay;
1998	u32 len;
1999
2000begin:
2001	if (!sch->q.qlen)
2002		return NULL;
2003
2004	/* global hard shaper */
2005	if (ktime_after(q->time_next_packet, now) &&
2006	    ktime_after(q->failsafe_next_packet, now)) {
2007		u64 next = min(ktime_to_ns(q->time_next_packet),
2008			       ktime_to_ns(q->failsafe_next_packet));
2009
2010		sch->qstats.overlimits++;
2011		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2012		return NULL;
2013	}
2014
2015	/* Choose a class to work on. */
2016	if (!q->rate_ns) {
2017		/* In unlimited mode, can't rely on shaper timings, just balance
2018		 * with DRR
2019		 */
2020		bool wrapped = false, empty = true;
2021
2022		while (b->tin_deficit < 0 ||
2023		       !(b->sparse_flow_count + b->bulk_flow_count)) {
2024			if (b->tin_deficit <= 0)
2025				b->tin_deficit += b->tin_quantum;
2026			if (b->sparse_flow_count + b->bulk_flow_count)
2027				empty = false;
2028
2029			q->cur_tin++;
2030			b++;
2031			if (q->cur_tin >= q->tin_cnt) {
2032				q->cur_tin = 0;
2033				b = q->tins;
2034
2035				if (wrapped) {
2036					/* It's possible for q->qlen to be
2037					 * nonzero when we actually have no
2038					 * packets anywhere.
2039					 */
2040					if (empty)
2041						return NULL;
2042				} else {
2043					wrapped = true;
2044				}
2045			}
2046		}
2047	} else {
2048		/* In shaped mode, choose:
2049		 * - Highest-priority tin with queue and meeting schedule, or
2050		 * - The earliest-scheduled tin with queue.
2051		 */
2052		ktime_t best_time = KTIME_MAX;
2053		int tin, best_tin = 0;
2054
2055		for (tin = 0; tin < q->tin_cnt; tin++) {
2056			b = q->tins + tin;
2057			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2058				ktime_t time_to_pkt = \
2059					ktime_sub(b->time_next_packet, now);
2060
2061				if (ktime_to_ns(time_to_pkt) <= 0 ||
2062				    ktime_compare(time_to_pkt,
2063						  best_time) <= 0) {
2064					best_time = time_to_pkt;
2065					best_tin = tin;
2066				}
2067			}
2068		}
2069
2070		q->cur_tin = best_tin;
2071		b = q->tins + best_tin;
2072
2073		/* No point in going further if no packets to deliver. */
2074		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2075			return NULL;
2076	}
2077
2078retry:
2079	/* service this class */
2080	head = &b->decaying_flows;
2081	if (!first_flow || list_empty(head)) {
2082		head = &b->new_flows;
2083		if (list_empty(head)) {
2084			head = &b->old_flows;
2085			if (unlikely(list_empty(head))) {
2086				head = &b->decaying_flows;
2087				if (unlikely(list_empty(head)))
2088					goto begin;
2089			}
2090		}
2091	}
2092	flow = list_first_entry(head, struct cake_flow, flowchain);
2093	q->cur_flow = flow - b->flows;
2094	first_flow = false;
2095
2096	/* flow isolation (DRR++) */
2097	if (flow->deficit <= 0) {
2098		/* Keep all flows with deficits out of the sparse and decaying
2099		 * rotations.  No non-empty flow can go into the decaying
2100		 * rotation, so they can't get deficits
2101		 */
2102		if (flow->set == CAKE_SET_SPARSE) {
2103			if (flow->head) {
2104				b->sparse_flow_count--;
2105				b->bulk_flow_count++;
2106
2107				cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
2108				cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2109
2110				flow->set = CAKE_SET_BULK;
2111			} else {
2112				/* we've moved it to the bulk rotation for
2113				 * correct deficit accounting but we still want
2114				 * to count it as a sparse flow, not a bulk one.
2115				 */
2116				flow->set = CAKE_SET_SPARSE_WAIT;
2117			}
2118		}
2119
2120		flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
2121		list_move_tail(&flow->flowchain, &b->old_flows);
2122
2123		goto retry;
2124	}
2125
2126	/* Retrieve a packet via the AQM */
2127	while (1) {
2128		skb = cake_dequeue_one(sch);
2129		if (!skb) {
2130			/* this queue was actually empty */
2131			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2132				b->unresponsive_flow_count--;
2133
2134			if (flow->cvars.p_drop || flow->cvars.count ||
2135			    ktime_before(now, flow->cvars.drop_next)) {
2136				/* keep in the flowchain until the state has
2137				 * decayed to rest
2138				 */
2139				list_move_tail(&flow->flowchain,
2140					       &b->decaying_flows);
2141				if (flow->set == CAKE_SET_BULK) {
2142					b->bulk_flow_count--;
2143
2144					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2145					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2146
2147					b->decaying_flow_count++;
2148				} else if (flow->set == CAKE_SET_SPARSE ||
2149					   flow->set == CAKE_SET_SPARSE_WAIT) {
2150					b->sparse_flow_count--;
2151					b->decaying_flow_count++;
2152				}
2153				flow->set = CAKE_SET_DECAYING;
2154			} else {
2155				/* remove empty queue from the flowchain */
2156				list_del_init(&flow->flowchain);
2157				if (flow->set == CAKE_SET_SPARSE ||
2158				    flow->set == CAKE_SET_SPARSE_WAIT)
2159					b->sparse_flow_count--;
2160				else if (flow->set == CAKE_SET_BULK) {
2161					b->bulk_flow_count--;
2162
2163					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2164					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2165				} else
2166					b->decaying_flow_count--;
2167
2168				flow->set = CAKE_SET_NONE;
2169			}
2170			goto begin;
2171		}
2172
2173		reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2174					    (b->bulk_flow_count *
2175					     !!(q->rate_flags &
2176						CAKE_FLAG_INGRESS)));
2177		/* Last packet in queue may be marked, shouldn't be dropped */
2178		if (reason == SKB_NOT_DROPPED_YET || !flow->head)
2179			break;
2180
2181		/* drop this packet, get another one */
2182		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2183			len = cake_advance_shaper(q, b, skb,
2184						  now, true);
2185			flow->deficit -= len;
2186			b->tin_deficit -= len;
2187		}
2188		flow->dropped++;
2189		b->tin_dropped++;
2190		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2191		qdisc_qstats_drop(sch);
2192		qdisc_dequeue_drop(sch, skb, reason);
2193		if (q->rate_flags & CAKE_FLAG_INGRESS)
2194			goto retry;
2195	}
2196
2197	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2198	qdisc_bstats_update(sch, skb);
2199
2200	/* collect delay stats */
2201	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2202	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2203	b->peak_delay = cake_ewma(b->peak_delay, delay,
2204				  delay > b->peak_delay ? 2 : 8);
2205	b->base_delay = cake_ewma(b->base_delay, delay,
2206				  delay < b->base_delay ? 2 : 8);
2207
2208	len = cake_advance_shaper(q, b, skb, now, false);
2209	flow->deficit -= len;
2210	b->tin_deficit -= len;
2211
2212	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2213		u64 next = min(ktime_to_ns(q->time_next_packet),
2214			       ktime_to_ns(q->failsafe_next_packet));
2215
2216		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2217	} else if (!sch->q.qlen) {
2218		int i;
2219
2220		for (i = 0; i < q->tin_cnt; i++) {
2221			if (q->tins[i].decaying_flow_count) {
2222				ktime_t next = \
2223					ktime_add_ns(now,
2224						     q->tins[i].cparams.target);
2225
2226				qdisc_watchdog_schedule_ns(&q->watchdog,
2227							   ktime_to_ns(next));
2228				break;
2229			}
2230		}
2231	}
2232
2233	if (q->overflow_timeout)
2234		q->overflow_timeout--;
2235
2236	return skb;
2237}
2238
2239static void cake_reset(struct Qdisc *sch)
2240{
2241	struct cake_sched_data *q = qdisc_priv(sch);
2242	u32 c;
2243
2244	if (!q->tins)
2245		return;
2246
2247	for (c = 0; c < CAKE_MAX_TINS; c++)
2248		cake_clear_tin(sch, c);
2249}
2250
2251static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2252	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2253	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2254	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2255	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2256	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2257	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2258	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2259	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2260	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2261	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2262	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2263	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2264	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2265	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2266	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2267	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2268	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2269};
2270
2271static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2272			  u64 target_ns, u64 rtt_est_ns)
2273{
2274	/* convert byte-rate into time-per-byte
2275	 * so it will always unwedge in reasonable time.
2276	 */
2277	static const u64 MIN_RATE = 64;
2278	u32 byte_target = mtu;
2279	u64 byte_target_ns;
2280	u8  rate_shft = 0;
2281	u64 rate_ns = 0;
2282
2283	b->flow_quantum = 1514;
2284	if (rate) {
2285		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2286		rate_shft = 34;
2287		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2288		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2289		while (!!(rate_ns >> 34)) {
2290			rate_ns >>= 1;
2291			rate_shft--;
2292		}
2293	} /* else unlimited, ie. zero delay */
2294
2295	b->tin_rate_bps  = rate;
2296	b->tin_rate_ns   = rate_ns;
2297	b->tin_rate_shft = rate_shft;
2298
2299	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2300
2301	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2302	b->cparams.interval = max(rtt_est_ns +
2303				     b->cparams.target - target_ns,
2304				     b->cparams.target * 2);
2305	b->cparams.mtu_time = byte_target_ns;
2306	b->cparams.p_inc = 1 << 24; /* 1/256 */
2307	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2308}
2309
2310static int cake_config_besteffort(struct Qdisc *sch)
2311{
2312	struct cake_sched_data *q = qdisc_priv(sch);
2313	struct cake_tin_data *b = &q->tins[0];
2314	u32 mtu = psched_mtu(qdisc_dev(sch));
2315	u64 rate = q->rate_bps;
2316
2317	q->tin_cnt = 1;
2318
2319	q->tin_index = besteffort;
2320	q->tin_order = normal_order;
2321
2322	cake_set_rate(b, rate, mtu,
2323		      us_to_ns(q->target), us_to_ns(q->interval));
2324	b->tin_quantum = 65535;
2325
2326	return 0;
2327}
2328
2329static int cake_config_precedence(struct Qdisc *sch)
2330{
2331	/* convert high-level (user visible) parameters into internal format */
2332	struct cake_sched_data *q = qdisc_priv(sch);
2333	u32 mtu = psched_mtu(qdisc_dev(sch));
2334	u64 rate = q->rate_bps;
2335	u32 quantum = 256;
2336	u32 i;
2337
2338	q->tin_cnt = 8;
2339	q->tin_index = precedence;
2340	q->tin_order = normal_order;
2341
2342	for (i = 0; i < q->tin_cnt; i++) {
2343		struct cake_tin_data *b = &q->tins[i];
2344
2345		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2346			      us_to_ns(q->interval));
2347
2348		b->tin_quantum = max_t(u16, 1U, quantum);
2349
2350		/* calculate next class's parameters */
2351		rate  *= 7;
2352		rate >>= 3;
2353
2354		quantum  *= 7;
2355		quantum >>= 3;
2356	}
2357
2358	return 0;
2359}
2360
2361/*	List of known Diffserv codepoints:
2362 *
2363 *	Default Forwarding (DF/CS0) - Best Effort
2364 *	Max Throughput (TOS2)
2365 *	Min Delay (TOS4)
2366 *	LLT "La" (TOS5)
2367 *	Assured Forwarding 1 (AF1x) - x3
2368 *	Assured Forwarding 2 (AF2x) - x3
2369 *	Assured Forwarding 3 (AF3x) - x3
2370 *	Assured Forwarding 4 (AF4x) - x3
2371 *	Precedence Class 1 (CS1)
2372 *	Precedence Class 2 (CS2)
2373 *	Precedence Class 3 (CS3)
2374 *	Precedence Class 4 (CS4)
2375 *	Precedence Class 5 (CS5)
2376 *	Precedence Class 6 (CS6)
2377 *	Precedence Class 7 (CS7)
2378 *	Voice Admit (VA)
2379 *	Expedited Forwarding (EF)
2380 *	Lower Effort (LE)
2381 *
2382 *	Total 26 codepoints.
2383 */
2384
2385/*	List of traffic classes in RFC 4594, updated by RFC 8622:
2386 *		(roughly descending order of contended priority)
2387 *		(roughly ascending order of uncontended throughput)
2388 *
2389 *	Network Control (CS6,CS7)      - routing traffic
2390 *	Telephony (EF,VA)         - aka. VoIP streams
2391 *	Signalling (CS5)               - VoIP setup
2392 *	Multimedia Conferencing (AF4x) - aka. video calls
2393 *	Realtime Interactive (CS4)     - eg. games
2394 *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2395 *	Broadcast Video (CS3)
2396 *	Low-Latency Data (AF2x,TOS4)      - eg. database
2397 *	Ops, Admin, Management (CS2)      - eg. ssh
2398 *	Standard Service (DF & unrecognised codepoints)
2399 *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2400 *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2401 *
2402 *	Total 12 traffic classes.
2403 */
2404
2405static int cake_config_diffserv8(struct Qdisc *sch)
2406{
2407/*	Pruned list of traffic classes for typical applications:
2408 *
2409 *		Network Control          (CS6, CS7)
2410 *		Minimum Latency          (EF, VA, CS5, CS4)
2411 *		Interactive Shell        (CS2)
2412 *		Low Latency Transactions (AF2x, TOS4)
2413 *		Video Streaming          (AF4x, AF3x, CS3)
2414 *		Bog Standard             (DF etc.)
2415 *		High Throughput          (AF1x, TOS2, CS1)
2416 *		Background Traffic       (LE)
2417 *
2418 *		Total 8 traffic classes.
2419 */
2420
2421	struct cake_sched_data *q = qdisc_priv(sch);
2422	u32 mtu = psched_mtu(qdisc_dev(sch));
2423	u64 rate = q->rate_bps;
2424	u32 quantum = 256;
2425	u32 i;
2426
2427	q->tin_cnt = 8;
2428
2429	/* codepoint to class mapping */
2430	q->tin_index = diffserv8;
2431	q->tin_order = normal_order;
2432
2433	/* class characteristics */
2434	for (i = 0; i < q->tin_cnt; i++) {
2435		struct cake_tin_data *b = &q->tins[i];
2436
2437		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2438			      us_to_ns(q->interval));
2439
2440		b->tin_quantum = max_t(u16, 1U, quantum);
2441
2442		/* calculate next class's parameters */
2443		rate  *= 7;
2444		rate >>= 3;
2445
2446		quantum  *= 7;
2447		quantum >>= 3;
2448	}
2449
2450	return 0;
2451}
2452
2453static int cake_config_diffserv4(struct Qdisc *sch)
2454{
2455/*  Further pruned list of traffic classes for four-class system:
2456 *
2457 *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2458 *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2459 *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2460 *	    Background Traffic (LE, CS1)
2461 *
2462 *		Total 4 traffic classes.
2463 */
2464
2465	struct cake_sched_data *q = qdisc_priv(sch);
2466	u32 mtu = psched_mtu(qdisc_dev(sch));
2467	u64 rate = q->rate_bps;
2468	u32 quantum = 1024;
2469
2470	q->tin_cnt = 4;
2471
2472	/* codepoint to class mapping */
2473	q->tin_index = diffserv4;
2474	q->tin_order = bulk_order;
2475
2476	/* class characteristics */
2477	cake_set_rate(&q->tins[0], rate, mtu,
2478		      us_to_ns(q->target), us_to_ns(q->interval));
2479	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2480		      us_to_ns(q->target), us_to_ns(q->interval));
2481	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2482		      us_to_ns(q->target), us_to_ns(q->interval));
2483	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2484		      us_to_ns(q->target), us_to_ns(q->interval));
2485
2486	/* bandwidth-sharing weights */
2487	q->tins[0].tin_quantum = quantum;
2488	q->tins[1].tin_quantum = quantum >> 4;
2489	q->tins[2].tin_quantum = quantum >> 1;
2490	q->tins[3].tin_quantum = quantum >> 2;
2491
2492	return 0;
2493}
2494
2495static int cake_config_diffserv3(struct Qdisc *sch)
2496{
2497/*  Simplified Diffserv structure with 3 tins.
2498 *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2499 *		Best Effort
2500 *		Low Priority		(LE, CS1)
2501 */
2502	struct cake_sched_data *q = qdisc_priv(sch);
2503	u32 mtu = psched_mtu(qdisc_dev(sch));
2504	u64 rate = q->rate_bps;
2505	u32 quantum = 1024;
2506
2507	q->tin_cnt = 3;
2508
2509	/* codepoint to class mapping */
2510	q->tin_index = diffserv3;
2511	q->tin_order = bulk_order;
2512
2513	/* class characteristics */
2514	cake_set_rate(&q->tins[0], rate, mtu,
2515		      us_to_ns(q->target), us_to_ns(q->interval));
2516	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2517		      us_to_ns(q->target), us_to_ns(q->interval));
2518	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2519		      us_to_ns(q->target), us_to_ns(q->interval));
2520
2521	/* bandwidth-sharing weights */
2522	q->tins[0].tin_quantum = quantum;
2523	q->tins[1].tin_quantum = quantum >> 4;
2524	q->tins[2].tin_quantum = quantum >> 2;
2525
2526	return 0;
2527}
2528
2529static void cake_reconfigure(struct Qdisc *sch)
2530{
2531	struct cake_sched_data *q = qdisc_priv(sch);
2532	int c, ft;
2533
2534	switch (q->tin_mode) {
2535	case CAKE_DIFFSERV_BESTEFFORT:
2536		ft = cake_config_besteffort(sch);
2537		break;
2538
2539	case CAKE_DIFFSERV_PRECEDENCE:
2540		ft = cake_config_precedence(sch);
2541		break;
2542
2543	case CAKE_DIFFSERV_DIFFSERV8:
2544		ft = cake_config_diffserv8(sch);
2545		break;
2546
2547	case CAKE_DIFFSERV_DIFFSERV4:
2548		ft = cake_config_diffserv4(sch);
2549		break;
2550
2551	case CAKE_DIFFSERV_DIFFSERV3:
2552	default:
2553		ft = cake_config_diffserv3(sch);
2554		break;
2555	}
2556
2557	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2558		cake_clear_tin(sch, c);
2559		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2560	}
2561
2562	q->rate_ns   = q->tins[ft].tin_rate_ns;
2563	q->rate_shft = q->tins[ft].tin_rate_shft;
2564
2565	if (q->buffer_config_limit) {
2566		q->buffer_limit = q->buffer_config_limit;
2567	} else if (q->rate_bps) {
2568		u64 t = q->rate_bps * q->interval;
2569
2570		do_div(t, USEC_PER_SEC / 4);
2571		q->buffer_limit = max_t(u32, t, 4U << 20);
2572	} else {
2573		q->buffer_limit = ~0;
2574	}
2575
2576	sch->flags &= ~TCQ_F_CAN_BYPASS;
2577
2578	q->buffer_limit = min(q->buffer_limit,
2579			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2580				  q->buffer_config_limit));
2581}
2582
2583static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2584		       struct netlink_ext_ack *extack)
2585{
2586	struct cake_sched_data *q = qdisc_priv(sch);
2587	struct nlattr *tb[TCA_CAKE_MAX + 1];
2588	u16 rate_flags;
2589	u8 flow_mode;
2590	int err;
2591
2592	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2593					  extack);
2594	if (err < 0)
2595		return err;
2596
2597	flow_mode = q->flow_mode;
2598	if (tb[TCA_CAKE_NAT]) {
2599#if IS_ENABLED(CONFIG_NF_CONNTRACK)
2600		flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2601		flow_mode |= CAKE_FLOW_NAT_FLAG *
2602			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2603#else
2604		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2605				    "No conntrack support in kernel");
2606		return -EOPNOTSUPP;
2607#endif
2608	}
2609
2610	if (tb[TCA_CAKE_BASE_RATE64])
2611		WRITE_ONCE(q->rate_bps,
2612			   nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2613
2614	if (tb[TCA_CAKE_DIFFSERV_MODE])
2615		WRITE_ONCE(q->tin_mode,
2616			   nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2617
2618	rate_flags = q->rate_flags;
2619	if (tb[TCA_CAKE_WASH]) {
2620		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2621			rate_flags |= CAKE_FLAG_WASH;
2622		else
2623			rate_flags &= ~CAKE_FLAG_WASH;
2624	}
2625
2626	if (tb[TCA_CAKE_FLOW_MODE])
2627		flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2628				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2629					CAKE_FLOW_MASK));
2630
2631	if (tb[TCA_CAKE_ATM])
2632		WRITE_ONCE(q->atm_mode,
2633			   nla_get_u32(tb[TCA_CAKE_ATM]));
2634
2635	if (tb[TCA_CAKE_OVERHEAD]) {
2636		WRITE_ONCE(q->rate_overhead,
2637			   nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2638		rate_flags |= CAKE_FLAG_OVERHEAD;
2639
2640		q->max_netlen = 0;
2641		q->max_adjlen = 0;
2642		q->min_netlen = ~0;
2643		q->min_adjlen = ~0;
2644	}
2645
2646	if (tb[TCA_CAKE_RAW]) {
2647		rate_flags &= ~CAKE_FLAG_OVERHEAD;
2648
2649		q->max_netlen = 0;
2650		q->max_adjlen = 0;
2651		q->min_netlen = ~0;
2652		q->min_adjlen = ~0;
2653	}
2654
2655	if (tb[TCA_CAKE_MPU])
2656		WRITE_ONCE(q->rate_mpu,
2657			   nla_get_u32(tb[TCA_CAKE_MPU]));
2658
2659	if (tb[TCA_CAKE_RTT]) {
2660		u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2661
2662		WRITE_ONCE(q->interval, max(interval, 1U));
2663	}
2664
2665	if (tb[TCA_CAKE_TARGET]) {
2666		u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2667
2668		WRITE_ONCE(q->target, max(target, 1U));
2669	}
2670
2671	if (tb[TCA_CAKE_AUTORATE]) {
2672		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2673			rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2674		else
2675			rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2676	}
2677
2678	if (tb[TCA_CAKE_INGRESS]) {
2679		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2680			rate_flags |= CAKE_FLAG_INGRESS;
2681		else
2682			rate_flags &= ~CAKE_FLAG_INGRESS;
2683	}
2684
2685	if (tb[TCA_CAKE_ACK_FILTER])
2686		WRITE_ONCE(q->ack_filter,
2687			   nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2688
2689	if (tb[TCA_CAKE_MEMORY])
2690		WRITE_ONCE(q->buffer_config_limit,
2691			   nla_get_u32(tb[TCA_CAKE_MEMORY]));
2692
2693	if (tb[TCA_CAKE_SPLIT_GSO]) {
2694		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2695			rate_flags |= CAKE_FLAG_SPLIT_GSO;
2696		else
2697			rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2698	}
2699
2700	if (tb[TCA_CAKE_FWMARK]) {
2701		WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2702		WRITE_ONCE(q->fwmark_shft,
2703			   q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2704	}
2705
2706	WRITE_ONCE(q->rate_flags, rate_flags);
2707	WRITE_ONCE(q->flow_mode, flow_mode);
2708	if (q->tins) {
2709		sch_tree_lock(sch);
2710		cake_reconfigure(sch);
2711		sch_tree_unlock(sch);
2712	}
2713
2714	return 0;
2715}
2716
2717static void cake_destroy(struct Qdisc *sch)
2718{
2719	struct cake_sched_data *q = qdisc_priv(sch);
2720
2721	qdisc_watchdog_cancel(&q->watchdog);
2722	tcf_block_put(q->block);
2723	kvfree(q->tins);
2724}
2725
2726static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2727		     struct netlink_ext_ack *extack)
2728{
2729	struct cake_sched_data *q = qdisc_priv(sch);
2730	int i, j, err;
2731
2732	sch->limit = 10240;
2733	sch->flags |= TCQ_F_DEQUEUE_DROPS;
2734
2735	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2736	q->flow_mode  = CAKE_FLOW_TRIPLE;
2737
2738	q->rate_bps = 0; /* unlimited by default */
2739
2740	q->interval = 100000; /* 100ms default */
2741	q->target   =   5000; /* 5ms: codel RFC argues
2742			       * for 5 to 10% of interval
2743			       */
2744	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2745	q->cur_tin = 0;
2746	q->cur_flow  = 0;
2747
2748	qdisc_watchdog_init(&q->watchdog, sch);
2749
2750	if (opt) {
2751		err = cake_change(sch, opt, extack);
2752
2753		if (err)
2754			return err;
2755	}
2756
2757	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2758	if (err)
2759		return err;
2760
2761	quantum_div[0] = ~0;
2762	for (i = 1; i <= CAKE_QUEUES; i++)
2763		quantum_div[i] = 65535 / i;
2764
2765	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2766			   GFP_KERNEL);
2767	if (!q->tins)
2768		return -ENOMEM;
2769
2770	for (i = 0; i < CAKE_MAX_TINS; i++) {
2771		struct cake_tin_data *b = q->tins + i;
2772
2773		INIT_LIST_HEAD(&b->new_flows);
2774		INIT_LIST_HEAD(&b->old_flows);
2775		INIT_LIST_HEAD(&b->decaying_flows);
2776		b->sparse_flow_count = 0;
2777		b->bulk_flow_count = 0;
2778		b->decaying_flow_count = 0;
2779
2780		for (j = 0; j < CAKE_QUEUES; j++) {
2781			struct cake_flow *flow = b->flows + j;
2782			u32 k = j * CAKE_MAX_TINS + i;
2783
2784			INIT_LIST_HEAD(&flow->flowchain);
2785			cobalt_vars_init(&flow->cvars);
2786
2787			q->overflow_heap[k].t = i;
2788			q->overflow_heap[k].b = j;
2789			b->overflow_idx[j] = k;
2790		}
2791	}
2792
2793	cake_reconfigure(sch);
2794	q->avg_peak_bandwidth = q->rate_bps;
2795	q->min_netlen = ~0;
2796	q->min_adjlen = ~0;
2797	return 0;
2798}
2799
2800static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2801{
2802	struct cake_sched_data *q = qdisc_priv(sch);
2803	struct nlattr *opts;
2804	u16 rate_flags;
2805	u8 flow_mode;
2806
2807	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2808	if (!opts)
2809		goto nla_put_failure;
2810
2811	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2812			      READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2813		goto nla_put_failure;
2814
2815	flow_mode = READ_ONCE(q->flow_mode);
2816	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2817		goto nla_put_failure;
2818
2819	if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2820		goto nla_put_failure;
2821
2822	if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2823		goto nla_put_failure;
2824
2825	if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2826			READ_ONCE(q->buffer_config_limit)))
2827		goto nla_put_failure;
2828
2829	rate_flags = READ_ONCE(q->rate_flags);
2830	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2831			!!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2832		goto nla_put_failure;
2833
2834	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2835			!!(rate_flags & CAKE_FLAG_INGRESS)))
2836		goto nla_put_failure;
2837
2838	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2839		goto nla_put_failure;
2840
2841	if (nla_put_u32(skb, TCA_CAKE_NAT,
2842			!!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2843		goto nla_put_failure;
2844
2845	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2846		goto nla_put_failure;
2847
2848	if (nla_put_u32(skb, TCA_CAKE_WASH,
2849			!!(rate_flags & CAKE_FLAG_WASH)))
2850		goto nla_put_failure;
2851
2852	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2853		goto nla_put_failure;
2854
2855	if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2856		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2857			goto nla_put_failure;
2858
2859	if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2860		goto nla_put_failure;
2861
2862	if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2863		goto nla_put_failure;
2864
2865	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2866			!!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2867		goto nla_put_failure;
2868
2869	if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2870		goto nla_put_failure;
2871
2872	return nla_nest_end(skb, opts);
2873
2874nla_put_failure:
2875	return -1;
2876}
2877
2878static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2879{
2880	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2881	struct cake_sched_data *q = qdisc_priv(sch);
2882	struct nlattr *tstats, *ts;
2883	int i;
2884
2885	if (!stats)
2886		return -1;
2887
2888#define PUT_STAT_U32(attr, data) do {				       \
2889		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2890			goto nla_put_failure;			       \
2891	} while (0)
2892#define PUT_STAT_U64(attr, data) do {				       \
2893		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2894					data, TCA_CAKE_STATS_PAD)) \
2895			goto nla_put_failure;			       \
2896	} while (0)
2897
2898	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2899	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2900	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2901	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2902	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2903	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2904	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2905	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2906
2907#undef PUT_STAT_U32
2908#undef PUT_STAT_U64
2909
2910	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2911	if (!tstats)
2912		goto nla_put_failure;
2913
2914#define PUT_TSTAT_U32(attr, data) do {					\
2915		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2916			goto nla_put_failure;				\
2917	} while (0)
2918#define PUT_TSTAT_U64(attr, data) do {					\
2919		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2920					data, TCA_CAKE_TIN_STATS_PAD))	\
2921			goto nla_put_failure;				\
2922	} while (0)
2923
2924	for (i = 0; i < q->tin_cnt; i++) {
2925		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2926
2927		ts = nla_nest_start_noflag(d->skb, i + 1);
2928		if (!ts)
2929			goto nla_put_failure;
2930
2931		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2932		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2933		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2934
2935		PUT_TSTAT_U32(TARGET_US,
2936			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2937		PUT_TSTAT_U32(INTERVAL_US,
2938			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2939
2940		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2941		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2942		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2943		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2944
2945		PUT_TSTAT_U32(PEAK_DELAY_US,
2946			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2947		PUT_TSTAT_U32(AVG_DELAY_US,
2948			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2949		PUT_TSTAT_U32(BASE_DELAY_US,
2950			      ktime_to_us(ns_to_ktime(b->base_delay)));
2951
2952		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2953		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2954		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2955
2956		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2957					    b->decaying_flow_count);
2958		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2959		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2960		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2961
2962		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2963		nla_nest_end(d->skb, ts);
2964	}
2965
2966#undef PUT_TSTAT_U32
2967#undef PUT_TSTAT_U64
2968
2969	nla_nest_end(d->skb, tstats);
2970	return nla_nest_end(d->skb, stats);
2971
2972nla_put_failure:
2973	nla_nest_cancel(d->skb, stats);
2974	return -1;
2975}
2976
2977static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2978{
2979	return NULL;
2980}
2981
2982static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2983{
2984	return 0;
2985}
2986
2987static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2988			       u32 classid)
2989{
2990	return 0;
2991}
2992
2993static void cake_unbind(struct Qdisc *q, unsigned long cl)
2994{
2995}
2996
2997static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2998					struct netlink_ext_ack *extack)
2999{
3000	struct cake_sched_data *q = qdisc_priv(sch);
3001
3002	if (cl)
3003		return NULL;
3004	return q->block;
3005}
3006
3007static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
3008			   struct sk_buff *skb, struct tcmsg *tcm)
3009{
3010	tcm->tcm_handle |= TC_H_MIN(cl);
3011	return 0;
3012}
3013
3014static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
3015				 struct gnet_dump *d)
3016{
3017	struct cake_sched_data *q = qdisc_priv(sch);
3018	const struct cake_flow *flow = NULL;
3019	struct gnet_stats_queue qs = { 0 };
3020	struct nlattr *stats;
3021	u32 idx = cl - 1;
3022
3023	if (idx < CAKE_QUEUES * q->tin_cnt) {
3024		const struct cake_tin_data *b = \
3025			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
3026		const struct sk_buff *skb;
3027
3028		flow = &b->flows[idx % CAKE_QUEUES];
3029
3030		if (flow->head) {
3031			sch_tree_lock(sch);
3032			skb = flow->head;
3033			while (skb) {
3034				qs.qlen++;
3035				skb = skb->next;
3036			}
3037			sch_tree_unlock(sch);
3038		}
3039		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3040		qs.drops = flow->dropped;
3041	}
3042	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3043		return -1;
3044	if (flow) {
3045		ktime_t now = ktime_get();
3046
3047		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3048		if (!stats)
3049			return -1;
3050
3051#define PUT_STAT_U32(attr, data) do {				       \
3052		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3053			goto nla_put_failure;			       \
3054	} while (0)
3055#define PUT_STAT_S32(attr, data) do {				       \
3056		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3057			goto nla_put_failure;			       \
3058	} while (0)
3059
3060		PUT_STAT_S32(DEFICIT, flow->deficit);
3061		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3062		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3063		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3064		if (flow->cvars.p_drop) {
3065			PUT_STAT_S32(BLUE_TIMER_US,
3066				     ktime_to_us(
3067					     ktime_sub(now,
3068						       flow->cvars.blue_timer)));
3069		}
3070		if (flow->cvars.dropping) {
3071			PUT_STAT_S32(DROP_NEXT_US,
3072				     ktime_to_us(
3073					     ktime_sub(now,
3074						       flow->cvars.drop_next)));
3075		}
3076
3077		if (nla_nest_end(d->skb, stats) < 0)
3078			return -1;
3079	}
3080
3081	return 0;
3082
3083nla_put_failure:
3084	nla_nest_cancel(d->skb, stats);
3085	return -1;
3086}
3087
3088static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3089{
3090	struct cake_sched_data *q = qdisc_priv(sch);
3091	unsigned int i, j;
3092
3093	if (arg->stop)
3094		return;
3095
3096	for (i = 0; i < q->tin_cnt; i++) {
3097		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3098
3099		for (j = 0; j < CAKE_QUEUES; j++) {
3100			if (list_empty(&b->flows[j].flowchain)) {
3101				arg->count++;
3102				continue;
3103			}
3104			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3105						 arg))
3106				break;
3107		}
3108	}
3109}
3110
3111static const struct Qdisc_class_ops cake_class_ops = {
3112	.leaf		=	cake_leaf,
3113	.find		=	cake_find,
3114	.tcf_block	=	cake_tcf_block,
3115	.bind_tcf	=	cake_bind,
3116	.unbind_tcf	=	cake_unbind,
3117	.dump		=	cake_dump_class,
3118	.dump_stats	=	cake_dump_class_stats,
3119	.walk		=	cake_walk,
3120};
3121
3122static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3123	.cl_ops		=	&cake_class_ops,
3124	.id		=	"cake",
3125	.priv_size	=	sizeof(struct cake_sched_data),
3126	.enqueue	=	cake_enqueue,
3127	.dequeue	=	cake_dequeue,
3128	.peek		=	qdisc_peek_dequeued,
3129	.init		=	cake_init,
3130	.reset		=	cake_reset,
3131	.destroy	=	cake_destroy,
3132	.change		=	cake_change,
3133	.dump		=	cake_dump,
3134	.dump_stats	=	cake_dump_stats,
3135	.owner		=	THIS_MODULE,
3136};
3137MODULE_ALIAS_NET_SCH("cake");
3138
3139static int __init cake_module_init(void)
3140{
3141	return register_qdisc(&cake_qdisc_ops);
3142}
3143
3144static void __exit cake_module_exit(void)
3145{
3146	unregister_qdisc(&cake_qdisc_ops);
3147}
3148
3149module_init(cake_module_init)
3150module_exit(cake_module_exit)
3151MODULE_AUTHOR("Jonathan Morton");
3152MODULE_LICENSE("Dual BSD/GPL");
3153MODULE_DESCRIPTION("The CAKE shaper.");