Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

sched: Struct definition and parsing of dualpi2 qdisc

DualPI2 is the reference implementation of IETF RFC9332 DualQ Coupled
AQM (https://datatracker.ietf.org/doc/html/rfc9332) providing two
queues called low latency (L-queue) and classic (C-queue). By default,
it enqueues non-ECN and ECT(0) packets into the C-queue and ECT(1) and
CE packets into the low latency queue (L-queue), as per IETF RFC9332 spec.

This patch defines the dualpi2 Qdisc structure and its netlink parsing; the
following two patches add dumping and enqueue/dequeue support for DualPI2.

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Link: https://patch.msgid.link/20250722095915.24485-2-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Chia-Yu Chang and committed by
Jakub Kicinski
320d031a 1cdf3f2d

+644
+53
include/uapi/linux/pkt_sched.h
··· 1211 1211 1212 1212 #define TCA_ETS_MAX (__TCA_ETS_MAX - 1) 1213 1213 1214 + /* DUALPI2 */ 1215 + enum tc_dualpi2_drop_overload { 1216 + TC_DUALPI2_DROP_OVERLOAD_OVERFLOW = 0, 1217 + TC_DUALPI2_DROP_OVERLOAD_DROP = 1, 1218 + __TCA_DUALPI2_DROP_OVERLOAD_MAX, 1219 + }; 1220 + #define TCA_DUALPI2_DROP_OVERLOAD_MAX (__TCA_DUALPI2_DROP_OVERLOAD_MAX - 1) 1221 + 1222 + enum tc_dualpi2_drop_early { 1223 + TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE = 0, 1224 + TC_DUALPI2_DROP_EARLY_DROP_ENQUEUE = 1, 1225 + __TCA_DUALPI2_DROP_EARLY_MAX, 1226 + }; 1227 + #define TCA_DUALPI2_DROP_EARLY_MAX (__TCA_DUALPI2_DROP_EARLY_MAX - 1) 1228 + 1229 + enum tc_dualpi2_ecn_mask { 1230 + TC_DUALPI2_ECN_MASK_L4S_ECT = 1, 1231 + TC_DUALPI2_ECN_MASK_CLA_ECT = 2, 1232 + TC_DUALPI2_ECN_MASK_ANY_ECT = 3, 1233 + __TCA_DUALPI2_ECN_MASK_MAX, 1234 + }; 1235 + #define TCA_DUALPI2_ECN_MASK_MAX (__TCA_DUALPI2_ECN_MASK_MAX - 1) 1236 + 1237 + enum tc_dualpi2_split_gso { 1238 + TC_DUALPI2_SPLIT_GSO_NO_SPLIT_GSO = 0, 1239 + TC_DUALPI2_SPLIT_GSO_SPLIT_GSO = 1, 1240 + __TCA_DUALPI2_SPLIT_GSO_MAX, 1241 + }; 1242 + #define TCA_DUALPI2_SPLIT_GSO_MAX (__TCA_DUALPI2_SPLIT_GSO_MAX - 1) 1243 + 1244 + enum { 1245 + TCA_DUALPI2_UNSPEC, 1246 + TCA_DUALPI2_LIMIT, /* Packets */ 1247 + TCA_DUALPI2_MEMORY_LIMIT, /* Bytes */ 1248 + TCA_DUALPI2_TARGET, /* us */ 1249 + TCA_DUALPI2_TUPDATE, /* us */ 1250 + TCA_DUALPI2_ALPHA, /* Hz scaled up by 256 */ 1251 + TCA_DUALPI2_BETA, /* Hz scaled up by 256 */ 1252 + TCA_DUALPI2_STEP_THRESH_PKTS, /* Step threshold in packets */ 1253 + TCA_DUALPI2_STEP_THRESH_US, /* Step threshold in microseconds */ 1254 + TCA_DUALPI2_MIN_QLEN_STEP, /* Minimum qlen to apply STEP_THRESH */ 1255 + TCA_DUALPI2_COUPLING, /* Coupling factor between queues */ 1256 + TCA_DUALPI2_DROP_OVERLOAD, /* Whether to drop on overload */ 1257 + TCA_DUALPI2_DROP_EARLY, /* Whether to drop on enqueue */ 1258 + TCA_DUALPI2_C_PROTECTION, /* Percentage */ 1259 + TCA_DUALPI2_ECN_MASK, /* L4S queue classification mask */ 1260 + 
TCA_DUALPI2_SPLIT_GSO, /* Split GSO packets at enqueue */ 1261 + TCA_DUALPI2_PAD, 1262 + __TCA_DUALPI2_MAX 1263 + }; 1264 + 1265 + #define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1) 1266 + 1214 1267 #endif
+591
net/sched/sch_dualpi2.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause 2 + /* Copyright (C) 2024 Nokia 3 + * 4 + * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com> 5 + * Author: Olga Albisser <olga@albisser.org> 6 + * Author: Henrik Steen <henrist@henrist.net> 7 + * Author: Olivier Tilmans <olivier.tilmans@nokia.com> 8 + * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com> 9 + * 10 + * DualPI Improved with a Square (dualpi2): 11 + * - Supports congestion controls that comply with the Prague requirements 12 + * in RFC9331 (e.g. TCP-Prague) 13 + * - Supports coupled dual-queue with PI2 as defined in RFC9332 14 + * - Supports ECN L4S-identifier (IP.ECN==0b*1) 15 + * 16 + * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks, 17 + * they do not meet the 'Prague L4S Requirements' listed in RFC 9331 18 + * Section 4, so they can only be used with DualPI2 in a datacenter 19 + * context. 20 + * 21 + * References: 22 + * - RFC9332: https://datatracker.ietf.org/doc/html/rfc9332 23 + * - De Schepper, Koen, et al. "PI 2: A linearized AQM for both classic and 24 + * scalable TCP." in proc. ACM CoNEXT'16, 2016. 25 + */ 26 + 27 + #include <linux/errno.h> 28 + #include <linux/hrtimer.h> 29 + #include <linux/if_vlan.h> 30 + #include <linux/kernel.h> 31 + #include <linux/limits.h> 32 + #include <linux/module.h> 33 + #include <linux/skbuff.h> 34 + #include <linux/types.h> 35 + 36 + #include <net/gso.h> 37 + #include <net/inet_ecn.h> 38 + #include <net/pkt_cls.h> 39 + #include <net/pkt_sched.h> 40 + 41 + /* 32b enable to support flows with windows up to ~8.6 * 1e9 packets 42 + * i.e., twice the maximal snd_cwnd. 43 + * MAX_PROB must be consistent with the RNG in dualpi2_roll(). 44 + */ 45 + #define MAX_PROB U32_MAX 46 + 47 + /* alpha/beta values exchanged over netlink are in units of 256ns */ 48 + #define ALPHA_BETA_SHIFT 8 49 + 50 + /* Scaled values of alpha/beta must fit in 32b to avoid overflow in later 51 + * computations. 
Consequently (see and dualpi2_scale_alpha_beta()), their 52 + * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1 53 + * (~4MHz) as those are given in 1/256th. This enable to tune alpha/beta to 54 + * control flows whose maximal RTTs can be in usec up to few secs. 55 + */ 56 + #define ALPHA_BETA_MAX ((1U << 31) - 1) 57 + 58 + /* Internal alpha/beta are in units of 64ns. 59 + * This enables to use all alpha/beta values in the allowed range without loss 60 + * of precision due to rounding when scaling them internally, e.g., 61 + * scale_alpha_beta(1) will not round down to 0. 62 + */ 63 + #define ALPHA_BETA_GRANULARITY 6 64 + 65 + #define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY) 66 + 67 + /* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */ 68 + #define MAX_WC 100 69 + 70 + struct dualpi2_sched_data { 71 + struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */ 72 + struct Qdisc *sch; /* The Classic queue (C-queue) */ 73 + 74 + /* Registered tc filters */ 75 + struct tcf_proto __rcu *tcf_filters; 76 + struct tcf_block *tcf_block; 77 + 78 + /* PI2 parameters */ 79 + u64 pi2_target; /* Target delay in nanoseconds */ 80 + u32 pi2_tupdate; /* Timer frequency in nanoseconds */ 81 + u32 pi2_prob; /* Base PI probability */ 82 + u32 pi2_alpha; /* Gain factor for the integral rate response */ 83 + u32 pi2_beta; /* Gain factor for the proportional response */ 84 + struct hrtimer pi2_timer; /* prob update timer */ 85 + 86 + /* Step AQM (L-queue only) parameters */ 87 + u32 step_thresh; /* Step threshold */ 88 + bool step_in_packets; /* Step thresh in packets (1) or time (0) */ 89 + 90 + /* C-queue starvation protection */ 91 + s32 c_protection_credit; /* Credit (sign indicates which queue) */ 92 + s32 c_protection_init; /* Reset value of the credit */ 93 + u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */ 94 + u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */ 95 + 96 + /* General dualQ parameters 
*/ 97 + u32 memory_limit; /* Memory limit of both queues */ 98 + u8 coupling_factor;/* Coupling factor (k) between both queues */ 99 + u8 ecn_mask; /* Mask to match packets into L-queue */ 100 + u32 min_qlen_step; /* Minimum queue length to apply step thresh */ 101 + bool drop_early; /* Drop at enqueue (1) instead of dequeue (0) */ 102 + bool drop_overload; /* Drop (1) on overload, or overflow (0) */ 103 + bool split_gso; /* Split aggregated skb (1) or leave as is (0) */ 104 + 105 + /* Statistics */ 106 + u64 c_head_ts; /* Enqueue timestamp of the C-queue head */ 107 + u64 l_head_ts; /* Enqueue timestamp of the L-queue head */ 108 + u64 last_qdelay; /* Q delay val at the last probability update */ 109 + u32 packets_in_c; /* Enqueue packet counter of the C-queue */ 110 + u32 packets_in_l; /* Enqueue packet counter of the L-queue */ 111 + u32 maxq; /* Maximum queue size of the C-queue */ 112 + u32 ecn_mark; /* ECN mark pkt counter due to PI probability */ 113 + u32 step_marks; /* ECN mark pkt counter due to step AQM */ 114 + u32 memory_used; /* Memory used of both queues */ 115 + u32 max_memory_used;/* Maximum used memory */ 116 + }; 117 + 118 + static u32 dualpi2_scale_alpha_beta(u32 param) 119 + { 120 + u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING); 121 + 122 + do_div(tmp, NSEC_PER_SEC); 123 + return tmp; 124 + } 125 + 126 + static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q) 127 + { 128 + return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate); 129 + } 130 + 131 + static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q) 132 + { 133 + q->c_protection_credit = q->c_protection_init; 134 + } 135 + 136 + /* This computes the initial credit value and WRR weight for the L queue (wl) 137 + * from the weight of the C queue (wc). 138 + * If wl > wc, the scheduler will start with the L queue when reset. 
139 + */ 140 + static void dualpi2_calculate_c_protection(struct Qdisc *sch, 141 + struct dualpi2_sched_data *q, u32 wc) 142 + { 143 + q->c_protection_wc = wc; 144 + q->c_protection_wl = MAX_WC - wc; 145 + q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) * 146 + ((int)q->c_protection_wc - (int)q->c_protection_wl); 147 + dualpi2_reset_c_protection(q); 148 + } 149 + 150 + static s64 __scale_delta(u64 diff) 151 + { 152 + do_div(diff, 1 << ALPHA_BETA_GRANULARITY); 153 + return diff; 154 + } 155 + 156 + static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c, 157 + u64 *qdelay_l) 158 + { 159 + u64 now, qc, ql; 160 + 161 + now = ktime_get_ns(); 162 + qc = q->c_head_ts; 163 + ql = q->l_head_ts; 164 + 165 + *qdelay_c = qc ? now - qc : 0; 166 + *qdelay_l = ql ? now - ql : 0; 167 + } 168 + 169 + static u32 calculate_probability(struct Qdisc *sch) 170 + { 171 + struct dualpi2_sched_data *q = qdisc_priv(sch); 172 + u32 new_prob; 173 + u64 qdelay_c; 174 + u64 qdelay_l; 175 + u64 qdelay; 176 + s64 delta; 177 + 178 + get_queue_delays(q, &qdelay_c, &qdelay_l); 179 + qdelay = max(qdelay_l, qdelay_c); 180 + 181 + /* Alpha and beta take at most 32b, i.e, the delay difference would 182 + * overflow for queuing delay differences > ~4.2sec. 183 + */ 184 + delta = ((s64)qdelay - (s64)q->pi2_target) * q->pi2_alpha; 185 + delta += ((s64)qdelay - (s64)q->last_qdelay) * q->pi2_beta; 186 + q->last_qdelay = qdelay; 187 + 188 + /* Bound new_prob between 0 and MAX_PROB */ 189 + if (delta > 0) { 190 + new_prob = __scale_delta(delta) + q->pi2_prob; 191 + if (new_prob < q->pi2_prob) 192 + new_prob = MAX_PROB; 193 + } else { 194 + new_prob = q->pi2_prob - __scale_delta(~delta + 1); 195 + if (new_prob > q->pi2_prob) 196 + new_prob = 0; 197 + } 198 + 199 + /* If we do not drop on overload, ensure we cap the L4S probability to 200 + * 100% to keep window fairness when overflowing. 
201 + */ 202 + if (!q->drop_overload) 203 + return min_t(u32, new_prob, MAX_PROB / q->coupling_factor); 204 + return new_prob; 205 + } 206 + 207 + static u32 get_memory_limit(struct Qdisc *sch, u32 limit) 208 + { 209 + /* Apply rule of thumb, i.e., doubling the packet length, 210 + * to further include per packet overhead in memory_limit. 211 + */ 212 + u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch))); 213 + 214 + if (upper_32_bits(memlim)) 215 + return U32_MAX; 216 + else 217 + return lower_32_bits(memlim); 218 + } 219 + 220 + static u32 convert_us_to_nsec(u32 us) 221 + { 222 + u64 ns = mul_u32_u32(us, NSEC_PER_USEC); 223 + 224 + if (upper_32_bits(ns)) 225 + return U32_MAX; 226 + 227 + return lower_32_bits(ns); 228 + } 229 + 230 + static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer) 231 + { 232 + struct dualpi2_sched_data *q = timer_container_of(q, timer, pi2_timer); 233 + struct Qdisc *sch = q->sch; 234 + spinlock_t *root_lock; /* to lock qdisc for probability calculations */ 235 + 236 + rcu_read_lock(); 237 + root_lock = qdisc_lock(qdisc_root_sleeping(sch)); 238 + spin_lock(root_lock); 239 + 240 + q->pi2_prob = calculate_probability(sch); 241 + hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q)); 242 + 243 + spin_unlock(root_lock); 244 + rcu_read_unlock(); 245 + return HRTIMER_RESTART; 246 + } 247 + 248 + static struct netlink_range_validation dualpi2_alpha_beta_range = { 249 + .min = 1, 250 + .max = ALPHA_BETA_MAX, 251 + }; 252 + 253 + static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = { 254 + [TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), 255 + [TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1), 256 + [TCA_DUALPI2_TARGET] = { .type = NLA_U32 }, 257 + [TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1), 258 + [TCA_DUALPI2_ALPHA] = 259 + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), 260 + [TCA_DUALPI2_BETA] = 261 + NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range), 262 + 
[TCA_DUALPI2_STEP_THRESH_PKTS] = { .type = NLA_U32 }, 263 + [TCA_DUALPI2_STEP_THRESH_US] = { .type = NLA_U32 }, 264 + [TCA_DUALPI2_MIN_QLEN_STEP] = { .type = NLA_U32 }, 265 + [TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1), 266 + [TCA_DUALPI2_DROP_OVERLOAD] = 267 + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_OVERLOAD_MAX), 268 + [TCA_DUALPI2_DROP_EARLY] = 269 + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_DROP_EARLY_MAX), 270 + [TCA_DUALPI2_C_PROTECTION] = 271 + NLA_POLICY_RANGE(NLA_U8, 0, MAX_WC), 272 + [TCA_DUALPI2_ECN_MASK] = 273 + NLA_POLICY_RANGE(NLA_U8, TC_DUALPI2_ECN_MASK_L4S_ECT, 274 + TCA_DUALPI2_ECN_MASK_MAX), 275 + [TCA_DUALPI2_SPLIT_GSO] = 276 + NLA_POLICY_MAX(NLA_U8, TCA_DUALPI2_SPLIT_GSO_MAX), 277 + }; 278 + 279 + static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt, 280 + struct netlink_ext_ack *extack) 281 + { 282 + struct nlattr *tb[TCA_DUALPI2_MAX + 1]; 283 + struct dualpi2_sched_data *q; 284 + int old_backlog; 285 + int old_qlen; 286 + int err; 287 + 288 + if (!opt || !nla_len(opt)) { 289 + NL_SET_ERR_MSG_MOD(extack, "Dualpi2 options are required"); 290 + return -EINVAL; 291 + } 292 + err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy, 293 + extack); 294 + if (err < 0) 295 + return err; 296 + if (tb[TCA_DUALPI2_STEP_THRESH_PKTS] && tb[TCA_DUALPI2_STEP_THRESH_US]) { 297 + NL_SET_ERR_MSG_MOD(extack, "multiple step thresh attributes"); 298 + return -EINVAL; 299 + } 300 + 301 + q = qdisc_priv(sch); 302 + sch_tree_lock(sch); 303 + 304 + if (tb[TCA_DUALPI2_LIMIT]) { 305 + u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]); 306 + 307 + sch->limit = limit; 308 + q->memory_limit = get_memory_limit(sch, limit); 309 + } 310 + 311 + if (tb[TCA_DUALPI2_MEMORY_LIMIT]) 312 + q->memory_limit = nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]); 313 + 314 + if (tb[TCA_DUALPI2_TARGET]) { 315 + u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]); 316 + 317 + q->pi2_target = target * NSEC_PER_USEC; 318 + } 319 + 320 + if (tb[TCA_DUALPI2_TUPDATE]) { 321 + u64 tupdate = 
nla_get_u32(tb[TCA_DUALPI2_TUPDATE]); 322 + 323 + q->pi2_tupdate = convert_us_to_nsec(tupdate); 324 + } 325 + 326 + if (tb[TCA_DUALPI2_ALPHA]) { 327 + u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]); 328 + 329 + q->pi2_alpha = dualpi2_scale_alpha_beta(alpha); 330 + } 331 + 332 + if (tb[TCA_DUALPI2_BETA]) { 333 + u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]); 334 + 335 + q->pi2_beta = dualpi2_scale_alpha_beta(beta); 336 + } 337 + 338 + if (tb[TCA_DUALPI2_STEP_THRESH_PKTS]) { 339 + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_PKTS]); 340 + 341 + q->step_in_packets = true; 342 + q->step_thresh = step_th; 343 + } else if (tb[TCA_DUALPI2_STEP_THRESH_US]) { 344 + u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH_US]); 345 + 346 + q->step_in_packets = false; 347 + q->step_thresh = convert_us_to_nsec(step_th); 348 + } 349 + 350 + if (tb[TCA_DUALPI2_MIN_QLEN_STEP]) 351 + q->min_qlen_step = nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]); 352 + 353 + if (tb[TCA_DUALPI2_COUPLING]) { 354 + u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]); 355 + 356 + q->coupling_factor = coupling; 357 + } 358 + 359 + if (tb[TCA_DUALPI2_DROP_OVERLOAD]) { 360 + u8 drop_overload = nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]); 361 + 362 + q->drop_overload = (bool)drop_overload; 363 + } 364 + 365 + if (tb[TCA_DUALPI2_DROP_EARLY]) { 366 + u8 drop_early = nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]); 367 + 368 + q->drop_early = (bool)drop_early; 369 + } 370 + 371 + if (tb[TCA_DUALPI2_C_PROTECTION]) { 372 + u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]); 373 + 374 + dualpi2_calculate_c_protection(sch, q, wc); 375 + } 376 + 377 + if (tb[TCA_DUALPI2_ECN_MASK]) { 378 + u8 ecn_mask = nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]); 379 + 380 + q->ecn_mask = ecn_mask; 381 + } 382 + 383 + if (tb[TCA_DUALPI2_SPLIT_GSO]) { 384 + u8 split_gso = nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]); 385 + 386 + q->split_gso = (bool)split_gso; 387 + } 388 + 389 + old_qlen = qdisc_qlen(sch); 390 + old_backlog = sch->qstats.backlog; 391 + 
while (qdisc_qlen(sch) > sch->limit || 392 + q->memory_used > q->memory_limit) { 393 + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); 394 + 395 + q->memory_used -= skb->truesize; 396 + qdisc_qstats_backlog_dec(sch, skb); 397 + rtnl_qdisc_drop(skb, sch); 398 + } 399 + qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch), 400 + old_backlog - sch->qstats.backlog); 401 + 402 + sch_tree_unlock(sch); 403 + return 0; 404 + } 405 + 406 + /* Default alpha/beta values give a 10dB stability margin with max_rtt=100ms. */ 407 + static void dualpi2_reset_default(struct Qdisc *sch) 408 + { 409 + struct dualpi2_sched_data *q = qdisc_priv(sch); 410 + 411 + q->sch->limit = 10000; /* Max 125ms at 1Gbps */ 412 + q->memory_limit = get_memory_limit(sch, q->sch->limit); 413 + 414 + q->pi2_target = 15 * NSEC_PER_MSEC; 415 + q->pi2_tupdate = 16 * NSEC_PER_MSEC; 416 + q->pi2_alpha = dualpi2_scale_alpha_beta(41); /* ~0.16 Hz * 256 */ 417 + q->pi2_beta = dualpi2_scale_alpha_beta(819); /* ~3.20 Hz * 256 */ 418 + 419 + q->step_thresh = 1 * NSEC_PER_MSEC; 420 + q->step_in_packets = false; 421 + 422 + dualpi2_calculate_c_protection(q->sch, q, 10); /* wc=10%, wl=90% */ 423 + 424 + q->ecn_mask = TC_DUALPI2_ECN_MASK_L4S_ECT; /* INET_ECN_ECT_1 */ 425 + q->min_qlen_step = 0; /* Always apply step mark in L-queue */ 426 + q->coupling_factor = 2; /* window fairness for equal RTTs */ 427 + q->drop_overload = TC_DUALPI2_DROP_OVERLOAD_DROP; /* Drop overload */ 428 + q->drop_early = TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE; /* Drop dequeue */ 429 + q->split_gso = TC_DUALPI2_SPLIT_GSO_SPLIT_GSO; /* Split GSO */ 430 + } 431 + 432 + static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, 433 + struct netlink_ext_ack *extack) 434 + { 435 + struct dualpi2_sched_data *q = qdisc_priv(sch); 436 + int err; 437 + 438 + q->l_queue = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, 439 + TC_H_MAKE(sch->handle, 1), extack); 440 + if (!q->l_queue) 441 + return -ENOMEM; 442 + 443 + err = 
tcf_block_get(&q->tcf_block, &q->tcf_filters, sch, extack); 444 + if (err) 445 + return err; 446 + 447 + q->sch = sch; 448 + dualpi2_reset_default(sch); 449 + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); 450 + 451 + if (opt && nla_len(opt)) { 452 + err = dualpi2_change(sch, opt, extack); 453 + 454 + if (err) 455 + return err; 456 + } 457 + 458 + hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), 459 + HRTIMER_MODE_ABS_PINNED); 460 + return 0; 461 + } 462 + 463 + /* Reset both L-queue and C-queue, internal packet counters, PI probability, 464 + * C-queue protection credit, and timestamps, while preserving current 465 + * configuration of DUALPI2. 466 + */ 467 + static void dualpi2_reset(struct Qdisc *sch) 468 + { 469 + struct dualpi2_sched_data *q = qdisc_priv(sch); 470 + 471 + qdisc_reset_queue(sch); 472 + qdisc_reset_queue(q->l_queue); 473 + q->c_head_ts = 0; 474 + q->l_head_ts = 0; 475 + q->pi2_prob = 0; 476 + q->packets_in_c = 0; 477 + q->packets_in_l = 0; 478 + q->maxq = 0; 479 + q->ecn_mark = 0; 480 + q->step_marks = 0; 481 + q->memory_used = 0; 482 + q->max_memory_used = 0; 483 + dualpi2_reset_c_protection(q); 484 + } 485 + 486 + static void dualpi2_destroy(struct Qdisc *sch) 487 + { 488 + struct dualpi2_sched_data *q = qdisc_priv(sch); 489 + 490 + q->pi2_tupdate = 0; 491 + hrtimer_cancel(&q->pi2_timer); 492 + if (q->l_queue) 493 + qdisc_put(q->l_queue); 494 + tcf_block_put(q->tcf_block); 495 + } 496 + 497 + static struct Qdisc *dualpi2_leaf(struct Qdisc *sch, unsigned long arg) 498 + { 499 + return NULL; 500 + } 501 + 502 + static unsigned long dualpi2_find(struct Qdisc *sch, u32 classid) 503 + { 504 + return 0; 505 + } 506 + 507 + static unsigned long dualpi2_bind(struct Qdisc *sch, unsigned long parent, 508 + u32 classid) 509 + { 510 + return 0; 511 + } 512 + 513 + static void dualpi2_unbind(struct Qdisc *q, unsigned long cl) 514 + { 515 + } 516 + 517 + static struct tcf_block *dualpi2_tcf_block(struct Qdisc *sch, 
unsigned long cl, 518 + struct netlink_ext_ack *extack) 519 + { 520 + struct dualpi2_sched_data *q = qdisc_priv(sch); 521 + 522 + if (cl) 523 + return NULL; 524 + return q->tcf_block; 525 + } 526 + 527 + static void dualpi2_walk(struct Qdisc *sch, struct qdisc_walker *arg) 528 + { 529 + unsigned int i; 530 + 531 + if (arg->stop) 532 + return; 533 + 534 + /* We statically define only 2 queues */ 535 + for (i = 0; i < 2; i++) { 536 + if (arg->count < arg->skip) { 537 + arg->count++; 538 + continue; 539 + } 540 + if (arg->fn(sch, i + 1, arg) < 0) { 541 + arg->stop = 1; 542 + break; 543 + } 544 + arg->count++; 545 + } 546 + } 547 + 548 + /* Minimal class support to handle tc filters */ 549 + static const struct Qdisc_class_ops dualpi2_class_ops = { 550 + .leaf = dualpi2_leaf, 551 + .find = dualpi2_find, 552 + .tcf_block = dualpi2_tcf_block, 553 + .bind_tcf = dualpi2_bind, 554 + .unbind_tcf = dualpi2_unbind, 555 + .walk = dualpi2_walk, 556 + }; 557 + 558 + static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { 559 + .id = "dualpi2", 560 + .cl_ops = &dualpi2_class_ops, 561 + .priv_size = sizeof(struct dualpi2_sched_data), 562 + .peek = qdisc_peek_dequeued, 563 + .init = dualpi2_init, 564 + .destroy = dualpi2_destroy, 565 + .reset = dualpi2_reset, 566 + .change = dualpi2_change, 567 + .owner = THIS_MODULE, 568 + }; 569 + 570 + static int __init dualpi2_module_init(void) 571 + { 572 + return register_qdisc(&dualpi2_qdisc_ops); 573 + } 574 + 575 + static void __exit dualpi2_module_exit(void) 576 + { 577 + unregister_qdisc(&dualpi2_qdisc_ops); 578 + } 579 + 580 + module_init(dualpi2_module_init); 581 + module_exit(dualpi2_module_exit); 582 + 583 + MODULE_DESCRIPTION("Dual Queue with Proportional Integral controller Improved with a Square (dualpi2) scheduler"); 584 + MODULE_AUTHOR("Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>"); 585 + MODULE_AUTHOR("Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>"); 586 + MODULE_AUTHOR("Olga Albisser <olga@albisser.org>"); 
587 + MODULE_AUTHOR("Henrik Steen <henrist@henrist.net>"); 588 + MODULE_AUTHOR("Olivier Tilmans <olivier.tilmans@nokia.com>"); 589 + 590 + MODULE_LICENSE("Dual BSD/GPL"); 591 + MODULE_VERSION("1.0");