On Sat, 26 Apr 2025 at 18:20, chia-yu.chang@nokia-bell-labs.com wrote:
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 9ea874395717..b4caeccbea72 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1210,4 +1210,28 @@ enum {
#define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
+/* DUALPI2 */
+enum {
TCA_DUALPI2_UNSPEC,
TCA_DUALPI2_LIMIT, /* Packets */
TCA_DUALPI2_MEMORY_LIMIT, /* Bytes */
TCA_DUALPI2_TARGET, /* us */
TCA_DUALPI2_TUPDATE, /* us */
TCA_DUALPI2_ALPHA, /* Hz scaled up by 256 */
TCA_DUALPI2_BETA, /* Hz scaled up by 256 */
TCA_DUALPI2_STEP_THRESH, /* Packets or us */
TCA_DUALPI2_STEP_PACKETS, /* Whether STEP_THRESH is in packets */
TCA_DUALPI2_MIN_QLEN_STEP, /* Minimum qlen to apply STEP_THRESH */
TCA_DUALPI2_COUPLING, /* Coupling factor between queues */
TCA_DUALPI2_DROP_OVERLOAD, /* Whether to drop on overload */
TCA_DUALPI2_DROP_EARLY, /* Whether to drop on enqueue */
TCA_DUALPI2_C_PROTECTION, /* Percentage */
TCA_DUALPI2_ECN_MASK, /* L4S queue classification mask */
TCA_DUALPI2_SPLIT_GSO, /* Split GSO packets at enqueue */
TCA_DUALPI2_PAD,
__TCA_DUALPI2_MAX
+};
+#define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1)
Need to define enums here as part of UAPI for each of the new enum types you include in the tc.yaml patch.
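To be concrete, I mean something along these lines (names and values are
illustrative only, they need to match whatever tc.yaml declares):

enum tc_dualpi2_drop_overload {
	TC_DUALPI2_DROP_OVERLOAD_OVERFLOW,	/* queue on overload */
	TC_DUALPI2_DROP_OVERLOAD_DROP,		/* drop on overload */
};

enum tc_dualpi2_drop_early {
	TC_DUALPI2_DROP_AT_DEQUEUE,		/* drop at dequeue */
	TC_DUALPI2_DROP_AT_ENQUEUE,		/* drop at enqueue */
};

plus equivalents for the other enum/flag attributes, so that userspace and
the kernel share a single definition of the wire values.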
#endif

diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
new file mode 100644
index 000000000000..bb3d9982b572
--- /dev/null
+++ b/net/sched/sch_dualpi2.c
@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+/* Copyright (C) 2024 Nokia
 * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
 * Author: Olga Albisser <olga@albisser.org>
 * Author: Henrik Steen <henrist@henrist.net>
 * Author: Olivier Tilmans <olivier.tilmans@nokia.com>
 * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
 *
 * DualPI Improved with a Square (dualpi2):
 * - Supports congestion controls that comply with the Prague requirements
 *   in RFC9331 (e.g. TCP-Prague)
 * - Supports coupled dual-queue with PI2 as defined in RFC9332
 * - Supports ECN L4S-identifier (IP.ECN==0b*1)
 *
 * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
 * they do not meet the 'Prague L4S Requirements' listed in RFC 9331
 * Section 4, so they can only be used with DualPI2 in a datacenter
 * context.
 *
 * References:
 * - De Schepper, Koen, et al. "PI2: A linearized AQM for both classic and
 *   scalable TCP." in proc. ACM CoNEXT'16, 2016.
 */
+#include <linux/errno.h>
+#include <linux/hrtimer.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include <net/gso.h>
+#include <net/inet_ecn.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+/* 32b enables supporting flows with windows up to ~8.6 * 1e9 packets,
 * i.e., twice the maximal snd_cwnd.
 * MAX_PROB must be consistent with the RNG in dualpi2_roll().
 */
+#define MAX_PROB U32_MAX
+/* alpha/beta values exchanged over netlink are in units of 256ns */
+#define ALPHA_BETA_SHIFT 8
+/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
 * computations. Consequently (see dualpi2_scale_alpha_beta()), their
 * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1
 * (~4MHz) as those are given in 1/256th. This enables tuning alpha/beta to
 * control flows whose maximal RTTs range from usecs up to a few secs.
 */
+#define ALPHA_BETA_MAX ((1U << 31) - 1)
+/* Internal alpha/beta are in units of 64ns.
 * This enables using all alpha/beta values in the allowed range without
 * loss of precision due to rounding when scaling them internally, e.g.,
 * scale_alpha_beta(1) will not round down to 0.
 */
+#define ALPHA_BETA_GRANULARITY 6
+#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
+/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
+#define MAX_WC 100
+struct dualpi2_sched_data {
struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */
struct Qdisc *sch; /* The Classic queue (C-queue) */
/* Registered tc filters */
struct tcf_proto __rcu *tcf_filters;
struct tcf_block *tcf_block;
/* PI2 parameters */
u64 pi2_target; /* Target delay in nanoseconds */
u32 pi2_tupdate; /* Timer frequency in nanoseconds */
u32 pi2_prob; /* Base PI probability */
u32 pi2_alpha; /* Gain factor for the integral rate response */
u32 pi2_beta; /* Gain factor for the proportional response */
struct hrtimer pi2_timer; /* prob update timer */
/* Step AQM (L-queue only) parameters */
u32 step_thresh; /* Step threshold */
bool step_in_packets; /* Whether the step is in packets or time */
/* C-queue starvation protection */
s32 c_protection_credit; /* Credit (sign indicates which queue) */
s32 c_protection_init; /* Reset value of the credit */
u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */
u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */
/* General dualQ parameters */
u32 memory_limit; /* Memory limit of both queues */
u8 coupling_factor;/* Coupling factor (k) between both queues */
u8 ecn_mask; /* Mask to match packets into L-queue */
u32 min_qlen_step; /* Minimum queue length to apply step thresh */
bool drop_early; /* Drop at enqueue instead of dequeue if true */
bool drop_overload; /* Drop (1) on overload, or overflow (0) */
bool split_gso; /* Split aggregated skb (1) or leave as is */
/* Statistics */
u64 c_head_ts; /* Enqueue timestamp of the C-queue head */
u64 l_head_ts; /* Enqueue timestamp of the L-queue head */
u64 last_qdelay; /* Q delay val at the last probability update */
u32 packets_in_c; /* Enqueue packet counter of the C-queue */
u32 packets_in_l; /* Enqueue packet counter of the L-queue */
u32 maxq; /* Maximum queue size of the C-queue */
u32 ecn_mark; /* ECN mark pkt counter due to PI probability */
u32 step_marks; /* ECN mark pkt counter due to step AQM */
u32 memory_used; /* Memory used of both queues */
u32 max_memory_used;/* Maximum used memory */
+};
+static u32 dualpi2_scale_alpha_beta(u32 param)
+{
u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
do_div(tmp, NSEC_PER_SEC);
return tmp;
+}
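As an aside for other reviewers: the precision claim in the comment above is
easy to check outside the kernel. This is my own standalone userspace sketch
with the constants copied from this patch, not code from the series:

#include <stdint.h>
#include <stdio.h>

#define MAX_PROB		UINT32_MAX
#define ALPHA_BETA_SHIFT	8
#define ALPHA_BETA_GRANULARITY	6
#define ALPHA_BETA_SCALING	(ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
#define ALPHA_BETA_MAX		((1U << 31) - 1)
#define NSEC_PER_SEC		1000000000ULL

static uint32_t scale_alpha_beta(uint32_t param)
{
	uint64_t tmp = ((uint64_t)param * MAX_PROB) >> ALPHA_BETA_SCALING;

	return tmp / NSEC_PER_SEC;
}

int main(void)
{
	/* Smallest netlink value survives the scaling: prints 1, not 0 */
	printf("scale(1)   = %u\n", scale_alpha_beta(1));
	/* Largest allowed value still fits in 32b: prints 2305843007 */
	printf("scale(max) = %u\n", scale_alpha_beta(ALPHA_BETA_MAX));
	return 0;
}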
+static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
+{
return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
+}
+static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
+{
q->c_protection_credit = q->c_protection_init;
+}
+/* This computes the initial credit value and WRR weight for the L queue (wl)
 * from the weight of the C queue (wc).
 * If wl > wc, the scheduler will start with the L queue when reset.
 */
+static void dualpi2_calculate_c_protection(struct Qdisc *sch,
struct dualpi2_sched_data *q, u32 wc)
+{
q->c_protection_wc = wc;
q->c_protection_wl = MAX_WC - wc;
q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
((int)q->c_protection_wc - (int)q->c_protection_wl);
dualpi2_reset_c_protection(q);
+}
+static s64 __scale_delta(u64 diff)
+{
do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
return diff;
+}
+static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
u64 *qdelay_l)
+{
u64 now, qc, ql;
now = ktime_get_ns();
qc = READ_ONCE(q->c_head_ts);
ql = READ_ONCE(q->l_head_ts);
*qdelay_c = qc ? now - qc : 0;
*qdelay_l = ql ? now - ql : 0;
+}
+static u32 calculate_probability(struct Qdisc *sch)
+{
struct dualpi2_sched_data *q = qdisc_priv(sch);
u32 new_prob;
u64 qdelay_c;
u64 qdelay_l;
u64 qdelay;
s64 delta;
get_queue_delays(q, &qdelay_c, &qdelay_l);
qdelay = max(qdelay_l, qdelay_c);
/* Alpha and beta take at most 32b, i.e., the delay difference would
* overflow for queuing delay differences > ~4.2sec.
*/
delta = ((s64)qdelay - q->pi2_target) * q->pi2_alpha;
delta += ((s64)qdelay - q->last_qdelay) * q->pi2_beta;
if (delta > 0) {
new_prob = __scale_delta(delta) + q->pi2_prob;
if (new_prob < q->pi2_prob)
new_prob = MAX_PROB;
} else {
new_prob = q->pi2_prob - __scale_delta(~delta + 1);
if (new_prob > q->pi2_prob)
new_prob = 0;
}
q->last_qdelay = qdelay;
/* If we do not drop on overload, ensure we cap the L4S probability to
* 100% to keep window fairness when overflowing.
*/
if (!q->drop_overload)
return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
return new_prob;
+}
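Not a defect, just noting the units for anyone else working through the
fixed-point chain: combining dualpi2_scale_alpha_beta() with
__scale_delta(), the per-update probability change reduces to

	(qdelay - target)[s] * (alpha / 256)[Hz] * MAX_PROB
	+ (qdelay - last_qdelay)[s] * (beta / 256)[Hz] * MAX_PROB

so e.g. alpha = 256 (i.e. 1Hz) with a queuing delay 5ms above target raises
the marking probability by 0.5% of MAX_PROB per tupdate, which matches the
"Hz scaled up by 256" units documented in the UAPI header.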
+static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
+{
/* Apply rule of thumb, i.e., doubling the packet length,
* to further include per packet overhead in memory_limit.
*/
u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));
if (upper_32_bits(memlim))
return 0xffffffff;
else
return lower_32_bits(memlim);
+}
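For a feel of this rule of thumb (my numbers, assuming a psched_mtu() of
1514 bytes): limit = 10000 packets gives a memory_limit of
10000 * 2 * 1514 = ~30MB, and the saturation branch only triggers once
limit * 2 * MTU exceeds 4GB, i.e. around 1.4M packets at that MTU.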
+static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
+{
struct dualpi2_sched_data *q = from_timer(q, timer, pi2_timer);
WRITE_ONCE(q->pi2_prob, calculate_probability(q->sch));
hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
return HRTIMER_RESTART;
+}
+static struct netlink_range_validation dualpi2_alpha_beta_range = {
.min = 1,
.max = ALPHA_BETA_MAX,
+};
+static struct netlink_range_validation dualpi2_wc_range = {
.min = 0,
.max = MAX_WC,
+};
+static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
[TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
[TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
[TCA_DUALPI2_TARGET] = {.type = NLA_U32},
[TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1),
[TCA_DUALPI2_ALPHA] =
NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
[TCA_DUALPI2_BETA] =
NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
[TCA_DUALPI2_STEP_THRESH] = {.type = NLA_U32},
[TCA_DUALPI2_STEP_PACKETS] = {.type = NLA_FLAG},
[TCA_DUALPI2_MIN_QLEN_STEP] = {.type = NLA_U32},
[TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1),
[TCA_DUALPI2_DROP_OVERLOAD] = {.type = NLA_U8},
[TCA_DUALPI2_DROP_EARLY] = {.type = NLA_U8},
[TCA_DUALPI2_C_PROTECTION] =
NLA_POLICY_FULL_RANGE(NLA_U8, &dualpi2_wc_range),
[TCA_DUALPI2_ECN_MASK] = {.type = NLA_U8},
[TCA_DUALPI2_SPLIT_GSO] = {.type = NLA_U8},
+};
+static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
+{
struct nlattr *tb[TCA_DUALPI2_MAX + 1];
struct dualpi2_sched_data *q;
int old_backlog;
int old_qlen;
int err;
if (!opt)
return -EINVAL;
err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
extack);
if (err < 0)
return err;
q = qdisc_priv(sch);
sch_tree_lock(sch);
if (tb[TCA_DUALPI2_LIMIT]) {
u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
WRITE_ONCE(sch->limit, limit);
WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
}
if (tb[TCA_DUALPI2_MEMORY_LIMIT])
WRITE_ONCE(q->memory_limit,
nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
if (tb[TCA_DUALPI2_TARGET]) {
u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
}
if (tb[TCA_DUALPI2_TUPDATE]) {
u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
WRITE_ONCE(q->pi2_tupdate, tupdate * NSEC_PER_USEC);
}
if (tb[TCA_DUALPI2_ALPHA]) {
u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
}
if (tb[TCA_DUALPI2_BETA]) {
u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
}
if (tb[TCA_DUALPI2_STEP_THRESH]) {
u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH]);
bool step_pkt = nla_get_flag(tb[TCA_DUALPI2_STEP_PACKETS]);
WRITE_ONCE(q->step_in_packets, step_pkt);
WRITE_ONCE(q->step_thresh,
step_pkt ? step_th : (step_th * NSEC_PER_USEC));
}
if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
WRITE_ONCE(q->min_qlen_step,
nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
if (tb[TCA_DUALPI2_COUPLING]) {
u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
WRITE_ONCE(q->coupling_factor, coupling);
}
if (tb[TCA_DUALPI2_DROP_OVERLOAD])
WRITE_ONCE(q->drop_overload,
!!nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]));
You declared an enum for this in tc.yaml so this should be implemented using an enum that you define as part of UAPI in pkt_sched.h
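With the UAPI enum in place, the policy table can then also enforce the
valid range instead of collapsing arbitrary values with !!, e.g. (enum name
as sketched above, illustrative only):

	[TCA_DUALPI2_DROP_OVERLOAD] =
		NLA_POLICY_MAX(NLA_U8, TC_DUALPI2_DROP_OVERLOAD_DROP),

and the same pattern for TCA_DUALPI2_DROP_EARLY below.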
if (tb[TCA_DUALPI2_DROP_EARLY])
WRITE_ONCE(q->drop_early,
!!nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]));
You declared an enum for this in tc.yaml so this should be implemented using an enum that you define as part of UAPI in pkt_sched.h
if (tb[TCA_DUALPI2_C_PROTECTION]) {
u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
dualpi2_calculate_c_protection(sch, q, wc);
}
if (tb[TCA_DUALPI2_ECN_MASK])
WRITE_ONCE(q->ecn_mask,
nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]));
You declared flags for this in tc.yaml, so this should be implemented using flags that you define as part of UAPI in pkt_sched.h, and you should check the provided value for validity before accepting it.
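For example (values following the 2-bit ECN field, names illustrative
only):

#define TC_DUALPI2_ECN_MASK_L4S_ECT	0x1	/* match ECT(1) */
#define TC_DUALPI2_ECN_MASK_CLA_ECT	0x2	/* match ECT(0) */
#define TC_DUALPI2_ECN_MASK_ANY_ECT	0x3	/* match ECT(0)/ECT(1)/CE */

together with NLA_POLICY_MASK(NLA_U8, 0x3) in the policy table, or an
explicit check with NL_SET_ERR_MSG_ATTR() if only a subset of the mask
values makes sense.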
if (tb[TCA_DUALPI2_SPLIT_GSO])
WRITE_ONCE(q->split_gso,
!!nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]));
This looks like another flag or enum but it is declared as a u8 in tc.yaml. Please fix the discrepancy.
old_qlen = qdisc_qlen(sch);
old_backlog = sch->qstats.backlog;
while (qdisc_qlen(sch) > sch->limit ||
q->memory_used > q->memory_limit) {
struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
q->memory_used -= skb->truesize;
qdisc_qstats_backlog_dec(sch, skb);
rtnl_qdisc_drop(skb, sch);
}
qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
old_backlog - sch->qstats.backlog);
sch_tree_unlock(sch);
return 0;
+}