On Sat, 26 Apr 2025 at 18:20, chia-yu.chang@nokia-bell-labs.com wrote:
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 9ea874395717..b4caeccbea72 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1210,4 +1210,28 @@ enum {
#define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
+/* DUALPI2 */
+enum {
TCA_DUALPI2_UNSPEC,
TCA_DUALPI2_LIMIT, /* Packets */
TCA_DUALPI2_MEMORY_LIMIT, /* Bytes */
TCA_DUALPI2_TARGET, /* us */
TCA_DUALPI2_TUPDATE, /* us */
TCA_DUALPI2_ALPHA, /* Hz scaled up by 256 */
TCA_DUALPI2_BETA, /* Hz scaled up by 256 */
TCA_DUALPI2_STEP_THRESH, /* Packets or us */
TCA_DUALPI2_STEP_PACKETS, /* Whether STEP_THRESH is in packets */
TCA_DUALPI2_MIN_QLEN_STEP, /* Minimum qlen to apply STEP_THRESH */
TCA_DUALPI2_COUPLING, /* Coupling factor between queues */
TCA_DUALPI2_DROP_OVERLOAD, /* Whether to drop on overload */
TCA_DUALPI2_DROP_EARLY, /* Whether to drop on enqueue */
TCA_DUALPI2_C_PROTECTION, /* Percentage */
TCA_DUALPI2_ECN_MASK, /* L4S queue classification mask */
TCA_DUALPI2_SPLIT_GSO, /* Split GSO packets at enqueue */
TCA_DUALPI2_PAD,
__TCA_DUALPI2_MAX
+};
+#define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1)
Need to define enums here as part of UAPI for each of the new enum types you include in the tc.yaml patch.
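To be concrete, I mean something along these lines (names and values are
illustrative only, they need to match whatever tc.yaml declares):

enum tc_dualpi2_drop_overload {
	TC_DUALPI2_DROP_OVERLOAD_OVERFLOW,	/* queue on overload */
	TC_DUALPI2_DROP_OVERLOAD_DROP,		/* drop on overload */
};

enum tc_dualpi2_drop_early {
	TC_DUALPI2_DROP_AT_DEQUEUE,		/* drop at dequeue */
	TC_DUALPI2_DROP_AT_ENQUEUE,		/* drop at enqueue */
};

plus equivalents for the other enum/flag attributes, so that userspace and
the kernel share a single definition of the wire values.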
#endif

diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c
new file mode 100644
index 000000000000..bb3d9982b572
--- /dev/null
+++ b/net/sched/sch_dualpi2.c
@@ -0,0 +1,554 @@
+// SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+/* Copyright (C) 2024 Nokia
 * Author: Koen De Schepper <koen.de_schepper@nokia-bell-labs.com>
 * Author: Olga Albisser <olga@albisser.org>
 * Author: Henrik Steen <henrist@henrist.net>
 * Author: Olivier Tilmans <olivier.tilmans@nokia.com>
 * Author: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
 *
 * DualPI Improved with a Square (dualpi2):
 * - Supports congestion controls that comply with the Prague requirements
 *   in RFC9331 (e.g. TCP-Prague)
 * - Supports coupled dual-queue with PI2 as defined in RFC9332
 * - Supports ECN L4S-identifier (IP.ECN==0b*1)
 *
 * note: Although DCTCP and BBRv3 can use shallow-threshold ECN marks,
 * they do not meet the 'Prague L4S Requirements' listed in RFC 9331
 * Section 4, so they can only be used with DualPI2 in a datacenter
 * context.
 *
 * References:
 * - De Schepper, Koen, et al. "PI2: A linearized AQM for both classic and
 *   scalable TCP." in proc. ACM CoNEXT'16, 2016.
 */
+#include <linux/errno.h>
+#include <linux/hrtimer.h>
+#include <linux/if_vlan.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include <net/gso.h>
+#include <net/inet_ecn.h>
+#include <net/pkt_cls.h>
+#include <net/pkt_sched.h>
+/* 32b enables supporting flows with windows up to ~8.6 * 1e9 packets,
 * i.e., twice the maximal snd_cwnd.
 * MAX_PROB must be consistent with the RNG in dualpi2_roll().
 */
+#define MAX_PROB U32_MAX
+/* alpha/beta values exchanged over netlink are in units of 256ns */
+#define ALPHA_BETA_SHIFT 8
+/* Scaled values of alpha/beta must fit in 32b to avoid overflow in later
 * computations. Consequently (see dualpi2_scale_alpha_beta()), their
 * netlink-provided values can use at most 31b, i.e. be at most (2^23)-1
 * (~4MHz) as those are given in 1/256th. This enables tuning alpha/beta to
 * control flows whose maximal RTTs range from usecs up to a few secs.
 */
+#define ALPHA_BETA_MAX ((1U << 31) - 1)
+/* Internal alpha/beta are in units of 64ns.
 * This enables using all alpha/beta values in the allowed range without
 * loss of precision due to rounding when scaling them internally, e.g.,
 * scale_alpha_beta(1) will not round down to 0.
 */
+#define ALPHA_BETA_GRANULARITY 6
+#define ALPHA_BETA_SCALING (ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
+/* We express the weights (wc, wl) in %, i.e., wc + wl = 100 */
+#define MAX_WC 100
+struct dualpi2_sched_data {
struct Qdisc *l_queue; /* The L4S Low latency queue (L-queue) */
struct Qdisc *sch; /* The Classic queue (C-queue) */
/* Registered tc filters */
struct tcf_proto __rcu *tcf_filters;
struct tcf_block *tcf_block;
/* PI2 parameters */
u64 pi2_target; /* Target delay in nanoseconds */
u32 pi2_tupdate; /* Timer frequency in nanoseconds */
u32 pi2_prob; /* Base PI probability */
u32 pi2_alpha; /* Gain factor for the integral rate response */
u32 pi2_beta; /* Gain factor for the proportional response */
struct hrtimer pi2_timer; /* prob update timer */
/* Step AQM (L-queue only) parameters */
u32 step_thresh; /* Step threshold */
bool step_in_packets; /* Whether the step is in packets or time */
/* C-queue starvation protection */
s32 c_protection_credit; /* Credit (sign indicates which queue) */
s32 c_protection_init; /* Reset value of the credit */
u8 c_protection_wc; /* C-queue weight (between 0 and MAX_WC) */
u8 c_protection_wl; /* L-queue weight (MAX_WC - wc) */
/* General dualQ parameters */
u32 memory_limit; /* Memory limit of both queues */
u8 coupling_factor;/* Coupling factor (k) between both queues */
u8 ecn_mask; /* Mask to match packets into L-queue */
u32 min_qlen_step; /* Minimum queue length to apply step thresh */
bool drop_early; /* Drop at enqueue instead of dequeue if true */
bool drop_overload; /* Drop (1) on overload, or overflow (0) */
bool split_gso; /* Split aggregated skb (1) or leave as is */
/* Statistics */
u64 c_head_ts; /* Enqueue timestamp of the C-queue head */
u64 l_head_ts; /* Enqueue timestamp of the L-queue head */
u64 last_qdelay; /* Q delay val at the last probability update */
u32 packets_in_c; /* Enqueue packet counter of the C-queue */
u32 packets_in_l; /* Enqueue packet counter of the L-queue */
u32 maxq; /* Maximum queue size of the C-queue */
u32 ecn_mark; /* ECN mark pkt counter due to PI probability */
u32 step_marks; /* ECN mark pkt counter due to step AQM */
u32 memory_used; /* Memory used of both queues */
u32 max_memory_used;/* Maximum used memory */
+};
+static u32 dualpi2_scale_alpha_beta(u32 param)
+{
u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING);
do_div(tmp, NSEC_PER_SEC);
return tmp;
+}
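As an aside for other reviewers: the precision claim in the comment above is
easy to check outside the kernel. This is my own standalone userspace sketch
with the constants copied from this patch, not code from the series:

#include <stdint.h>
#include <stdio.h>

#define MAX_PROB		UINT32_MAX
#define ALPHA_BETA_SHIFT	8
#define ALPHA_BETA_GRANULARITY	6
#define ALPHA_BETA_SCALING	(ALPHA_BETA_SHIFT - ALPHA_BETA_GRANULARITY)
#define ALPHA_BETA_MAX		((1U << 31) - 1)
#define NSEC_PER_SEC		1000000000ULL

static uint32_t scale_alpha_beta(uint32_t param)
{
	uint64_t tmp = ((uint64_t)param * MAX_PROB) >> ALPHA_BETA_SCALING;

	return tmp / NSEC_PER_SEC;
}

int main(void)
{
	/* Smallest netlink value survives the scaling: prints 1, not 0 */
	printf("scale(1)   = %u\n", scale_alpha_beta(1));
	/* Largest allowed value still fits in 32b: prints 2305843007 */
	printf("scale(max) = %u\n", scale_alpha_beta(ALPHA_BETA_MAX));
	return 0;
}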
+static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q)
+{
return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate);
+}
+static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q)
+{
q->c_protection_credit = q->c_protection_init;
+}
+/* This computes the initial credit value and WRR weight for the L queue (wl)
 * from the weight of the C queue (wc).
 * If wl > wc, the scheduler will start with the L queue when reset.
 */
+static void dualpi2_calculate_c_protection(struct Qdisc *sch,
struct dualpi2_sched_data *q, u32 wc)
+{
q->c_protection_wc = wc;
q->c_protection_wl = MAX_WC - wc;
q->c_protection_init = (s32)psched_mtu(qdisc_dev(sch)) *
((int)q->c_protection_wc - (int)q->c_protection_wl);
dualpi2_reset_c_protection(q);
+}
+static s64 __scale_delta(u64 diff)
+{
do_div(diff, 1 << ALPHA_BETA_GRANULARITY);
return diff;
+}
+static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c,
u64 *qdelay_l)
+{
u64 now, qc, ql;
now = ktime_get_ns();
qc = READ_ONCE(q->c_head_ts);
ql = READ_ONCE(q->l_head_ts);
*qdelay_c = qc ? now - qc : 0;
*qdelay_l = ql ? now - ql : 0;
+}
+static u32 calculate_probability(struct Qdisc *sch)
+{
struct dualpi2_sched_data *q = qdisc_priv(sch);
u32 new_prob;
u64 qdelay_c;
u64 qdelay_l;
u64 qdelay;
s64 delta;
get_queue_delays(q, &qdelay_c, &qdelay_l);
qdelay = max(qdelay_l, qdelay_c);
/* Alpha and beta take at most 32b, i.e., the delay difference would
* overflow for queuing delay differences > ~4.2sec.
*/
delta = ((s64)qdelay - q->pi2_target) * q->pi2_alpha;
delta += ((s64)qdelay - q->last_qdelay) * q->pi2_beta;
if (delta > 0) {
new_prob = __scale_delta(delta) + q->pi2_prob;
if (new_prob < q->pi2_prob)
new_prob = MAX_PROB;
} else {
new_prob = q->pi2_prob - __scale_delta(~delta + 1);
if (new_prob > q->pi2_prob)
new_prob = 0;
}
q->last_qdelay = qdelay;
/* If we do not drop on overload, ensure we cap the L4S probability to
* 100% to keep window fairness when overflowing.
*/
if (!q->drop_overload)
return min_t(u32, new_prob, MAX_PROB / q->coupling_factor);
return new_prob;
+}
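Not a defect, just noting the units for anyone else working through the
fixed-point chain: combining dualpi2_scale_alpha_beta() with
__scale_delta(), the per-update probability change reduces to

	(qdelay - target)[s] * (alpha / 256)[Hz] * MAX_PROB
	+ (qdelay - last_qdelay)[s] * (beta / 256)[Hz] * MAX_PROB

so e.g. alpha = 256 (i.e. 1Hz) with a queuing delay 5ms above target raises
the marking probability by 0.5% of MAX_PROB per tupdate, which matches the
"Hz scaled up by 256" units documented in the UAPI header.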
+static u32 get_memory_limit(struct Qdisc *sch, u32 limit)
+{
/* Apply rule of thumb, i.e., doubling the packet length,
* to further include per packet overhead in memory_limit.
*/
u64 memlim = mul_u32_u32(limit, 2 * psched_mtu(qdisc_dev(sch)));
if (upper_32_bits(memlim))
return 0xffffffff;
else
return lower_32_bits(memlim);
+}
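For a feel of this rule of thumb (my numbers, assuming a psched_mtu() of
1514 bytes): limit = 10000 packets gives a memory_limit of
10000 * 2 * 1514 = ~30MB, and the saturation branch only triggers once
limit * 2 * MTU exceeds 4GB, i.e. around 1.4M packets at that MTU.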
+static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer)
+{
struct dualpi2_sched_data *q = from_timer(q, timer, pi2_timer);
WRITE_ONCE(q->pi2_prob, calculate_probability(q->sch));
hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q));
return HRTIMER_RESTART;
+}
+static struct netlink_range_validation dualpi2_alpha_beta_range = {
.min = 1,
.max = ALPHA_BETA_MAX,
+};
+static struct netlink_range_validation dualpi2_wc_range = {
.min = 0,
.max = MAX_WC,
+};
+static const struct nla_policy dualpi2_policy[TCA_DUALPI2_MAX + 1] = {
[TCA_DUALPI2_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
[TCA_DUALPI2_MEMORY_LIMIT] = NLA_POLICY_MIN(NLA_U32, 1),
[TCA_DUALPI2_TARGET] = {.type = NLA_U32},
[TCA_DUALPI2_TUPDATE] = NLA_POLICY_MIN(NLA_U32, 1),
[TCA_DUALPI2_ALPHA] =
NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
[TCA_DUALPI2_BETA] =
NLA_POLICY_FULL_RANGE(NLA_U32, &dualpi2_alpha_beta_range),
[TCA_DUALPI2_STEP_THRESH] = {.type = NLA_U32},
[TCA_DUALPI2_STEP_PACKETS] = {.type = NLA_FLAG},
[TCA_DUALPI2_MIN_QLEN_STEP] = {.type = NLA_U32},
[TCA_DUALPI2_COUPLING] = NLA_POLICY_MIN(NLA_U8, 1),
[TCA_DUALPI2_DROP_OVERLOAD] = {.type = NLA_U8},
[TCA_DUALPI2_DROP_EARLY] = {.type = NLA_U8},
[TCA_DUALPI2_C_PROTECTION] =
NLA_POLICY_FULL_RANGE(NLA_U8, &dualpi2_wc_range),
[TCA_DUALPI2_ECN_MASK] = {.type = NLA_U8},
[TCA_DUALPI2_SPLIT_GSO] = {.type = NLA_U8},
+};
+static int dualpi2_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
+{
struct nlattr *tb[TCA_DUALPI2_MAX + 1];
struct dualpi2_sched_data *q;
int old_backlog;
int old_qlen;
int err;
if (!opt)
return -EINVAL;
err = nla_parse_nested(tb, TCA_DUALPI2_MAX, opt, dualpi2_policy,
extack);
if (err < 0)
return err;
q = qdisc_priv(sch);
sch_tree_lock(sch);
if (tb[TCA_DUALPI2_LIMIT]) {
u32 limit = nla_get_u32(tb[TCA_DUALPI2_LIMIT]);
WRITE_ONCE(sch->limit, limit);
WRITE_ONCE(q->memory_limit, get_memory_limit(sch, limit));
}
if (tb[TCA_DUALPI2_MEMORY_LIMIT])
WRITE_ONCE(q->memory_limit,
nla_get_u32(tb[TCA_DUALPI2_MEMORY_LIMIT]));
if (tb[TCA_DUALPI2_TARGET]) {
u64 target = nla_get_u32(tb[TCA_DUALPI2_TARGET]);
WRITE_ONCE(q->pi2_target, target * NSEC_PER_USEC);
}
if (tb[TCA_DUALPI2_TUPDATE]) {
u64 tupdate = nla_get_u32(tb[TCA_DUALPI2_TUPDATE]);
WRITE_ONCE(q->pi2_tupdate, tupdate * NSEC_PER_USEC);
}
if (tb[TCA_DUALPI2_ALPHA]) {
u32 alpha = nla_get_u32(tb[TCA_DUALPI2_ALPHA]);
WRITE_ONCE(q->pi2_alpha, dualpi2_scale_alpha_beta(alpha));
}
if (tb[TCA_DUALPI2_BETA]) {
u32 beta = nla_get_u32(tb[TCA_DUALPI2_BETA]);
WRITE_ONCE(q->pi2_beta, dualpi2_scale_alpha_beta(beta));
}
if (tb[TCA_DUALPI2_STEP_THRESH]) {
u32 step_th = nla_get_u32(tb[TCA_DUALPI2_STEP_THRESH]);
bool step_pkt = nla_get_flag(tb[TCA_DUALPI2_STEP_PACKETS]);
WRITE_ONCE(q->step_in_packets, step_pkt);
WRITE_ONCE(q->step_thresh,
step_pkt ? step_th : (step_th * NSEC_PER_USEC));
}
if (tb[TCA_DUALPI2_MIN_QLEN_STEP])
WRITE_ONCE(q->min_qlen_step,
nla_get_u32(tb[TCA_DUALPI2_MIN_QLEN_STEP]));
if (tb[TCA_DUALPI2_COUPLING]) {
u8 coupling = nla_get_u8(tb[TCA_DUALPI2_COUPLING]);
WRITE_ONCE(q->coupling_factor, coupling);
}
if (tb[TCA_DUALPI2_DROP_OVERLOAD])
WRITE_ONCE(q->drop_overload,
!!nla_get_u8(tb[TCA_DUALPI2_DROP_OVERLOAD]));
You declared an enum for this in tc.yaml so this should be implemented using an enum that you define as part of UAPI in pkt_sched.h
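With the UAPI enum in place, the policy table can then also enforce the
valid range instead of collapsing arbitrary values with !!, e.g. (enum name
as sketched above, illustrative only):

	[TCA_DUALPI2_DROP_OVERLOAD] =
		NLA_POLICY_MAX(NLA_U8, TC_DUALPI2_DROP_OVERLOAD_DROP),

and the same pattern for TCA_DUALPI2_DROP_EARLY below.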
if (tb[TCA_DUALPI2_DROP_EARLY])
WRITE_ONCE(q->drop_early,
!!nla_get_u8(tb[TCA_DUALPI2_DROP_EARLY]));
You declared an enum for this in tc.yaml so this should be implemented using an enum that you define as part of UAPI in pkt_sched.h
if (tb[TCA_DUALPI2_C_PROTECTION]) {
u8 wc = nla_get_u8(tb[TCA_DUALPI2_C_PROTECTION]);
dualpi2_calculate_c_protection(sch, q, wc);
}
if (tb[TCA_DUALPI2_ECN_MASK])
WRITE_ONCE(q->ecn_mask,
nla_get_u8(tb[TCA_DUALPI2_ECN_MASK]));
You declared flags for this in tc.yaml, so this should be implemented using flags that you define as part of UAPI in pkt_sched.h, and you should check the provided value for validity before accepting it.
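For example (values following the 2-bit ECN field, names illustrative
only):

#define TC_DUALPI2_ECN_MASK_L4S_ECT	0x1	/* match ECT(1) */
#define TC_DUALPI2_ECN_MASK_CLA_ECT	0x2	/* match ECT(0) */
#define TC_DUALPI2_ECN_MASK_ANY_ECT	0x3	/* match ECT(0)/ECT(1)/CE */

together with NLA_POLICY_MASK(NLA_U8, 0x3) in the policy table, or an
explicit check with NL_SET_ERR_MSG_ATTR() if only a subset of the mask
values makes sense.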
if (tb[TCA_DUALPI2_SPLIT_GSO])
WRITE_ONCE(q->split_gso,
!!nla_get_u8(tb[TCA_DUALPI2_SPLIT_GSO]));
This looks like another flag or enum but it is declared as a u8 in tc.yaml. Please fix the discrepancy.
old_qlen = qdisc_qlen(sch);
old_backlog = sch->qstats.backlog;
while (qdisc_qlen(sch) > sch->limit ||
q->memory_used > q->memory_limit) {
struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
q->memory_used -= skb->truesize;
qdisc_qstats_backlog_dec(sch, skb);
rtnl_qdisc_drop(skb, sch);
}
qdisc_tree_reduce_backlog(sch, old_qlen - qdisc_qlen(sch),
old_backlog - sch->qstats.backlog);
sch_tree_unlock(sch);
return 0;
+}