diff options
| -rw-r--r-- | include/net/dropreason-core.h | 6 | ||||
| -rw-r--r-- | net/sched/Kconfig | 12 | ||||
| -rw-r--r-- | net/sched/Makefile | 1 | ||||
| -rw-r--r-- | net/sched/sch_dualpi2.c | 472 |
4 files changed, 487 insertions, 4 deletions
diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index e19184dd1b0f..d8ff24a33459 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -126,6 +126,7 @@ FN(CANFD_RX_INVALID_FRAME) \ FN(CANXL_RX_INVALID_FRAME) \ FN(PFMEMALLOC) \ + FN(DUALPI2_STEP_DROP) \ FNe(MAX) /** @@ -605,6 +606,11 @@ enum skb_drop_reason { */ SKB_DROP_REASON_PFMEMALLOC, /** + * @SKB_DROP_REASON_DUALPI2_STEP_DROP: dropped by the step drop + * threshold of DualPI2 qdisc. + */ + SKB_DROP_REASON_DUALPI2_STEP_DROP, + /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen */ diff --git a/net/sched/Kconfig b/net/sched/Kconfig index ad914d2b2e22..6ddff028b81a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -415,6 +415,18 @@ config NET_SCH_BPF If unsure, say N. +config NET_SCH_DUALPI2 + tristate "Dual Queue PI Square (DUALPI2) scheduler" + help + Say Y here if you want to use the Dual Queue Proportional Integral + Controller Improved with a Square scheduling algorithm. + For more information, please see https://tools.ietf.org/html/rfc9332 + + To compile this driver as a module, choose M here: the module + will be called sch_dualpi2. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 904d784902d1..5078ea84e6ad 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o +obj-$(CONFIG_NET_SCH_DUALPI2) += sch_dualpi2.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 0a96d57c40d1..845375ebd4ea 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -113,8 +113,44 @@ struct dualpi2_sched_data { u32 step_marks; /* ECN mark pkt counter due to step AQM */ u32 memory_used; /* Memory used of both queues */ u32 max_memory_used;/* Maximum used memory */ + + /* Deferred drop statistics */ + u32 deferred_drops_cnt; /* Packets dropped */ + u32 deferred_drops_len; /* Bytes dropped */ +}; + +struct dualpi2_skb_cb { + u64 ts; /* Timestamp at enqueue */ + u8 apply_step:1, /* Can we apply the step threshold */ + classified:2, /* Packet classification results */ + ect:2; /* Packet ECT codepoint */ +}; + +enum dualpi2_classification_results { + DUALPI2_C_CLASSIC = 0, /* C-queue */ + DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */ + DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */ + __DUALPI2_C_MAX /* Keep last*/ }; +static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb)); + return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference) +{ + return reference - dualpi2_skb_cb(skb)->ts; +} + +static u64 head_enqueue_time(struct Qdisc *q) +{ + struct sk_buff *skb = qdisc_peek_head(q); + + return skb ? dualpi2_skb_cb(skb)->ts : 0; +} + static u32 dualpi2_scale_alpha_beta(u32 param) { u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING); @@ -136,6 +172,30 @@ static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q) return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate); } +static bool skb_is_l4s(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S; +} + +static bool skb_in_l_queue(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC; +} + +static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q) +{ + return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step; +} + +static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb) +{ + if (INET_ECN_set_ce(skb)) { + q->ecn_mark++; + return true; + } + return false; +} + static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q) { q->c_protection_credit = q->c_protection_init; @@ -155,6 +215,408 @@ static void dualpi2_calculate_c_protection(struct Qdisc *sch, dualpi2_reset_c_protection(q); } +static bool dualpi2_roll(u32 prob) +{ + return get_random_u32() <= prob; +} + +/* Packets in the C-queue are subject to a marking probability pC, which is the + * square of the internal PI probability (i.e., have an overall lower mark/drop + * probability). If the qdisc is overloaded, ignore ECT values and only drop. + * + * Note that this marking scheme is also applied to L4S packets during overload. + * Return true if packet dropping is required in C queue + */ +static bool dualpi2_classic_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, u32 prob, + bool overload) +{ + if (dualpi2_roll(prob) && dualpi2_roll(prob)) { + if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; + dualpi2_mark(q, skb); + } + return false; +} + +/* Packets in the L-queue are subject to a marking probability pL given by the + * internal PI probability scaled by the coupling factor. + * + * On overload (i.e., @local_l_prob is >= 100%): + * - if the qdisc is configured to trade losses to preserve latency (i.e., + * @q->drop_overload), apply classic drops first before marking. + * - otherwise, preserve the "no loss" property of ECN at the cost of queueing + * delay, eventually resulting in taildrop behavior once sch->limit is + * reached. + * Return true if packet dropping is required in L queue + */ +static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, + u64 local_l_prob, u32 prob, + bool overload) +{ + if (overload) { + /* Apply classic drop */ + if (!q->drop_overload || + !(dualpi2_roll(prob) && dualpi2_roll(prob))) + goto mark; + return true; + } + + /* We can safely cut the upper 32b as overload==false */ + if (dualpi2_roll(local_l_prob)) { + /* Non-ECT packets could have classified as L4S by filters. */ + if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; +mark: + dualpi2_mark(q, skb); + } + return false; +} + +/* Decide whether a given packet must be dropped (or marked if ECT), according + * to the PI2 probability. + * + * Never mark/drop if we have a standing queue of less than 2 MTUs. + */ +static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + u64 local_l_prob; + bool overload; + u32 prob; + + if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch))) + return false; + + prob = READ_ONCE(q->pi2_prob); + local_l_prob = (u64)prob * q->coupling_factor; + overload = local_l_prob > MAX_PROB; + + switch (dualpi2_skb_cb(skb)->classified) { + case DUALPI2_C_CLASSIC: + return dualpi2_classic_marking(q, skb, prob, overload); + case DUALPI2_C_L4S: + return dualpi2_scalable_marking(q, skb, local_l_prob, prob, + overload); + default: /* DUALPI2_C_LLLL */ + return false; + } +} + +static void dualpi2_read_ect(struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + int wlen = skb_network_offset(skb); + + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK; + break; + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK; + break; + default: + goto not_ecn; + } + return; + +not_ecn: + /* Non pullable/writable packets can only be dropped hence are + * classified as not ECT. + */ + cb->ect = INET_ECN_NOT_ECT; +} + +static int dualpi2_skb_classify(struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + struct tcf_result res; + struct tcf_proto *fl; + int result; + + dualpi2_read_ect(skb); + if (cb->ect & q->ecn_mask) { + cb->classified = DUALPI2_C_L4S; + return NET_XMIT_SUCCESS; + } + + if (TC_H_MAJ(skb->priority) == q->sch->handle && + TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) { + cb->classified = TC_H_MIN(skb->priority); + return NET_XMIT_SUCCESS; + } + + fl = rcu_dereference_bh(q->tcf_filters); + if (!fl) { + cb->classified = DUALPI2_C_CLASSIC; + return NET_XMIT_SUCCESS; + } + + result = tcf_classify(skb, NULL, fl, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } +#endif + cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ? + TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC; + } + return NET_XMIT_SUCCESS; +} + +static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct dualpi2_skb_cb *cb; + + if (unlikely(qdisc_qlen(sch) >= sch->limit) || + unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) { + qdisc_qstats_overlimit(sch); + if (skb_in_l_queue(skb)) + qdisc_qstats_overlimit(q->l_queue); + return qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_OVERLIMIT); + } + + if (q->drop_early && must_drop(sch, q, skb)) { + qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_CONGESTED); + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } + + cb = dualpi2_skb_cb(skb); + cb->ts = ktime_get_ns(); + q->memory_used += skb->truesize; + if (q->memory_used > q->max_memory_used) + q->max_memory_used = q->memory_used; + + if (qdisc_qlen(sch) > q->maxq) + q->maxq = qdisc_qlen(sch); + + if (skb_in_l_queue(skb)) { + /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */ + dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q); + + /* Keep the overall qdisc stats consistent */ + ++sch->q.qlen; + qdisc_qstats_backlog_inc(sch, skb); + ++q->packets_in_l; + if (!q->l_head_ts) + q->l_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, q->l_queue); + } + ++q->packets_in_c; + if (!q->c_head_ts) + q->c_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, sch); +} + +/* By default, dualpi2 will split GSO skbs into independent skbs and enqueue + * each of those individually. This yields the following benefits, at the + * expense of CPU usage: + * - Finer-grained AQM actions as the sub-packets of a burst no longer share the + * same fate (e.g., the random mark/drop probability is applied individually) + * - Improved precision of the starvation protection/WRR scheduler at dequeue, + * as the size of the dequeued packets will be smaller. + */ +static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + int err; + + err = dualpi2_skb_classify(q, skb); + if (err != NET_XMIT_SUCCESS) { + if (err & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return err; + } + + if (q->split_gso && skb_is_gso(skb)) { + netdev_features_t features; + struct sk_buff *nskb, *next; + int cnt, byte_len, orig_len; + int err; + + features = netif_skb_features(skb); + nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(nskb)) + return qdisc_drop(skb, sch, to_free); + + cnt = 1; + byte_len = 0; + orig_len = qdisc_pkt_len(skb); + skb_list_walk_safe(nskb, nskb, next) { + skb_mark_not_on_list(nskb); + + /* Iterate through GSO fragments of an skb: + * (1) Set pkt_len from the single GSO fragments + * (2) Copy classified and ect values of an skb + * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb + */ + qdisc_skb_cb(nskb)->pkt_len = nskb->len; + dualpi2_skb_cb(nskb)->classified = + dualpi2_skb_cb(skb)->classified; + dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; + err = dualpi2_enqueue_skb(nskb, sch, to_free); + + if (err == NET_XMIT_SUCCESS) { + /* Compute the backlog adjustment that needs + * to be propagated in the qdisc tree to reflect + * all new skbs successfully enqueued. + */ + ++cnt; + byte_len += nskb->len; + } + } + if (cnt > 1) { + /* The caller will add the original skb stats to its + * backlog, compensate this if any nskb is enqueued. + */ + --cnt; + byte_len -= orig_len; + } + qdisc_tree_reduce_backlog(sch, -cnt, -byte_len); + consume_skb(skb); + return err; + } + return dualpi2_enqueue_skb(skb, sch, to_free); +} + +/* Select the queue from which the next packet can be dequeued, ensuring that + * neither queue can starve the other with a WRR scheduler. + * + * The sign of the WRR credit determines the next queue, while the size of + * the dequeued packet determines the magnitude of the WRR credit change. If + * either queue is empty, the WRR credit is kept unchanged. + * + * As the dequeued packet can be dropped later, the caller has to perform the + * qdisc_bstats_update() calls. + */ +static struct sk_buff *dequeue_packet(struct Qdisc *sch, + struct dualpi2_sched_data *q, + int *credit_change, + u64 now) +{ + struct sk_buff *skb = NULL; + int c_len; + + *credit_change = 0; + c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue); + if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) { + skb = __qdisc_dequeue_head(&q->l_queue->q); + WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue)); + if (c_len) + *credit_change = q->c_protection_wc; + qdisc_qstats_backlog_dec(q->l_queue, skb); + + /* Keep the global queue size consistent */ + --sch->q.qlen; + q->memory_used -= skb->truesize; + } else if (c_len) { + skb = __qdisc_dequeue_head(&sch->q); + WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch)); + if (qdisc_qlen(q->l_queue)) + *credit_change = ~((s32)q->c_protection_wl) + 1; + q->memory_used -= skb->truesize; + } else { + dualpi2_reset_c_protection(q); + return NULL; + } + *credit_change *= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); + return skb; +} + +static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb, + u64 now) +{ + u64 qdelay = 0; + + if (q->step_in_packets) + qdelay = qdisc_qlen(q->l_queue); + else + qdelay = dualpi2_sojourn_time(skb, now); + + if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) { + if (!dualpi2_skb_cb(skb)->ect) { + /* Drop this non-ECT packet */ + return 1; + } + + if (dualpi2_mark(q, skb)) + ++q->step_marks; + } + qdisc_bstats_update(q->l_queue, skb); + return 0; +} + +static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb, + struct Qdisc *sch, enum skb_drop_reason reason) +{ + ++q->deferred_drops_cnt; + q->deferred_drops_len += qdisc_pkt_len(skb); + kfree_skb_reason(skb, reason); + qdisc_qstats_drop(sch); +} + +static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + int credit_change; + u64 now; + + now = ktime_get_ns(); + + while ((skb = dequeue_packet(sch, q, &credit_change, now))) { + if (!q->drop_early && must_drop(sch, q, skb)) { + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_QDISC_CONGESTED); + continue; + } + + if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) { + qdisc_qstats_drop(q->l_queue); + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_DUALPI2_STEP_DROP); + continue; + } + + q->c_protection_credit += credit_change; + qdisc_bstats_update(sch, skb); + break; + } + + if (q->deferred_drops_cnt) { + qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt, + q->deferred_drops_len); + q->deferred_drops_cnt = 0; + q->deferred_drops_len = 0; + } + return skb; +} + static s64 __scale_delta(u64 diff) { do_div(diff, 1 << ALPHA_BETA_GRANULARITY); @@ -167,8 +629,8 @@ static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c, u64 now, qc, ql; now = ktime_get_ns(); - qc = q->c_head_ts; - ql = q->l_head_ts; + qc = READ_ONCE(q->c_head_ts); + ql = READ_ONCE(q->l_head_ts); *qdelay_c = qc ? now - qc : 0; *qdelay_l = ql ? now - ql : 0; @@ -254,7 +716,7 @@ static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer) root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); - q->pi2_prob = calculate_probability(sch); + WRITE_ONCE(q->pi2_prob, calculate_probability(sch)); hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q)); spin_unlock(root_lock); @@ -559,7 +1021,7 @@ static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct dualpi2_sched_data *q = qdisc_priv(sch); struct tc_dualpi2_xstats st = { - .prob = q->pi2_prob, + .prob = READ_ONCE(q->pi2_prob), .packets_in_c = q->packets_in_c, .packets_in_l = q->packets_in_l, .maxq = q->maxq, @@ -677,6 +1139,8 @@ static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { .id = "dualpi2", .cl_ops = &dualpi2_class_ops, .priv_size = sizeof(struct dualpi2_sched_data), + .enqueue = dualpi2_qdisc_enqueue, + .dequeue = dualpi2_qdisc_dequeue, .peek = qdisc_peek_dequeued, .init = dualpi2_init, .destroy = dualpi2_destroy, |
