author		Jakub Kicinski <kuba@kernel.org>	2025-10-16 16:25:16 -0700
committer	Jakub Kicinski <kuba@kernel.org>	2025-10-16 16:25:16 -0700
commit		2df75cc5bdc48f8a6f393eaa9d18480aeddac7f2 (patch)
tree		474b900af346b7131228a19382b88a04694b0fd9
parent		01b6aca22bb9f8fbbebbf8bdbb80aadf11318e3d (diff)
parent		100dfa74cad9d4665cdcf0cc8e673b123a3ea910 (diff)
Merge branch 'net-optimize-tx-throughput-and-efficiency'
Eric Dumazet says:

====================
net: optimize TX throughput and efficiency

In this series, I replace the busylock spinlock we have in
__dev_queue_xmit() with a lockless list (llist) to reduce spinlock
contention to the minimum.

The idea is that only one cpu might spin on the qdisc spinlock, while
the others simply add their skb to the llist.

After this series, we get a 300% (4x) improvement on heavy TX
workloads, sending twice the number of packets per second for half the
cpu cycles.
====================

Link: https://patch.msgid.link/20251014171907.3554413-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--	include/linux/netdevice_xmit.h	9
-rw-r--r--	include/net/sch_generic.h	23
-rw-r--r--	net/core/dev.c	97
-rw-r--r--	net/core/skbuff.c	11
-rw-r--r--	net/sched/act_mirred.c	62
-rw-r--r--	net/sched/sch_generic.c	7
-rw-r--r--	tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt	6
7 files changed, 111 insertions, 104 deletions
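
The series description above boils down to a classic lock-free hand-off: senders push their skb onto an intrusive lockless list, and only the sender that found the list empty takes the qdisc root lock and drains everything. The following is a minimal userspace sketch of that pattern using C11 atomics and a pthread mutex; struct node, struct queue, defer_push() and drain() are illustrative stand-ins for sk_buff, Qdisc, llist_add(), llist_del_all() and llist_reverse_order(), not the kernel code in the diff below.

/* Hedged userspace analogue of the defer_list pattern in this series.
 * Producers push nodes with a compare-and-swap loop; the producer that
 * saw an empty list becomes the flusher, grabs the lock, detaches the
 * whole list in one exchange and transmits it in FIFO order.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
	struct node *next;
	int payload;
};

struct queue {
	_Atomic(struct node *) defer_list;	/* llist_head analogue */
	pthread_mutex_t lock;			/* qdisc root lock analogue,
						 * PTHREAD_MUTEX_INITIALIZER */
};

/* Returns true if the caller saw an empty list and now owns the flush. */
static bool defer_push(struct queue *q, struct node *n)
{
	struct node *first = atomic_load_explicit(&q->defer_list,
						  memory_order_relaxed);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak_explicit(&q->defer_list,
							&first, n,
							memory_order_release,
							memory_order_relaxed));
	return first == NULL;
}

static void drain(struct queue *q, void (*xmit)(struct node *))
{
	struct node *list, *prev = NULL, *next;

	pthread_mutex_lock(&q->lock);
	/* llist_del_all() analogue: detach the whole list in one exchange. */
	list = atomic_exchange_explicit(&q->defer_list, NULL,
					memory_order_acquire);
	/* llist_reverse_order() analogue: restore FIFO order. */
	for (; list; list = next) {
		next = list->next;
		list->next = prev;
		prev = list;
	}
	for (list = prev; list; list = next) {
		next = list->next;	/* xmit() may free the node */
		xmit(list);
	}
	pthread_mutex_unlock(&q->lock);
}

In the patch below, the sender that gets back an empty "first" pointer is the only one that touches the qdisc root lock; every other sender returns NET_XMIT_SUCCESS right after its push, which is where the contention reduction comes from.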
diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h
index 813a19122ebb..cc232508e695 100644
--- a/include/linux/netdevice_xmit.h
+++ b/include/linux/netdevice_xmit.h
@@ -2,6 +2,12 @@
#ifndef _LINUX_NETDEVICE_XMIT_H
#define _LINUX_NETDEVICE_XMIT_H
+#if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
+#define MIRRED_NEST_LIMIT 4
+#endif
+
+struct net_device;
+
struct netdev_xmit {
u16 recursion;
u8 more;
@@ -9,7 +15,8 @@ struct netdev_xmit {
u8 skip_txqueue;
#endif
#if IS_ENABLED(CONFIG_NET_ACT_MIRRED)
- u8 sched_mirred_nest;
+ u8 sched_mirred_nest;
+ struct net_device *sched_mirred_dev[MIRRED_NEST_LIMIT];
#endif
#if IS_ENABLED(CONFIG_NF_DUP_NETDEV)
u8 nf_dup_skb_recursion;
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 738cd5b13c62..94966692ccdf 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -41,13 +41,6 @@ enum qdisc_state_t {
__QDISC_STATE_DRAINING,
};
-enum qdisc_state2_t {
- /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly.
- * Use qdisc_run_begin/end() or qdisc_is_running() instead.
- */
- __QDISC_STATE2_RUNNING,
-};
-
#define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED)
#define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING)
@@ -117,13 +110,14 @@ struct Qdisc {
struct qdisc_skb_head q;
struct gnet_stats_basic_sync bstats;
struct gnet_stats_queue qstats;
- int owner;
+ bool running; /* must be written under qdisc spinlock */
unsigned long state;
- unsigned long state2; /* must be written under qdisc spinlock */
struct Qdisc *next_sched;
struct sk_buff_head skb_bad_txq;
- spinlock_t busylock ____cacheline_aligned_in_smp;
+ atomic_long_t defer_count ____cacheline_aligned_in_smp;
+ struct llist_head defer_list;
+
spinlock_t seqlock;
struct rcu_head rcu;
@@ -168,7 +162,7 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
{
if (qdisc->flags & TCQ_F_NOLOCK)
return spin_is_locked(&qdisc->seqlock);
- return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+ return READ_ONCE(qdisc->running);
}
static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
@@ -211,7 +205,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
*/
return spin_trylock(&qdisc->seqlock);
}
- return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+ if (READ_ONCE(qdisc->running))
+ return false;
+ WRITE_ONCE(qdisc->running, true);
+ return true;
}
static inline void qdisc_run_end(struct Qdisc *qdisc)
@@ -229,7 +226,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
&qdisc->state)))
__netif_schedule(qdisc);
} else {
- __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+ WRITE_ONCE(qdisc->running, false);
}
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 35010faf0b78..821e7c718924 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4125,9 +4125,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
+ struct sk_buff *next, *to_free = NULL;
spinlock_t *root_lock = qdisc_lock(q);
- struct sk_buff *to_free = NULL;
- bool contended;
+ struct llist_node *ll_list, *first_n;
+ unsigned long defer_count = 0;
int rc;
qdisc_calculate_pkt_len(skb, q);
@@ -4167,67 +4168,81 @@ no_lock_out:
return rc;
}
- if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
- kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
- return NET_XMIT_DROP;
- }
- /*
- * Heuristic to force contended enqueues to serialize on a
- * separate lock before trying to get qdisc main lock.
- * This permits qdisc->running owner to get the lock more
- * often and dequeue packets faster.
- * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
- * and then other tasks will only enqueue packets. The packets will be
- * sent after the qdisc owner is scheduled again. To prevent this
- * scenario the task always serialize on the lock.
+ /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+ * In the try_cmpxchg() loop, we want to increment q->defer_count
+ * at most once to limit the number of skbs in defer_list.
+ * We perform the defer_count increment only if the list is not empty,
+ * because some arches have slow atomic_long_inc_return().
+ */
+ first_n = READ_ONCE(q->defer_list.first);
+ do {
+ if (first_n && !defer_count) {
+ defer_count = atomic_long_inc_return(&q->defer_count);
+ if (unlikely(defer_count > q->limit)) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+ return NET_XMIT_DROP;
+ }
+ }
+ skb->ll_node.next = first_n;
+ } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
+
+ /* If defer_list was not empty, we know the cpu which queued
+ * the first skb will process the whole list for us.
*/
- contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
- if (unlikely(contended))
- spin_lock(&q->busylock);
+ if (first_n)
+ return NET_XMIT_SUCCESS;
spin_lock(root_lock);
+
+ ll_list = llist_del_all(&q->defer_list);
+ /* There is a small race because we clear defer_count not atomically
+ * with the prior llist_del_all(). This means defer_list could grow
+ * over q->limit.
+ */
+ atomic_long_set(&q->defer_count, 0);
+
+ ll_list = llist_reverse_order(ll_list);
+
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
- __qdisc_drop(skb, &to_free);
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+ __qdisc_drop(skb, &to_free);
rc = NET_XMIT_DROP;
- } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
- qdisc_run_begin(q)) {
+ goto unlock;
+ }
+ if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+ !llist_next(ll_list) && qdisc_run_begin(q)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
+ DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
+ struct sk_buff,
+ ll_node));
qdisc_bstats_update(q, skb);
-
- if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
- if (unlikely(contended)) {
- spin_unlock(&q->busylock);
- contended = false;
- }
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
__qdisc_run(q);
- }
-
qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
- WRITE_ONCE(q->owner, smp_processor_id());
- rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
- WRITE_ONCE(q->owner, -1);
- if (qdisc_run_begin(q)) {
- if (unlikely(contended)) {
- spin_unlock(&q->busylock);
- contended = false;
- }
- __qdisc_run(q);
- qdisc_run_end(q);
+ int count = 0;
+
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+ prefetch(next);
+ skb_mark_not_on_list(skb);
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ count++;
}
+ qdisc_run(q);
+ if (count != 1)
+ rc = NET_XMIT_SUCCESS;
}
+unlock:
spin_unlock(root_lock);
if (unlikely(to_free))
kfree_skb_list_reason(to_free,
tcf_get_drop_reason(to_free));
- if (unlikely(contended))
- spin_unlock(&q->busylock);
return rc;
}
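
The open-coded try_cmpxchg() loop above also folds a queue limit into the push: q->defer_count is bumped at most once per skb, and only when the list is already non-empty, so the common first-pusher path avoids the atomic increment entirely. Below is a hedged userspace sketch of just that accounting, again with C11 atomics and illustrative names (dqueue, defer_push_limited); it is a simplification, not the kernel code.

/* Bounded lock-free push: increment the shared counter at most once,
 * and only if somebody is already queued ahead of us.
 */
#include <stdatomic.h>

struct dnode {
	struct dnode *next;
};

struct dqueue {
	_Atomic(struct dnode *) defer_list;
	atomic_long defer_count;
	long limit;
};

enum push_result { PUSH_OWNER, PUSH_QUEUED, PUSH_DROP };

static enum push_result defer_push_limited(struct dqueue *q, struct dnode *n)
{
	struct dnode *first = atomic_load_explicit(&q->defer_list,
						   memory_order_relaxed);
	long count = 0;

	do {
		if (first && !count) {
			/* Only increment attempt for this node. */
			count = atomic_fetch_add(&q->defer_count, 1) + 1;
			if (count > q->limit)
				return PUSH_DROP;	/* caller frees n */
		}
		n->next = first;
	} while (!atomic_compare_exchange_weak_explicit(&q->defer_list,
							&first, n,
							memory_order_release,
							memory_order_relaxed));

	return first ? PUSH_QUEUED : PUSH_OWNER;
}

As the comment in the hunk notes, the flusher later resets defer_count after llist_del_all() without any atomicity between the two, so the list can briefly exceed q->limit; the counter is a backstop against unbounded growth, not an exact bound.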
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6be01454f262..8eb3c5820724 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1136,7 +1136,16 @@ void skb_release_head_state(struct sk_buff *skb)
skb_dst_drop(skb);
if (skb->destructor) {
DEBUG_NET_WARN_ON_ONCE(in_hardirq());
- skb->destructor(skb);
+#ifdef CONFIG_INET
+ INDIRECT_CALL_3(skb->destructor,
+ tcp_wfree, __sock_wfree, sock_wfree,
+ skb);
+#else
+ INDIRECT_CALL_1(skb->destructor,
+ sock_wfree,
+ skb);
+
+#endif
}
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb_nfct(skb));
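
The skbuff.c change replaces the plain indirect call to skb->destructor with the kernel's INDIRECT_CALL_*() wrappers, which compare the function pointer against the most likely callees (tcp_wfree, __sock_wfree, sock_wfree) and take a direct call when one matches; that is cheaper than an indirect branch when retpoline-style mitigations are enabled. The macro below is a simplified stand-in for illustration, not the kernel's definition, and the two destructor stubs are made up for the demo.

#include <stdio.h>

/* Simplified INDIRECT_CALL_1(): direct call if the pointer matches the
 * expected callee, otherwise fall back to the indirect call.
 */
#define INDIRECT_CALL_1_SKETCH(f, f1, ...) \
	((f) == (f1) ? (f1)(__VA_ARGS__) : (f)(__VA_ARGS__))

static void sock_wfree_stub(int *skb)  { (*skb)++; }
static void other_destructor(int *skb) { (*skb) += 2; }

int main(void)
{
	void (*destructor)(int *) = sock_wfree_stub;
	int skb = 0;

	/* Common case: pointer matches, compiles to a direct call. */
	INDIRECT_CALL_1_SKETCH(destructor, sock_wfree_stub, &skb);

	destructor = other_destructor;
	/* Uncommon case: genuine indirect call. */
	INDIRECT_CALL_1_SKETCH(destructor, sock_wfree_stub, &skb);

	printf("%d\n", skb);	/* prints 3 */
	return 0;
}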
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 5f01f567c934..f27b583def78 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -29,31 +29,6 @@
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);
-#define MIRRED_NEST_LIMIT 4
-
-#ifndef CONFIG_PREEMPT_RT
-static u8 tcf_mirred_nest_level_inc_return(void)
-{
- return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest);
-}
-
-static void tcf_mirred_nest_level_dec(void)
-{
- __this_cpu_dec(softnet_data.xmit.sched_mirred_nest);
-}
-
-#else
-static u8 tcf_mirred_nest_level_inc_return(void)
-{
- return current->net_xmit.sched_mirred_nest++;
-}
-
-static void tcf_mirred_nest_level_dec(void)
-{
- current->net_xmit.sched_mirred_nest--;
-}
-#endif
-
static bool tcf_mirred_is_act_redirect(int action)
{
return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
@@ -439,44 +414,53 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
{
struct tcf_mirred *m = to_mirred(a);
int retval = READ_ONCE(m->tcf_action);
- unsigned int nest_level;
+ struct netdev_xmit *xmit;
bool m_mac_header_xmit;
struct net_device *dev;
- int m_eaction;
+ int i, m_eaction;
u32 blockid;
- nest_level = tcf_mirred_nest_level_inc_return();
- if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
+#ifdef CONFIG_PREEMPT_RT
+ xmit = &current->net_xmit;
+#else
+ xmit = this_cpu_ptr(&softnet_data.xmit);
+#endif
+ if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) {
net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
netdev_name(skb->dev));
- retval = TC_ACT_SHOT;
- goto dec_nest_level;
+ return TC_ACT_SHOT;
}
tcf_lastuse_update(&m->tcf_tm);
tcf_action_update_bstats(&m->common, skb);
blockid = READ_ONCE(m->tcfm_blockid);
- if (blockid) {
- retval = tcf_blockcast(skb, m, blockid, res, retval);
- goto dec_nest_level;
- }
+ if (blockid)
+ return tcf_blockcast(skb, m, blockid, res, retval);
dev = rcu_dereference_bh(m->tcfm_dev);
if (unlikely(!dev)) {
pr_notice_once("tc mirred: target device is gone\n");
tcf_action_inc_overlimit_qstats(&m->common);
- goto dec_nest_level;
+ return retval;
}
+ for (i = 0; i < xmit->sched_mirred_nest; i++) {
+ if (xmit->sched_mirred_dev[i] != dev)
+ continue;
+ pr_notice_once("tc mirred: loop on device %s\n",
+ netdev_name(dev));
+ tcf_action_inc_overlimit_qstats(&m->common);
+ return retval;
+ }
+
+ xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev;
m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
m_eaction = READ_ONCE(m->tcfm_eaction);
retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction,
retval);
-
-dec_nest_level:
- tcf_mirred_nest_level_dec();
+ xmit->sched_mirred_nest--;
return retval;
}
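
The act_mirred change drops the tcf_mirred_nest_level_*() helpers and does its recursion accounting directly in the netdev_xmit context (per-CPU softnet_data.xmit, or current->net_xmit on PREEMPT_RT): each device the packet has already been redirected to is recorded in sched_mirred_dev[], up to MIRRED_NEST_LIMIT entries, and a redirect back to one of them is refused as a loop. A hedged userspace sketch of that check, with xmit_ctx, mirred_enter() and mirred_exit() as illustrative names:

#include <stdbool.h>
#include <stdio.h>

#define MIRRED_NEST_LIMIT 4	/* same limit the patch uses */

struct xmit_ctx {
	unsigned char nest;
	const void *seen_dev[MIRRED_NEST_LIMIT];
};

/* Returns true if redirecting to @dev is allowed, recording it on success. */
static bool mirred_enter(struct xmit_ctx *ctx, const void *dev)
{
	int i;

	if (ctx->nest >= MIRRED_NEST_LIMIT)
		return false;			/* recursion limit hit */
	for (i = 0; i < ctx->nest; i++)
		if (ctx->seen_dev[i] == dev)
			return false;		/* loop: dev already visited */
	ctx->seen_dev[ctx->nest++] = dev;
	return true;
}

static void mirred_exit(struct xmit_ctx *ctx)
{
	ctx->nest--;
}

int main(void)
{
	struct xmit_ctx ctx = { 0 };
	int eth0, eth1;		/* stand-ins for struct net_device */

	printf("%d\n", mirred_enter(&ctx, &eth0));	/* 1: allowed       */
	printf("%d\n", mirred_enter(&ctx, &eth1));	/* 1: allowed       */
	printf("%d\n", mirred_enter(&ctx, &eth0));	/* 0: loop detected */
	mirred_exit(&ctx);
	mirred_exit(&ctx);
	return 0;
}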
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 1e008a228ebd..d9a98d02a55f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -666,7 +666,6 @@ struct Qdisc noop_qdisc = {
.ops = &noop_qdisc_ops,
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
- .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
.gso_skb = {
.next = (struct sk_buff *)&noop_qdisc.gso_skb,
.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
@@ -679,7 +678,6 @@ struct Qdisc noop_qdisc = {
.qlen = 0,
.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
},
- .owner = -1,
};
EXPORT_SYMBOL(noop_qdisc);
@@ -971,10 +969,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
}
}
- spin_lock_init(&sch->busylock);
- lockdep_set_class(&sch->busylock,
- dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);
-
/* seqlock has the same scope of busylock, for NOLOCK qdisc */
spin_lock_init(&sch->seqlock);
lockdep_set_class(&sch->seqlock,
@@ -985,7 +979,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
sch->enqueue = ops->enqueue;
sch->dequeue = ops->dequeue;
sch->dev_queue = dev_queue;
- sch->owner = -1;
netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL);
refcount_set(&sch->refcnt, 1);
diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt
index 183051ba0cae..6882b8240a8a 100644
--- a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt
+++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt
@@ -23,14 +23,16 @@
// install a qdisc dropping all packets
+0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev tun0 root pfifo limit 0`
+
+0 write(4, ..., 24) = 24
// When qdisc is congested we retry every 500ms
// (TCP_RESOURCE_PROBE_INTERVAL) and therefore
// we retry 6 times before hitting 3s timeout.
// First verify that the connection is alive:
-+3.250 write(4, ..., 24) = 24
++3 write(4, ..., 24) = 24
+
// Now verify that shortly after that the socket is dead:
- +.100 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out)
++1 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out)
+0 %{ assert tcpi_probes == 6, tcpi_probes; \
assert tcpi_backoff == 0, tcpi_backoff }%