summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2025-12-10 01:15:32 -0800
committerJakub Kicinski <kuba@kernel.org>2025-12-10 01:15:33 -0800
commit6bcb7727d9e612011b70d64a34401688b986d6ab (patch)
treee84afd235ba47350a3981cfc3f770aa5dd32bb5b
parent2f6e056e95ff5020260ccfd85391a6474d87e4b5 (diff)
parent92df4c56cf5b739c2977001c581badeaf82b9857 (diff)
Merge branch 'inet-frags-flush-pending-skbs-in-fqdir_pre_exit'
Jakub Kicinski says: ==================== inet: frags: flush pending skbs in fqdir_pre_exit() Fix the issue reported by NIPA starting on Sep 18th [1], where pernet_ops_rwsem is constantly held by a reader, preventing writers from grabbing it (specifically driver modules from loading). The fact that reports started around that time seems coincidental. The issue seems to be skbs queued for defrag preventing conntrack from exiting. First patch fixes another theoretical issue, it's mostly a leftover from an attempt to get rid of the inet_frag_queue refcnt, which I gave up on (still think it's doable but a bit of a time sink). Second patch is a minor refactor. The real fix is in the third patch. It's the simplest fix I can think of which is to flush the frag queues. Perhaps someone has a better suggestion? Last patch adds an explicit warning for conntrack getting stuck, as this seems like something that can easily happen if bugs sneak in. The warning will hopefully save us the first 20% of the investigation effort. Link: https://lore.kernel.org/20251001082036.0fc51440@kernel.org # [1] ==================== Link: https://patch.msgid.link/20251207010942.1672972-1-kuba@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--include/net/inet_frag.h18
-rw-r--r--include/net/ipv6_frag.h9
-rw-r--r--net/ipv4/inet_fragment.c55
-rw-r--r--net/ipv4/ip_fragment.c22
-rw-r--r--net/netfilter/nf_conntrack_core.c3
5 files changed, 72 insertions, 35 deletions
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 0eccd9c3a883..365925c9d262 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -123,27 +123,15 @@ void inet_frags_fini(struct inet_frags *);
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net);
-static inline void fqdir_pre_exit(struct fqdir *fqdir)
-{
- /* Prevent creation of new frags.
- * Pairs with READ_ONCE() in inet_frag_find().
- */
- WRITE_ONCE(fqdir->high_thresh, 0);
-
- /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire()
- * and ip6frag_expire_frag_queue().
- */
- WRITE_ONCE(fqdir->dead, true);
-}
+void fqdir_pre_exit(struct fqdir *fqdir);
void fqdir_exit(struct fqdir *fqdir);
void inet_frag_kill(struct inet_frag_queue *q, int *refs);
void inet_frag_destroy(struct inet_frag_queue *q);
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
-/* Free all skbs in the queue; return the sum of their truesizes. */
-unsigned int inet_frag_rbtree_purge(struct rb_root *root,
- enum skb_drop_reason reason);
+void inet_frag_queue_flush(struct inet_frag_queue *q,
+ enum skb_drop_reason reason);
static inline void inet_frag_putn(struct inet_frag_queue *q, int refs)
{
diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
index 38ef66826939..41d9fc6965f9 100644
--- a/include/net/ipv6_frag.h
+++ b/include/net/ipv6_frag.h
@@ -69,9 +69,6 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
int refs = 1;
rcu_read_lock();
- /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
- if (READ_ONCE(fq->q.fqdir->dead))
- goto out_rcu_unlock;
spin_lock(&fq->q.lock);
if (fq->q.flags & INET_FRAG_COMPLETE)
@@ -80,6 +77,12 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
fq->q.flags |= INET_FRAG_DROP;
inet_frag_kill(&fq->q, &refs);
+ /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
+ if (READ_ONCE(fq->q.fqdir->dead)) {
+ inet_frag_queue_flush(&fq->q, 0);
+ goto out;
+ }
+
dev = dev_get_by_index_rcu(net, fq->iif);
if (!dev)
goto out;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 025895eb6ec5..001ee5c4d962 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -218,6 +218,41 @@ static int __init inet_frag_wq_init(void)
pure_initcall(inet_frag_wq_init);
+void fqdir_pre_exit(struct fqdir *fqdir)
+{
+ struct inet_frag_queue *fq;
+ struct rhashtable_iter hti;
+
+ /* Prevent creation of new frags.
+ * Pairs with READ_ONCE() in inet_frag_find().
+ */
+ WRITE_ONCE(fqdir->high_thresh, 0);
+
+ /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire()
+ * and ip6frag_expire_frag_queue().
+ */
+ WRITE_ONCE(fqdir->dead, true);
+
+ rhashtable_walk_enter(&fqdir->rhashtable, &hti);
+ rhashtable_walk_start(&hti);
+
+ while ((fq = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(fq)) {
+ if (PTR_ERR(fq) != -EAGAIN)
+ break;
+ continue;
+ }
+ spin_lock_bh(&fq->lock);
+ if (!(fq->flags & INET_FRAG_COMPLETE))
+ inet_frag_queue_flush(fq, 0);
+ spin_unlock_bh(&fq->lock);
+ }
+
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+}
+EXPORT_SYMBOL(fqdir_pre_exit);
+
void fqdir_exit(struct fqdir *fqdir)
{
INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
@@ -263,8 +298,8 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
kmem_cache_free(f->frags_cachep, q);
}
-unsigned int inet_frag_rbtree_purge(struct rb_root *root,
- enum skb_drop_reason reason)
+static unsigned int
+inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason)
{
struct rb_node *p = rb_first(root);
unsigned int sum = 0;
@@ -284,7 +319,17 @@ unsigned int inet_frag_rbtree_purge(struct rb_root *root,
}
return sum;
}
-EXPORT_SYMBOL(inet_frag_rbtree_purge);
+
+void inet_frag_queue_flush(struct inet_frag_queue *q,
+ enum skb_drop_reason reason)
+{
+ unsigned int sum;
+
+ reason = reason ?: SKB_DROP_REASON_FRAG_REASM_TIMEOUT;
+ sum = inet_frag_rbtree_purge(&q->rb_fragments, reason);
+ sub_frag_mem_limit(q->fqdir, sum);
+}
+EXPORT_SYMBOL(inet_frag_queue_flush);
void inet_frag_destroy(struct inet_frag_queue *q)
{
@@ -327,7 +372,9 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
timer_setup(&q->timer, f->frag_expire, 0);
spin_lock_init(&q->lock);
- /* One reference for the timer, one for the hash table. */
+ /* One reference for the timer, one for the hash table.
+ * We never take any extra references, only decrement this field.
+ */
refcount_set(&q->refcnt, 2);
return q;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index f7012479713b..56b0f738d2f2 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -134,11 +134,6 @@ static void ip_expire(struct timer_list *t)
net = qp->q.fqdir->net;
rcu_read_lock();
-
- /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */
- if (READ_ONCE(qp->q.fqdir->dead))
- goto out_rcu_unlock;
-
spin_lock(&qp->q.lock);
if (qp->q.flags & INET_FRAG_COMPLETE)
@@ -146,6 +141,13 @@ static void ip_expire(struct timer_list *t)
qp->q.flags |= INET_FRAG_DROP;
inet_frag_kill(&qp->q, &refs);
+
+ /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */
+ if (READ_ONCE(qp->q.fqdir->dead)) {
+ inet_frag_queue_flush(&qp->q, 0);
+ goto out;
+ }
+
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
@@ -240,16 +242,10 @@ static int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
- unsigned int sum_truesize = 0;
-
- if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
- refcount_inc(&qp->q.refcnt);
+ if (!mod_timer_pending(&qp->q.timer, jiffies + qp->q.fqdir->timeout))
return -ETIMEDOUT;
- }
- sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments,
- SKB_DROP_REASON_FRAG_TOO_FAR);
- sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
+ inet_frag_queue_flush(&qp->q, SKB_DROP_REASON_FRAG_TOO_FAR);
qp->q.flags = 0;
qp->q.len = 0;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0b95f226f211..d1f8eb725d42 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2487,6 +2487,7 @@ void nf_conntrack_cleanup_net(struct net *net)
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
struct nf_ct_iter_data iter_data = {};
+ unsigned long start = jiffies;
struct net *net;
int busy;
@@ -2507,6 +2508,8 @@ i_see_dead_people:
busy = 1;
}
if (busy) {
+ DEBUG_NET_WARN_ONCE(time_after(jiffies, start + 60 * HZ),
+ "conntrack cleanup blocked for 60s");
schedule();
goto i_see_dead_people;
}