From 3cade698881eb238f88cbbfec82acc2110440a3f Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Wed, 15 Apr 2026 14:08:33 +0800 Subject: net: enetc: fix NTMP DMA use-after-free issue The AI-generated review reported a potential DMA use-after-free issue [1]. If netc_xmit_ntmp_cmd() times out and returns an error, the pending command is not explicitly aborted, while ntmp_free_data_mem() unconditionally frees the DMA buffer. If the buffer has already been reallocated elsewhere, this may lead to silent memory corruption. Because the hardware eventually processes the pending command and perform a DMA write of the response to the physical address of the freed buffer. To resolve this issue, this patch does the following modifications: 1. Convert cbdr->ring_lock from a spinlock to a mutex The lock was originally a spinlock in case NTMP operations might be invoked from atomic context. After downstream support for all NTMP tables, no such usage has materialized. A mutex lock is now required because the driver now needs to reclaim used BDs and release associated DMA memory within the lock's context, while dma_free_coherent() might sleep. 2. Introduce software command BD (struct netc_swcbd) The hardware write-back overwrites the addr and len fields of the BD, so the driver cannot rely on the hardware BD to free the associated DMA memory. The driver now maintains a software shadow BD storing the DMA buffer pointer, DMA address, and size. And netc_xmit_ntmp_cmd() only reclaims older BDs when the number of used BDs reaches NETC_CBDR_CLEAN_WORK (16). The software BD enables correct DMA memory release. With this, struct ntmp_dma_buf and ntmp_free_data_mem() are no longer needed and are removed. 3. Require callers to hold ring_lock across netc_xmit_ntmp_cmd() netc_xmit_ntmp_cmd() releases the ring_lock before the caller finishes consuming the response. At this point, if a concurrent thread submits a new command, it may trigger ntmp_clean_cbdr() and free the DMA buffer while it is still in use. Move ring_lock ownership to the caller to ensure the response buffer cannot be reclaimed prematurely. So the helpers ntmp_select_and_lock_cbdr() and ntmp_unlock_cbdr() are added. These changes eliminate the DMA use-after-free condition and ensure safe and consistent BD reclamation and DMA buffer lifecycle management. Fixes: 4701073c3deb ("net: enetc: add initial netc-lib driver to support NTMP") Link: https://lore.kernel.org/netdev/20260403011729.1795413-1-kuba@kernel.org/ # [1] Signed-off-by: Wei Fang Link: https://patch.msgid.link/20260415060833.2303846-3-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/fsl/ntmp.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fsl/ntmp.h b/include/linux/fsl/ntmp.h index 916dc4fe7de3..83a449b4d6ec 100644 --- a/include/linux/fsl/ntmp.h +++ b/include/linux/fsl/ntmp.h @@ -31,6 +31,12 @@ struct netc_tbl_vers { u8 rsst_ver; }; +struct netc_swcbd { + void *buf; + dma_addr_t dma; + size_t size; +}; + struct netc_cbdr { struct device *dev; struct netc_cbdr_regs regs; @@ -44,9 +50,10 @@ struct netc_cbdr { void *addr_base_align; dma_addr_t dma_base; dma_addr_t dma_base_align; + struct netc_swcbd *swcbd; /* Serialize the order of command BD ring */ - spinlock_t ring_lock; + struct mutex ring_lock; }; struct ntmp_user { -- cgit v1.2.3 From 267bf3cf9a6f0ffb98b8afd983c1950e835f07c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Apr 2026 20:03:06 +0000 Subject: tcp: annotate data-races in tcp_get_info_chrono_stats() tcp_get_timestamping_opt_stats() does not own the socket lock, this is intentional. It calls tcp_get_info_chrono_stats() while other threads could change chrono fields in tcp_chrono_set(). I do not think we need coherent TCP socket state snapshot in tcp_get_timestamping_opt_stats(), I chose to only add annotations to keep KCSAN happy. Fixes: 1c885808e456 ("tcp: SOF_TIMESTAMPING_OPT_STATS option for SO_TIMESTAMPING") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260416200319.3608680-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index dfa52ceefd23..674af493882c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2208,10 +2208,14 @@ static inline void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new const u32 now = tcp_jiffies32; enum tcp_chrono old = tp->chrono_type; + /* Following WRITE_ONCE()s pair with READ_ONCE()s in + * tcp_get_info_chrono_stats(). + */ if (old > TCP_CHRONO_UNSPEC) - tp->chrono_stat[old - 1] += now - tp->chrono_start; - tp->chrono_start = now; - tp->chrono_type = new; + WRITE_ONCE(tp->chrono_stat[old - 1], + tp->chrono_stat[old - 1] + now - tp->chrono_start); + WRITE_ONCE(tp->chrono_start, now); + WRITE_ONCE(tp->chrono_type, new); } static inline void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) -- cgit v1.2.3 From 829ba1f329cb7cbd56d599a6d225997fba66dc32 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Apr 2026 20:03:08 +0000 Subject: tcp: add data-races annotations around tp->reordering, tp->snd_cwnd tcp_get_timestamping_opt_stats() intentionally runs lockless, we must add READ_ONCE(), WRITE_ONCE() data_race() annotations to keep KCSAN happy. Fixes: bb7c19f96012 ("tcp: add related fields into SCM_TIMESTAMPING_OPT_STATS") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260416200319.3608680-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 674af493882c..ecbadcb3a744 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1513,7 +1513,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) { WARN_ON_ONCE((int)val <= 0); - tp->snd_cwnd = val; + WRITE_ONCE(tp->snd_cwnd, val); } static inline bool tcp_in_slow_start(const struct tcp_sock *tp) -- cgit v1.2.3 From faa886ad3ce5fc8f5156493491fe189b2b726bc9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Apr 2026 20:03:10 +0000 Subject: tcp: annotate data-races around tp->delivered and tp->delivered_ce tcp_get_timestamping_opt_stats() intentionally runs lockless, we must add READ_ONCE() and WRITE_ONCE() annotations to keep KCSAN happy. Fixes: feb5f2ec6464 ("tcp: export packets delivery info") Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260416200319.3608680-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp_ecn.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index e9a933641636..865d5c5a7718 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -181,7 +181,7 @@ static inline void tcp_accecn_third_ack(struct sock *sk, tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && !tp->delivered_ce) - tp->delivered_ce++; + WRITE_ONCE(tp->delivered_ce, 1); } break; } -- cgit v1.2.3 From cc1ff87bce1ccd38410ab10960f576dcd17db679 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Wed, 15 Apr 2026 10:24:51 +0800 Subject: pppoe: drop PFC frames RFC 2516 Section 7 states that Protocol Field Compression (PFC) is NOT RECOMMENDED for PPPoE. In practice, pppd does not support negotiating PFC for PPPoE sessions, and the current PPPoE driver assumes an uncompressed (2-byte) protocol field. However, the generic PPP layer function ppp_input() is not aware of the negotiation result, and still accepts PFC frames. If a peer with a broken implementation or an attacker sends a frame with a compressed (1-byte) protocol field, the subsequent PPP payload is shifted by one byte. This causes the network header to be 4-byte misaligned, which may trigger unaligned access exceptions on some architectures. To reduce the attack surface, drop PPPoE PFC frames. Introduce ppp_skb_is_compressed_proto() helper function to be used in both ppp_generic.c and pppoe.c to avoid open-coding. Fixes: 7fb1b8ca8fa1 ("ppp: Move PFC decompression to PPP generic layer") Signed-off-by: Qingfang Deng Reviewed-by: Simon Horman Link: https://patch.msgid.link/20260415022456.141758-2-qingfang.deng@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/ppp_defs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/ppp_defs.h b/include/linux/ppp_defs.h index b7e57fdbd413..b1d1f46d7d3b 100644 --- a/include/linux/ppp_defs.h +++ b/include/linux/ppp_defs.h @@ -8,6 +8,7 @@ #define _PPP_DEFS_H_ #include +#include #include #define PPP_FCS(fcs, c) crc_ccitt_byte(fcs, c) @@ -25,4 +26,19 @@ static inline bool ppp_proto_is_valid(u16 proto) return !!((proto & 0x0101) == 0x0001); } +/** + * ppp_skb_is_compressed_proto - checks if PPP protocol in a skb is compressed + * @skb: skb to check + * + * Check if the PPP protocol field is compressed (the least significant + * bit of the most significant octet is 1). skb->data must point to the PPP + * protocol header. + * + * Return: Whether the PPP protocol field is compressed. + */ +static inline bool ppp_skb_is_compressed_proto(const struct sk_buff *skb) +{ + return unlikely(skb->data[0] & 0x01); +} + #endif /* _PPP_DEFS_H_ */ -- cgit v1.2.3 From a663bac71a2f0b3ac6c373168ca57b2a6e6381aa Mon Sep 17 00:00:00 2001 From: Yuan Zhaoming Date: Fri, 17 Apr 2026 22:13:40 +0800 Subject: net: mctp: fix don't require received header reserved bits to be zero From the MCTP Base specification (DSP0236 v1.2.1), the first byte of the MCTP header contains a 4 bit reserved field, and 4 bit version. On our current receive path, we require those 4 reserved bits to be zero, but the 9500-8i card is non-conformant, and may set these reserved bits. DSP0236 states that the reserved bits must be written as zero, and ignored when read. While the device might not conform to the former, we should accept these message to conform to the latter. Relax our check on the MCTP version byte to allow non-zero bits in the reserved field. Fixes: 889b7da23abf ("mctp: Add initial routing framework") Signed-off-by: Yuan Zhaoming Cc: stable@vger.kernel.org Acked-by: Jeremy Kerr Link: https://patch.msgid.link/20260417141340.5306-1-yuanzhaoming901030@126.com Signed-off-by: Jakub Kicinski --- include/net/mctp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/mctp.h b/include/net/mctp.h index e1e0a69afdce..d8bf9074110d 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -26,6 +26,9 @@ struct mctp_hdr { #define MCTP_VER_MIN 1 #define MCTP_VER_MAX 1 +/* Definitions for ver field */ +#define MCTP_HDR_VER_MASK GENMASK(3, 0) + /* Definitions for flags_seq_tag field */ #define MCTP_HDR_FLAG_SOM BIT(7) #define MCTP_HDR_FLAG_EOM BIT(6) -- cgit v1.2.3 From db9e726525e45dbd713c07897a4d20bc18333ccc Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 16 Apr 2026 11:56:58 -0700 Subject: net: add address list snapshot and reconciliation infrastructure Introduce __hw_addr_list_snapshot() and __hw_addr_list_reconcile() for use by the upcoming ndo_set_rx_mode_async callback. The async rx_mode path needs to snapshot the device's unicast and multicast address lists under the addr_lock, hand those snapshots to the driver (which may sleep), and then propagate any sync_cnt changes back to the real lists. Two identical snapshots are taken: a work copy for the driver to pass to __hw_addr_sync_dev() and a reference copy to compute deltas against. __hw_addr_list_reconcile() walks the reference snapshot comparing each entry against the work snapshot to determine what the driver synced or unsynced. It then applies those deltas to the real list, handling concurrent modifications: - If the real entry was concurrently removed but the driver synced it to hardware (delta > 0), re-insert a stale entry so the next work run properly unsyncs it from hardware. - If the entry still exists, apply the delta normally. An entry whose refcount drops to zero is removed. # dev_addr_test_snapshot_benchmark: 1024 addrs x 1000 snapshots: 89872802 ns total, 89872 ns/iter # dev_addr_test_snapshot_benchmark.speed: slow Reviewed-by: Aleksandr Loktionov Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20260416185712.2155425-2-sdf@fomichev.me Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7969fcdd5ac4..a84c55488b8c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5004,6 +5004,13 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, int (*unsync)(struct net_device *, const unsigned char *)); void __hw_addr_init(struct netdev_hw_addr_list *list); +void __hw_addr_flush(struct netdev_hw_addr_list *list); +int __hw_addr_list_snapshot(struct netdev_hw_addr_list *snap, + const struct netdev_hw_addr_list *list, + int addr_len); +void __hw_addr_list_reconcile(struct netdev_hw_addr_list *real_list, + struct netdev_hw_addr_list *work, + struct netdev_hw_addr_list *ref, int addr_len); /* Functions used for device addresses handling */ void dev_addr_mod(struct net_device *dev, unsigned int offset, -- cgit v1.2.3 From 3554b4345d855089ab7af5e3557f5dc3262d14c9 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 16 Apr 2026 11:56:59 -0700 Subject: net: introduce ndo_set_rx_mode_async and netdev_rx_mode_work Add ndo_set_rx_mode_async callback that drivers can implement instead of the legacy ndo_set_rx_mode. The legacy callback runs under the netif_addr_lock spinlock with BHs disabled, preventing drivers from sleeping. The async variant runs from a work queue with rtnl_lock and netdev_lock_ops held, in fully sleepable context. When __dev_set_rx_mode() sees ndo_set_rx_mode_async, it schedules netdev_rx_mode_work instead of calling the driver inline. The work function takes two snapshots of each address list (uc/mc) under the addr_lock, then drops the lock and calls the driver with the work copies. After the driver returns, it reconciles the snapshots back to the real lists under the lock. Add netif_rx_mode_sync() to opportunistically execute the pending workqueue update inline, so that rx mode changes are committed before returning to userspace: - dev_change_flags (SIOCSIFFLAGS / RTM_NEWLINK) - dev_set_promiscuity - dev_set_allmulti - dev_ifsioc SIOCADDMULTI / SIOCDELMULTI - do_setlink (RTM_SETLINK) Note that some deep hierarchies still do skip the lower updates via: - dev_uc_sync - dev_mc_sync If we do end up hitting user-visible issues, we can add more calls to netif_rx_mode_sync in specific places. But hopefully we should not, the actual user-visible lists are still synced, it's that just HW state that might be lagging. Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20260416185712.2155425-3-sdf@fomichev.me Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a84c55488b8c..6ed97f4c3bc6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1119,6 +1119,16 @@ struct netdev_net_notifier { * This function is called device changes address list filtering. * If driver handles unicast address filtering, it should set * IFF_UNICAST_FLT in its priv_flags. + * Cannot sleep, called with netif_addr_lock_bh held. + * Deprecated in favor of ndo_set_rx_mode_async. + * + * void (*ndo_set_rx_mode_async)(struct net_device *dev, + * struct netdev_hw_addr_list *uc, + * struct netdev_hw_addr_list *mc); + * Async version of ndo_set_rx_mode which runs in process context + * with rtnl_lock and netdev_lock_ops(dev) held. The uc/mc parameters + * are snapshots of the address lists - iterate with + * netdev_hw_addr_list_for_each(ha, uc). * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address @@ -1439,6 +1449,10 @@ struct net_device_ops { void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); + void (*ndo_set_rx_mode_async)( + struct net_device *dev, + struct netdev_hw_addr_list *uc, + struct netdev_hw_addr_list *mc); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); @@ -1903,6 +1917,8 @@ enum netdev_reg_state { * has been enabled due to the need to listen to * additional unicast addresses in a device that * does not implement ndo_set_rx_mode() + * @rx_mode_node: List entry for rx_mode work processing + * @rx_mode_tracker: Refcount tracker for rx_mode work * @uc: unicast mac addresses * @mc: multicast mac addresses * @dev_addrs: list of device hw addresses @@ -2294,6 +2310,8 @@ struct net_device { unsigned int promiscuity; unsigned int allmulti; bool uc_promisc; + struct list_head rx_mode_node; + netdevice_tracker rx_mode_tracker; #ifdef CONFIG_LOCKDEP unsigned char nested_level; #endif -- cgit v1.2.3 From a4c833278144917982510ca43a3438155756122a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 16 Apr 2026 11:57:00 -0700 Subject: net: cache snapshot entries for ndo_set_rx_mode_async Add a per-device netdev_hw_addr_list cache (rx_mode_addr_cache) that allows __hw_addr_list_snapshot() and __hw_addr_list_reconcile() to reuse previously allocated entries instead of hitting GFP_ATOMIC on every snapshot cycle. snapshot pops entries from the cache when available, falling back to __hw_addr_create(). reconcile splices both snapshot lists back into the cache via __hw_addr_splice(). The cache is flushed in free_netdev(). Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20260416185712.2155425-4-sdf@fomichev.me Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6ed97f4c3bc6..97b435da5771 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1919,6 +1919,7 @@ enum netdev_reg_state { * does not implement ndo_set_rx_mode() * @rx_mode_node: List entry for rx_mode work processing * @rx_mode_tracker: Refcount tracker for rx_mode work + * @rx_mode_addr_cache: Recycled snapshot entries for rx_mode work * @uc: unicast mac addresses * @mc: multicast mac addresses * @dev_addrs: list of device hw addresses @@ -2312,6 +2313,7 @@ struct net_device { bool uc_promisc; struct list_head rx_mode_node; netdevice_tracker rx_mode_tracker; + struct netdev_hw_addr_list rx_mode_addr_cache; #ifdef CONFIG_LOCKDEP unsigned char nested_level; #endif @@ -5025,10 +5027,11 @@ void __hw_addr_init(struct netdev_hw_addr_list *list); void __hw_addr_flush(struct netdev_hw_addr_list *list); int __hw_addr_list_snapshot(struct netdev_hw_addr_list *snap, const struct netdev_hw_addr_list *list, - int addr_len); + int addr_len, struct netdev_hw_addr_list *cache); void __hw_addr_list_reconcile(struct netdev_hw_addr_list *real_list, struct netdev_hw_addr_list *work, - struct netdev_hw_addr_list *ref, int addr_len); + struct netdev_hw_addr_list *ref, int addr_len, + struct netdev_hw_addr_list *cache); /* Functions used for device addresses handling */ void dev_addr_mod(struct net_device *dev, unsigned int offset, -- cgit v1.2.3 From 3bfcf396081ace536733b454ff128d53116581e5 Mon Sep 17 00:00:00 2001 From: Kohei Enju Date: Mon, 20 Apr 2026 10:54:23 +0000 Subject: net: validate skb->napi_id in RX tracepoints Since commit 2bd82484bb4c ("xps: fix xps for stacked devices"), skb->napi_id shares storage with sender_cpu. RX tracepoints using net_dev_rx_verbose_template read skb->napi_id directly and can therefore report sender_cpu values as if they were NAPI IDs. For example, on the loopback path this can report 1 as napi_id, where 1 comes from raw_smp_processor_id() + 1 in the XPS path: # bpftrace -e 'tracepoint:net:netif_rx_entry{ print(args->napi_id); }' # taskset -c 0 ping -c 1 ::1 Report only valid NAPI IDs in these tracepoints and use 0 otherwise. Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices") Signed-off-by: Kohei Enju Reviewed-by: Simon Horman Reviewed-by: Jiayuan Chen Link: https://patch.msgid.link/20260420105427.162816-1-kohei@enjuk.jp Signed-off-by: Jakub Kicinski --- include/trace/events/net.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/net.h b/include/trace/events/net.h index fdd9ad474ce3..dbc2c5598e35 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -10,6 +10,7 @@ #include #include #include +#include TRACE_EVENT(net_dev_start_xmit, @@ -208,7 +209,8 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, TP_fast_assign( __assign_str(name); #ifdef CONFIG_NET_RX_BUSY_POLL - __entry->napi_id = skb->napi_id; + __entry->napi_id = napi_id_valid(skb->napi_id) ? + skb->napi_id : 0; #else __entry->napi_id = 0; #endif -- cgit v1.2.3 From 5154561d9b119f781249f8e845fecf059b38b483 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Apr 2026 14:29:44 +0000 Subject: net/sched: sch_pie: annotate data-races in pie_dump_stats() pie_dump_stats() only runs with RTNL held, reading fields that can be changed in qdisc fast path. Add READ_ONCE()/WRITE_ONCE() annotations. Alternative would be to acquire the qdisc spinlock, but our long-term goal is to make qdisc dump operations lockless as much as we can. tc_pie_xstats fields don't need to be latched atomically, otherwise this bug would have been caught earlier. Fixes: edb09eb17ed8 ("net: sched: do not acquire qdisc spinlock in qdisc/class stats dump") Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260421142944.4009941-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/pie.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/pie.h b/include/net/pie.h index 01cbc66825a4..1f3db0c35514 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -104,7 +104,7 @@ static inline void pie_vars_init(struct pie_vars *vars) vars->dq_tstamp = DTIME_INVALID; vars->accu_prob = 0; vars->dq_count = DQCOUNT_INVALID; - vars->avg_dq_rate = 0; + WRITE_ONCE(vars->avg_dq_rate, 0); } static inline struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) -- cgit v1.2.3 From fc69decc811b155a0ed8eef17ee940f28c4f6dbc Mon Sep 17 00:00:00 2001 From: Longxuan Yu Date: Mon, 20 Apr 2026 11:18:45 +0800 Subject: 8021q: use RCU for egress QoS mappings The TX fast path and reporting paths walk egress QoS mappings without RTNL. Convert the mapping lists to RCU-protected pointers, use RCU reader annotations in readers, and defer freeing mapping nodes with an embedded rcu_head. This prepares the egress QoS mapping code for safe removal of mapping nodes in a follow-up change while preserving the current behavior. Co-developed-by: Yuan Tan Signed-off-by: Yuan Tan Signed-off-by: Longxuan Yu Signed-off-by: Ren Wei Link: https://patch.msgid.link/9136768189f8c6d3f824f476c62d2fa1111688e8.1776647968.git.yuantan098@gmail.com Signed-off-by: Paolo Abeni --- include/linux/if_vlan.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e6272f9c5e42..20cc16ea4e5a 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -147,11 +147,13 @@ extern __be16 vlan_dev_vlan_proto(const struct net_device *dev); * @priority: skb priority * @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000 * @next: pointer to next struct + * @rcu: used for deferred freeing of mapping nodes */ struct vlan_priority_tci_mapping { u32 priority; u16 vlan_qos; - struct vlan_priority_tci_mapping *next; + struct vlan_priority_tci_mapping __rcu *next; + struct rcu_head rcu; }; struct proc_dir_entry; @@ -177,7 +179,7 @@ struct vlan_dev_priv { unsigned int nr_ingress_mappings; u32 ingress_priority_map[8]; unsigned int nr_egress_mappings; - struct vlan_priority_tci_mapping *egress_priority_map[16]; + struct vlan_priority_tci_mapping __rcu *egress_priority_map[16]; __be16 vlan_proto; u16 vlan_id; @@ -209,19 +211,24 @@ static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { struct vlan_priority_tci_mapping *mp; + u16 vlan_qos = 0; - smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */ + rcu_read_lock(); - mp = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)]; + mp = rcu_dereference(vlan_dev_priv(dev)->egress_priority_map[skprio & 0xF]); while (mp) { if (mp->priority == skprio) { - return mp->vlan_qos; /* This should already be shifted - * to mask correctly with the - * VLAN's TCI */ + vlan_qos = READ_ONCE(mp->vlan_qos); + break; } - mp = mp->next; + mp = rcu_dereference(mp->next); } - return 0; + rcu_read_unlock(); + + /* This should already be shifted to mask correctly with + * the VLAN's TCI. + */ + return vlan_qos; } extern bool vlan_do_receive(struct sk_buff **skb); -- cgit v1.2.3 From def304aae2edf321d2671fd6ca766a93c21f877e Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Apr 2026 17:14:31 +0100 Subject: rxrpc: Fix rxkad crypto unalignment handling Fix handling of a packet with a misaligned crypto length. Also handle non-ENOMEM errors from decryption by aborting. Further, remove the WARN_ON_ONCE() so that it can't be remotely triggered (a trace line can still be emitted). Fixes: f93af41b9f5f ("rxrpc: Fix missing error checks for rxkad encryption/decryption failure") Closes: https://sashiko.dev/#/patchset/20260408121252.2249051-1-dhowells%40redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260422161438.2593376-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 578b8038b211..5820d7e41ea0 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -37,6 +37,7 @@ EM(rxkad_abort_1_short_encdata, "rxkad1-short-encdata") \ EM(rxkad_abort_1_short_header, "rxkad1-short-hdr") \ EM(rxkad_abort_2_short_check, "rxkad2-short-check") \ + EM(rxkad_abort_2_crypto_unaligned, "rxkad2-crypto-unaligned") \ EM(rxkad_abort_2_short_data, "rxkad2-short-data") \ EM(rxkad_abort_2_short_header, "rxkad2-short-hdr") \ EM(rxkad_abort_2_short_len, "rxkad2-short-len") \ -- cgit v1.2.3 From 1f2740150f904bfa60e4bad74d65add3ccb5e7f8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 22 Apr 2026 17:14:32 +0100 Subject: rxrpc: Fix potential UAF after skb_unshare() failure If skb_unshare() fails to unshare a packet due to allocation failure in rxrpc_input_packet(), the skb pointer in the parent (rxrpc_io_thread()) will be NULL'd out. This will likely cause the call to trace_rxrpc_rx_done() to oops. Fix this by moving the unsharing down to where rxrpc_input_call_event() calls rxrpc_input_call_packet(). There are a number of places prior to that where we ignore DATA packets for a variety of reasons (such as the call already being complete) for which an unshare is then avoided. And with that, rxrpc_input_packet() doesn't need to take a pointer to the pointer to the packet, so change that to just a pointer. Fixes: 2d1faf7a0ca3 ("rxrpc: Simplify skbuff accounting in receive path") Closes: https://sashiko.dev/#/patchset/20260408121252.2249051-1-dhowells%40redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260422161438.2593376-4-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 5820d7e41ea0..13b9d017f8e1 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -162,8 +162,6 @@ E_(rxrpc_call_poke_timer_now, "Timer-now") #define rxrpc_skb_traces \ - EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ - EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ EM(rxrpc_skb_get_call_rx, "GET call-rx ") \ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ @@ -190,6 +188,7 @@ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_purge_oob, "PUT purge-oob") \ EM(rxrpc_skb_put_response, "PUT response ") \ + EM(rxrpc_skb_put_response_copy, "PUT resp-cpy ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ @@ -198,6 +197,7 @@ EM(rxrpc_skb_see_recvmsg_oob, "SEE recvm-oob") \ EM(rxrpc_skb_see_reject, "SEE reject ") \ EM(rxrpc_skb_see_rotate, "SEE rotate ") \ + EM(rxrpc_skb_see_unshare_nomem, "SEE unshar-nm") \ E_(rxrpc_skb_see_version, "SEE version ") #define rxrpc_local_traces \ -- cgit v1.2.3 From 0422e7a4883f25101903f3e8105c0808aa5f4ce9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Apr 2026 21:09:07 +0100 Subject: rxrpc: Fix re-decryption of RESPONSE packets If a RESPONSE packet gets a temporary failure during processing, it may end up in a partially decrypted state - and then get requeued for a retry. Fix this by just discarding the packet; we will send another CHALLENGE packet and thereby elicit a further response. Similarly, discard an incoming CHALLENGE packet if we get an error whilst generating a RESPONSE; the server will send another CHALLENGE. Fixes: 17926a79320a ("[AF_RXRPC]: Provide secure RxRPC sockets for use by userspace and kernel both") Closes: https://sashiko.dev/#/patchset/20260422161438.2593376-4-dhowells@redhat.com Signed-off-by: David Howells cc: Marc Dionne cc: Jeffrey Altman cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/20260423200909.3049438-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 13b9d017f8e1..573f2df3a2c9 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -285,7 +285,6 @@ EM(rxrpc_conn_put_unidle, "PUT unidle ") \ EM(rxrpc_conn_put_work, "PUT work ") \ EM(rxrpc_conn_queue_challenge, "QUE chall ") \ - EM(rxrpc_conn_queue_retry_work, "QUE retry-wk") \ EM(rxrpc_conn_queue_rx_work, "QUE rx-work ") \ EM(rxrpc_conn_see_new_service_conn, "SEE new-svc ") \ EM(rxrpc_conn_see_reap_service, "SEE reap-svc") \ -- cgit v1.2.3