summaryrefslogtreecommitdiff
path: root/include/net/sock.h
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-10-02 15:17:01 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-10-02 15:17:01 -0700
commit07fdad3a93756b872da7b53647715c48d0f4a2d0 (patch)
tree133af559ac91e6b24358b57a025abc060a782129 /include/net/sock.h
parentf79e772258df311c2cb21594ca0996318e720d28 (diff)
parentf1455695d2d99894b65db233877acac9a0e120b9 (diff)
Merge tag 'net-next-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "Core & protocols: - Improve drop account scalability on NUMA hosts for RAW and UDP sockets and the backlog, almost doubling the Pps capacity under DoS - Optimize the UDP RX performance under stress, reducing contention, revisiting the binary layout of the involved data structs and implementing NUMA-aware locking. This improves UDP RX performance by an additional 50%, even more under extreme conditions - Add support for PSP encryption of TCP connections; this mechanism has some similarities with IPsec and TLS, but offers superior HW offloads capabilities - Ongoing work to support Accurate ECN for TCP. AccECN allows more than one congestion notification signal per RTT and is a building block for Low Latency, Low Loss, and Scalable Throughput (L4S) - Reorganize the TCP socket binary layout for data locality, reducing the number of touched cachelines in the fastpath - Refactor skb deferral free to better scale on large multi-NUMA hosts, this improves TCP and UDP RX performances significantly on such HW - Increase the default socket memory buffer limits from 256K to 4M to better fit modern link speeds - Improve handling of setups with a large number of nexthop, making dump operating scaling linearly and avoiding unneeded synchronize_rcu() on delete - Improve bridge handling of VLAN FDB, storing a single entry per bridge instead of one entry per port; this makes the dump order of magnitude faster on large switches - Restore IP ID correctly for encapsulated packets at GSO segmentation time, allowing GRO to merge packets in more scenarios - Improve netfilter matching performance on large sets - Improve MPTCP receive path performance by leveraging recently introduced core infrastructure (skb deferral free) and adopting recent TCP autotuning changes - Allow bridges to redirect to a backup port when the bridge port is administratively down - Introduce MPTCP 'laminar' endpoint that con be used only once per connection and simplify common MPTCP setups - Add RCU safety to dst->dev, closing a lot of possible races - A significant crypto library API for SCTP, MPTCP and IPv6 SR, reducing code duplication - Supports pulling data from an skb frag into the linear area of an XDP buffer Things we sprinkled into general kernel code: - Generate netlink documentation from YAML using an integrated YAML parser Driver API: - Support using IPv6 Flow Label in Rx hash computation and RSS queue selection - Introduce API for fetching the DMA device for a given queue, allowing TCP zerocopy RX on more H/W setups - Make XDP helpers compatible with unreadable memory, allowing more easily building DevMem-enabled drivers with a unified XDP/skbs datapath - Add a new dedicated ethtool callback enabling drivers to provide the number of RX rings directly, improving efficiency and clarity in RX ring queries and RSS configuration - Introduce a burst period for the health reporter, allowing better handling of multiple errors due to the same root cause - Support for DPLL phase offset exponential moving average, controlling the average smoothing factor Device drivers: - Add a new Huawei driver for 3rd gen NIC (hinic3) - Add a new SpacemiT driver for K1 ethernet MAC - Add a generic abstraction for shared memory communication devices (dibps) - Ethernet high-speed NICs: - nVidia/Mellanox: - Use multiple per-queue doorbell, to avoid MMIO contention issues - support adjacent functions, allowing them to delegate their SR-IOV VFs to sibling PFs - support RSS for IPSec offload - support exposing raw cycle counters in PTP and mlx5 - support for disabling host PFs. - Intel (100G, ice, idpf): - ice: support for SRIOV VFs over an Active-Active link aggregate - ice: support for firmware logging via debugfs - ice: support for Earliest TxTime First (ETF) hardware offload - idpf: support basic XDP functionalities and XSk - Broadcom (bnxt): - support Hyper-V VF ID - dynamic SRIOV resource allocations for RoCE - Meta (fbnic): - support queue API, zero-copy Rx and Tx - support basic XDP functionalities - devlink health support for FW crashes and OTP mem corruptions - expand hardware stats coverage to FEC, PHY, and Pause - Wangxun: - support ethtool coalesce options - support for multiple RSS contexts - Ethernet virtual: - Macsec: - replace custom netlink attribute checks with policy-level checks - Bonding: - support aggregator selection based on port priority - Microsoft vNIC: - use page pool fragments for RX buffers instead of full pages to improve memory efficiency - Ethernet NICs consumer, and embedded: - Qualcomm: support Ethernet function for IPQ9574 SoC - Airoha: implement wlan offloading via NPU - Freescale - enetc: add NETC timer PTP driver and add PTP support - fec: enable the Jumbo frame support for i.MX8QM - Renesas (R-Car S4): - support HW offloading for layer 2 switching - support for RZ/{T2H, N2H} SoCs - Cadence (macb): support TAPRIO traffic scheduling - TI: - support for Gigabit ICSS ethernet SoC (icssm-prueth) - Synopsys (stmmac): a lot of cleanups - Ethernet PHYs: - Support 10g-qxgmi phy-mode for AQR412C, Felix DSA and Lynx PCS driver - Support bcm63268 GPHY power control - Support for Micrel lan8842 PHY and PTP - Support for Aquantia AQR412 and AQR115 - CAN: - a large CAN-XL preparation work - reorganize raw_sock and uniqframe struct to minimize memory usage - rcar_canfd: update the CAN-FD handling - WiFi: - extended Neighbor Awareness Networking (NAN) support - S1G channel representation cleanup - improve S1G support - WiFi drivers: - Intel (iwlwifi): - major refactor and cleanup - Broadcom (brcm80211): - support for AP isolation - RealTek (rtw88/89) rtw88/89: - preparation work for RTL8922DE support - MediaTek (mt76): - HW restart improvements - MLO support - Qualcomm/Atheros (ath10k): - GTK rekey fixes - Bluetooth drivers: - btusb: support for several new IDs for MT7925 - btintel: support for BlazarIW core - btintel_pcie: support for _suspend() / _resume() - btintel_pcie: support for Scorpious, Panther Lake-H484 IDs" * tag 'net-next-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1536 commits) net: stmmac: Add support for Allwinner A523 GMAC200 dt-bindings: net: sun8i-emac: Add A523 GMAC200 compatible Revert "Documentation: net: add flow control guide and document ethtool API" octeontx2-pf: fix bitmap leak octeontx2-vf: fix bitmap leak net/mlx5e: Use extack in set rxfh callback net/mlx5e: Introduce mlx5e_rss_params for RSS configuration net/mlx5e: Introduce mlx5e_rss_init_params net/mlx5e: Remove unused mdev param from RSS indir init net/mlx5: Improve QoS error messages with actual depth values net/mlx5e: Prevent entering switchdev mode with inconsistent netns net/mlx5: HWS, Generalize complex matchers net/mlx5: Improve write-combining test reliability for ARM64 Grace CPUs selftests/net: add tcp_port_share to .gitignore Revert "net/mlx5e: Update and set Xon/Xoff upon MTU set" net: add NUMA awareness to skb_attempt_defer_free() net: use llist for sd->defer_list net: make softnet_data.defer_count an atomic selftests: drv-net: psp: add tests for destroying devices selftests: drv-net: psp: add test for auto-adjusting TCP MSS ...
Diffstat (limited to 'include/net/sock.h')
-rw-r--r--include/net/sock.h135
1 files changed, 101 insertions, 34 deletions
diff --git a/include/net/sock.h b/include/net/sock.h
index 2e14283c5be1..60bcb13f045c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -249,6 +249,7 @@ struct sk_filter;
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
+ * @psp_assoc: PSP association, if socket is PSP-secured
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
* @sk_tsq_flags: TCP Small Queues flags
@@ -282,6 +283,7 @@ struct sk_filter;
* @sk_err_soft: errors that don't cause failure but are the cause of a
* persistent failure not just 'timed out'
* @sk_drops: raw/udp drops counter
+ * @sk_drop_counters: optional pointer to numa_drop_counters
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner
@@ -444,10 +446,15 @@ struct sock {
__cacheline_group_begin(sock_read_rxtx);
int sk_err;
struct socket *sk_socket;
+#ifdef CONFIG_MEMCG
struct mem_cgroup *sk_memcg;
+#endif
#ifdef CONFIG_XFRM
struct xfrm_policy __rcu *sk_policy[2];
#endif
+#if IS_ENABLED(CONFIG_INET_PSP)
+ struct psp_assoc __rcu *psp_assoc;
+#endif
__cacheline_group_end(sock_read_rxtx);
__cacheline_group_begin(sock_write_rxtx);
@@ -460,7 +467,7 @@ struct sock {
__cacheline_group_begin(sock_write_tx);
int sk_write_pending;
atomic_t sk_omem_alloc;
- int sk_sndbuf;
+ int sk_err_soft;
int sk_wmem_queued;
refcount_t sk_wmem_alloc;
@@ -485,6 +492,9 @@ struct sock {
long sk_sndtimeo;
u32 sk_priority;
u32 sk_mark;
+ kuid_t sk_uid;
+ u16 sk_protocol;
+ u16 sk_type;
struct dst_entry __rcu *sk_dst_cache;
netdev_features_t sk_route_caps;
#ifdef CONFIG_SOCK_VALIDATE_XMIT
@@ -497,6 +507,7 @@ struct sock {
unsigned int sk_gso_max_size;
gfp_t sk_allocation;
u32 sk_txhash;
+ int sk_sndbuf;
u8 sk_pacing_shift;
bool sk_use_task_frag;
__cacheline_group_end(sock_read_tx);
@@ -510,15 +521,11 @@ struct sock {
sk_no_check_tx : 1,
sk_no_check_rx : 1;
u8 sk_shutdown;
- u16 sk_type;
- u16 sk_protocol;
unsigned long sk_lingertime;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
- int sk_err_soft;
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
- kuid_t sk_uid;
unsigned long sk_ino;
spinlock_t sk_peer_lock;
int sk_bind_phc;
@@ -564,6 +571,7 @@ struct sock {
#ifdef CONFIG_BPF_SYSCALL
struct bpf_local_storage __rcu *sk_bpf_storage;
#endif
+ struct numa_drop_counters *sk_drop_counters;
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
struct xarray sk_user_frags;
@@ -1346,8 +1354,6 @@ struct proto {
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */
- unsigned int __percpu *orphan_count;
-
struct request_sock_ops *rsk_prot;
struct timewait_sock_ops *twsk_prot;
@@ -1488,6 +1494,10 @@ static inline int __sk_prot_rehash(struct sock *sk)
#define SOCK_BINDADDR_LOCK 4
#define SOCK_BINDPORT_LOCK 8
+/**
+ * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time
+ */
+#define SOCK_CONNECT_BIND 16
struct socket_alloc {
struct socket socket;
@@ -2604,6 +2614,50 @@ static inline gfp_t gfp_memcg_charge(void)
return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
}
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
+{
+ return sk->sk_memcg;
+}
+
+static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
+{
+ return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk);
+}
+
+static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
+#ifdef CONFIG_MEMCG_V1
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return !!memcg->tcpmem_pressure;
+#endif /* CONFIG_MEMCG_V1 */
+
+ do {
+ if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg)))
+ return true;
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ return false;
+}
+#else
+static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
+{
+ return NULL;
+}
+
+static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
+{
+ return false;
+}
+
+static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
+{
+ return false;
+}
+#endif
+
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo);
@@ -2646,18 +2700,53 @@ struct sock_skb_cb {
#define sock_skb_cb_check_size(size) \
BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)
+static inline void sk_drops_add(struct sock *sk, int segs)
+{
+ struct numa_drop_counters *ndc = sk->sk_drop_counters;
+
+ if (ndc)
+ numa_drop_add(ndc, segs);
+ else
+ atomic_add(segs, &sk->sk_drops);
+}
+
+static inline void sk_drops_inc(struct sock *sk)
+{
+ sk_drops_add(sk, 1);
+}
+
+static inline int sk_drops_read(const struct sock *sk)
+{
+ const struct numa_drop_counters *ndc = sk->sk_drop_counters;
+
+ if (ndc) {
+ DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops));
+ return numa_drop_read(ndc);
+ }
+ return atomic_read(&sk->sk_drops);
+}
+
+static inline void sk_drops_reset(struct sock *sk)
+{
+ struct numa_drop_counters *ndc = sk->sk_drop_counters;
+
+ if (ndc)
+ numa_drop_reset(ndc);
+ atomic_set(&sk->sk_drops, 0);
+}
+
static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
- atomic_read(&sk->sk_drops) : 0;
+ sk_drops_read(sk) : 0;
}
-static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
+static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb)
{
int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
- atomic_add(segs, &sk->sk_drops);
+ sk_drops_add(sk, segs);
}
static inline ktime_t sock_read_timestamp(struct sock *sk)
@@ -2876,28 +2965,6 @@ sk_requests_wifi_status(struct sock *sk)
return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS);
}
-/* Checks if this SKB belongs to an HW offloaded socket
- * and whether any SW fallbacks are required based on dev.
- * Check decrypted mark in case skb_orphan() cleared socket.
- */
-static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
- struct net_device *dev)
-{
-#ifdef CONFIG_SOCK_VALIDATE_XMIT
- struct sock *sk = skb->sk;
-
- if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
- skb = sk->sk_validate_xmit_skb(sk, dev, skb);
- } else if (unlikely(skb_is_decrypted(skb))) {
- pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
- kfree_skb(skb);
- skb = NULL;
- }
-#endif
-
- return skb;
-}
-
/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV
* SYNACK messages can be attached to either ones (depending on SYNCOOKIE)
*/
@@ -2934,8 +3001,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo);
*/
#define _SK_MEM_PACKETS 256
#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
-#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
-#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;