summaryrefslogtreecommitdiff
path: root/net/xfrm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-03-26 21:48:21 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-03-26 21:48:21 -0700
commit1a9239bb4253f9076b5b4b2a1a4e8d7defd77a95 (patch)
tree286dda5e84757594218e684b94b01b3a3cac15a2 /net/xfrm
parente61f33273ca755b3e2ebee4520a76097199dc7a8 (diff)
parent023b1e9d265ca0662111a9df23d22b4632717a8a (diff)
Merge tag 'net-next-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Continue Netlink conversions to per-namespace RTNL lock (IPv4 routing, routing rules, routing next hops, ARP ioctls) - Continue extending the use of netdev instance locks. As a driver opt-in protect queue operations and (in due course) ethtool operations with the instance lock and not RTNL lock. - Support collecting TCP timestamps (data submitted, sent, acked) in BPF, allowing for transparent (to the application) and lower overhead tracking of TCP RPC performance. - Tweak existing networking Rx zero-copy infra to support zero-copy Rx via io_uring. - Optimize MPTCP performance in single subflow mode by 29%. - Enable GRO on packets which went thru XDP CPU redirect (were queued for processing on a different CPU). Improving TCP stream performance up to 2x. - Improve performance of contended connect() by 200% by searching for an available 4-tuple under RCU rather than a spin lock. Bring an additional 229% improvement by tweaking hash distribution. - Avoid unconditionally touching sk_tsflags on RX, improving performance under UDP flood by as much as 10%. - Avoid skb_clone() dance in ping_rcv() to improve performance under ping flood. - Avoid FIB lookup in netfilter if socket is available, 20% perf win. - Rework network device creation (in-kernel) API to more clearly identify network namespaces and their roles. There are up to 4 namespace roles but we used to have just 2 netns pointer arguments, interpreted differently based on context. - Use sysfs_break_active_protection() instead of trylock to avoid deadlocks between unregistering objects and sysfs access. - Add a new sysctl and sockopt for capping max retransmit timeout in TCP. - Support masking port and DSCP in routing rule matches. - Support dumping IPv4 multicast addresses with RTM_GETMULTICAST. - Support specifying at what time packet should be sent on AF_XDP sockets. - Expose TCP ULP diagnostic info (for TLS and MPTCP) to non-admin users. - Add Netlink YAML spec for WiFi (nl80211) and conntrack. - Introduce EXPORT_IPV6_MOD() and EXPORT_IPV6_MOD_GPL() for symbols which only need to be exported when IPv6 support is built as a module. - Age FDB entries based on Rx not Tx traffic in VxLAN, similar to normal bridging. - Allow users to specify source port range for GENEVE tunnels. - netconsole: allow attaching kernel release, CPU ID and task name to messages as metadata Driver API: - Continue rework / fixing of Energy Efficient Ethernet (EEE) across the SW layers. Delegate the responsibilities to phylink where possible. Improve its handling in phylib. - Support symmetric OR-XOR RSS hashing algorithm. - Support tracking and preserving IRQ affinity by NAPI itself. - Support loopback mode speed selection for interface selftests. Device drivers: - Remove the IBM LCS driver for s390 - Remove the sb1000 cable modem driver - Add support for SFP module access over SMBus - Add MCTP transport driver for MCTP-over-USB - Enable XDP metadata support in multiple drivers - Ethernet high-speed NICs: - Broadcom (bnxt): - add PCIe TLP Processing Hints (TPH) support for new AMD platforms - support dumping RoCE queue state for debug - opt into instance locking - Intel (100G, ice, idpf): - ice: rework MSI-X IRQ management and distribution - ice: support for E830 devices - iavf: add support for Rx timestamping - iavf: opt into instance locking - nVidia/Mellanox: - mlx4: use page pool memory allocator for Rx - mlx5: support for one PTP device per hardware clock - mlx5: support for 200Gbps per-lane link modes - mlx5: move IPSec policy check after decryption - AMD/Solarflare: - support FW flashing via devlink - Cisco (enic): - use page pool memory allocator for Rx - enable 32, 64 byte CQEs - get max rx/tx ring size from the device - Meta (fbnic): - support flow steering and RSS configuration - report queue stats - support TCP segmentation - support IRQ coalescing - support ring size configuration - Marvell/Cavium: - support AF_XDP - Wangxun: - support for PTP clock and timestamping - Huawei (hibmcge): - checksum offload - add more statistics - Ethernet virtual: - VirtIO net: - aggressively suppress Tx completions, improve perf by 96% with 1 CPU and 55% with 2 CPUs - expose NAPI to IRQ mapping and persist NAPI settings - Google (gve): - support XDP in DQO RDA Queue Format - opt into instance locking - Microsoft vNIC: - support BIG TCP - Ethernet NICs consumer, and embedded: - Synopsys (stmmac): - cleanup Tx and Tx clock setting and other link-focused cleanups - enable SGMII and 2500BASEX mode switching for Intel platforms - support Sophgo SG2044 - Broadcom switches (b53): - support for BCM53101 - TI: - iep: add perout configuration support - icssg: support XDP - Cadence (macb): - implement BQL - Xilinx (axinet): - support dynamic IRQ moderation and changing coalescing at runtime - implement BQL - report standard stats - MediaTek: - support phylink managed EEE - Intel: - igc: don't restart the interface on every XDP program change - RealTek (r8169): - support reading registers of internal PHYs directly - increase max jumbo packet size on RTL8125/RTL8126 - Airoha: - support for RISC-V NPU packet processing unit - enable scatter-gather and support MTU up to 9kB - Tehuti (tn40xx): - support cards with TN4010 MAC and an Aquantia AQR105 PHY - Ethernet PHYs: - support for TJA1102S, TJA1121 - dp83tg720: add randomized polling intervals for link detection - dp83822: support changing the transmit amplitude voltage - support for LEDs on 88q2xxx - CAN: - canxl: support Remote Request Substitution bit access - flexcan: add S32G2/S32G3 SoC - WiFi: - remove cooked monitor support - strict mode for better AP testing - basic EPCS support - OMI RX bandwidth reduction support - batman-adv: add support for jumbo frames - WiFi drivers: - RealTek (rtw88): - support RTL8814AE and RTL8814AU - RealTek (rtw89): - switch using wiphy_lock and wiphy_work - add BB context to manipulate two PHY as preparation of MLO - improve BT-coexistence mechanism to play A2DP smoothly - Intel (iwlwifi): - add new iwlmld sub-driver for latest HW/FW combinations - MediaTek (mt76): - preparation for mt7996 Multi-Link Operation (MLO) support - Qualcomm/Atheros (ath12k): - continued work on MLO - Silabs (wfx): - Wake-on-WLAN support - Bluetooth: - add support for skb TX SND/COMPLETION timestamping - hci_core: enable buffer flow control for SCO/eSCO - coredump: log devcd dumps into the monitor - Bluetooth drivers: - intel: add support to configure TX power - nxp: handle bootloader error during cmd5 and cmd7" * tag 'net-next-6.15' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1681 commits) unix: fix up for "apparmor: add fine grained af_unix mediation" mctp: Fix incorrect tx flow invalidation condition in mctp-i2c net: usb: asix: ax88772: Increase phy_name size net: phy: Introduce PHY_ID_SIZE — minimum size for PHY ID string net: libwx: fix Tx L4 checksum net: libwx: fix Tx descriptor content for some tunnel packets atm: Fix NULL pointer dereference net: tn40xx: add pci-id of the aqr105-based Tehuti TN4010 cards net: tn40xx: prepare tn40xx driver to find phy of the TN9510 card net: tn40xx: create swnode for mdio and aqr105 phy and add to mdiobus net: phy: aquantia: add essential functions to aqr105 driver net: phy: aquantia: search for firmware-name in fwnode net: phy: aquantia: add probe function to aqr105 for firmware loading net: phy: Add swnode support to mdiobus_scan gve: add XDP DROP and PASS support for DQ gve: update XDP allocation path support RX buffer posting gve: merge packet buffer size fields gve: update GQ RX to use buf_size gve: introduce config-based allocation for XDP gve: remove xdp_xsk_done and xdp_xsk_wakeup statistics ...
Diffstat (limited to 'net/xfrm')
-rw-r--r--net/xfrm/xfrm_device.c46
-rw-r--r--net/xfrm/xfrm_interface_core.c15
-rw-r--r--net/xfrm/xfrm_output.c6
-rw-r--r--net/xfrm/xfrm_policy.c2
-rw-r--r--net/xfrm/xfrm_state.c54
-rw-r--r--net/xfrm/xfrm_user.c14
6 files changed, 84 insertions, 53 deletions
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index d1fa94e52cea..d62f76161d83 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -244,11 +244,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
xfrm_address_t *daddr;
bool is_packet_offload;
- if (!x->type_offload) {
- NL_SET_ERR_MSG(extack, "Type doesn't support offload");
- return -EINVAL;
- }
-
if (xuo->flags &
~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) {
NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
@@ -310,6 +305,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
return -EINVAL;
}
+ xfrm_set_type_offload(x);
+ if (!x->type_offload) {
+ NL_SET_ERR_MSG(extack, "Type doesn't support offload");
+ dev_put(dev);
+ return -EINVAL;
+ }
+
xso->dev = dev;
netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC);
xso->real_dev = dev;
@@ -332,6 +334,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
netdev_put(dev, &xso->dev_tracker);
xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+ xfrm_unset_type_offload(x);
/* User explicitly requested packet offload mode and configured
* policy in addition to the XFRM state. So be civil to users,
* and return an error instead of taking fallback path.
@@ -415,14 +418,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
struct dst_entry *dst = skb_dst(skb);
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
struct net_device *dev = x->xso.dev;
+ bool check_tunnel_size;
- if (!x->type_offload ||
- (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap))
+ if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED)
return false;
- if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET ||
- ((!dev || (dev == xfrm_dst_path(dst)->dev)) &&
- !xdst->child->xfrm)) {
+ if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) {
mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
if (skb->len <= mtu)
goto ok;
@@ -434,8 +435,29 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
return false;
ok:
- if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_offload_ok)
- return x->xso.dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x);
+ check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET &&
+ x->props.mode == XFRM_MODE_TUNNEL;
+ switch (x->props.family) {
+ case AF_INET:
+ /* Check for IPv4 options */
+ if (ip_hdr(skb)->ihl != 5)
+ return false;
+ if (check_tunnel_size && xfrm4_tunnel_check_size(skb))
+ return false;
+ break;
+ case AF_INET6:
+ /* Check for IPv6 extensions */
+ if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr))
+ return false;
+ if (check_tunnel_size && xfrm6_tunnel_check_size(skb))
+ return false;
+ break;
+ default:
+ break;
+ }
+
+ if (dev->xfrmdev_ops->xdo_dev_offload_ok)
+ return dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x);
return true;
}
diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
index c397eb99d867..622445f041d3 100644
--- a/net/xfrm/xfrm_interface_core.c
+++ b/net/xfrm/xfrm_interface_core.c
@@ -242,10 +242,9 @@ static void xfrmi_dev_free(struct net_device *dev)
gro_cells_destroy(&xi->gro_cells);
}
-static int xfrmi_create(struct net_device *dev)
+static int xfrmi_create(struct net *net, struct net_device *dev)
{
struct xfrm_if *xi = netdev_priv(dev);
- struct net *net = dev_net(dev);
struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
int err;
@@ -814,15 +813,17 @@ static void xfrmi_netlink_parms(struct nlattr *data[],
parms->collect_md = true;
}
-static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
- struct netlink_ext_ack *extack)
+static int xfrmi_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
{
- struct net *net = dev_net(dev);
+ struct nlattr **data = params->data;
struct xfrm_if_parms p = {};
struct xfrm_if *xi;
+ struct net *net;
int err;
+ net = params->link_net ? : dev_net(dev);
xfrmi_netlink_parms(data, &p);
if (p.collect_md) {
struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
@@ -851,7 +852,7 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
xi->net = net;
xi->dev = dev;
- err = xfrmi_create(dev);
+ err = xfrmi_create(net, dev);
return err;
}
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 3cabc87978dd..9077730ff7d0 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -827,7 +827,7 @@ out:
}
EXPORT_SYMBOL_GPL(xfrm_output);
-static int xfrm4_tunnel_check_size(struct sk_buff *skb)
+int xfrm4_tunnel_check_size(struct sk_buff *skb)
{
int mtu, ret = 0;
@@ -853,6 +853,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
out:
return ret;
}
+EXPORT_SYMBOL_GPL(xfrm4_tunnel_check_size);
static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb)
{
@@ -875,7 +876,7 @@ static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb)
}
#if IS_ENABLED(CONFIG_IPV6)
-static int xfrm6_tunnel_check_size(struct sk_buff *skb)
+int xfrm6_tunnel_check_size(struct sk_buff *skb)
{
int mtu, ret = 0;
struct dst_entry *dst = skb_dst(skb);
@@ -905,6 +906,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)
out:
return ret;
}
+EXPORT_SYMBOL_GPL(xfrm6_tunnel_check_size);
#endif
static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 6551e588fe52..30970d40a454 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -3294,7 +3294,7 @@ no_transform:
ok:
xfrm_pols_put(pols, drop_pols);
- if (dst && dst->xfrm &&
+ if (dst->xfrm &&
(dst->xfrm->props.mode == XFRM_MODE_TUNNEL ||
dst->xfrm->props.mode == XFRM_MODE_IPTFS))
dst->flags |= DST_XFRM_TUNNEL;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 9bd14fdb67a5..d896c3fefb07 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -424,18 +424,18 @@ void xfrm_unregister_type_offload(const struct xfrm_type_offload *type,
}
EXPORT_SYMBOL(xfrm_unregister_type_offload);
-static const struct xfrm_type_offload *
-xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load)
+void xfrm_set_type_offload(struct xfrm_state *x)
{
const struct xfrm_type_offload *type = NULL;
struct xfrm_state_afinfo *afinfo;
+ bool try_load = true;
retry:
- afinfo = xfrm_state_get_afinfo(family);
+ afinfo = xfrm_state_get_afinfo(x->props.family);
if (unlikely(afinfo == NULL))
- return NULL;
+ goto out;
- switch (proto) {
+ switch (x->id.proto) {
case IPPROTO_ESP:
type = afinfo->type_offload_esp;
break;
@@ -449,18 +449,16 @@ retry:
rcu_read_unlock();
if (!type && try_load) {
- request_module("xfrm-offload-%d-%d", family, proto);
+ request_module("xfrm-offload-%d-%d", x->props.family,
+ x->id.proto);
try_load = false;
goto retry;
}
- return type;
-}
-
-static void xfrm_put_type_offload(const struct xfrm_type_offload *type)
-{
- module_put(type->owner);
+out:
+ x->type_offload = type;
}
+EXPORT_SYMBOL(xfrm_set_type_offload);
static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = {
[XFRM_MODE_BEET] = {
@@ -609,8 +607,6 @@ static void ___xfrm_state_destroy(struct xfrm_state *x)
kfree(x->coaddr);
kfree(x->replay_esn);
kfree(x->preplay_esn);
- if (x->type_offload)
- xfrm_put_type_offload(x->type_offload);
if (x->type) {
x->type->destructor(x);
xfrm_put_type(x->type);
@@ -784,6 +780,8 @@ void xfrm_dev_state_free(struct xfrm_state *x)
struct xfrm_dev_offload *xso = &x->xso;
struct net_device *dev = READ_ONCE(xso->dev);
+ xfrm_unset_type_offload(x);
+
if (dev && dev->xfrmdev_ops) {
spin_lock_bh(&xfrm_state_dev_gc_lock);
if (!hlist_unhashed(&x->dev_gclist))
@@ -2315,12 +2313,12 @@ xfrm_state_lookup_byaddr(struct net *net, u32 mark,
struct xfrm_hash_state_ptrs state_ptrs;
struct xfrm_state *x;
- spin_lock_bh(&net->xfrm.xfrm_state_lock);
+ rcu_read_lock();
xfrm_hash_ptrs_get(net, &state_ptrs);
x = __xfrm_state_lookup_byaddr(&state_ptrs, mark, daddr, saddr, proto, family);
- spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ rcu_read_unlock();
return x;
}
EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
@@ -3122,8 +3120,7 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
}
EXPORT_SYMBOL_GPL(xfrm_state_mtu);
-int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
- struct netlink_ext_ack *extack)
+int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
const struct xfrm_mode *inner_mode;
const struct xfrm_mode *outer_mode;
@@ -3178,8 +3175,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
goto error;
}
- x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload);
-
err = x->type->init_state(x, extack);
if (err)
goto error;
@@ -3192,12 +3187,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
}
x->outer_mode = *outer_mode;
- if (init_replay) {
- err = xfrm_init_replay(x, extack);
- if (err)
- goto error;
- }
-
if (x->nat_keepalive_interval) {
if (x->dir != XFRM_SA_DIR_OUT) {
NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for outbound SAs");
@@ -3229,11 +3218,16 @@ int xfrm_init_state(struct xfrm_state *x)
{
int err;
- err = __xfrm_init_state(x, true, false, NULL);
- if (!err)
- x->km.state = XFRM_STATE_VALID;
+ err = __xfrm_init_state(x, NULL);
+ if (err)
+ return err;
- return err;
+ err = xfrm_init_replay(x, NULL);
+ if (err)
+ return err;
+
+ x->km.state = XFRM_STATE_VALID;
+ return 0;
}
EXPORT_SYMBOL(xfrm_init_state);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 08c6d6f0179f..784a2d124749 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -178,6 +178,12 @@ static inline int verify_replay(struct xfrm_usersa_info *p,
"Replay seq and seq_hi should be 0 for output SA");
return -EINVAL;
}
+ if (rs->oseq_hi && !(p->flags & XFRM_STATE_ESN)) {
+ NL_SET_ERR_MSG(
+ extack,
+ "Replay oseq_hi should be 0 in non-ESN mode for output SA");
+ return -EINVAL;
+ }
if (rs->bmp_len) {
NL_SET_ERR_MSG(extack, "Replay bmp_len should 0 for output SA");
return -EINVAL;
@@ -190,6 +196,12 @@ static inline int verify_replay(struct xfrm_usersa_info *p,
"Replay oseq and oseq_hi should be 0 for input SA");
return -EINVAL;
}
+ if (rs->seq_hi && !(p->flags & XFRM_STATE_ESN)) {
+ NL_SET_ERR_MSG(
+ extack,
+ "Replay seq_hi should be 0 in non-ESN mode for input SA");
+ return -EINVAL;
+ }
}
return 0;
@@ -907,7 +919,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
goto error;
}
- err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack);
+ err = __xfrm_init_state(x, extack);
if (err)
goto error;