summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 16:43:10 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-28 16:43:10 -0700
commit3a8a670eeeaa40d87bd38a587438952741980c18 (patch)
treed5546d311271503eadf75b45d87e12720e72899f /net/core
parent6a8cbd9253abc1bd0df4d60c4c24fa555190376d (diff)
parentae230642190a51b85656d6da2df744d534d59544 (diff)
Merge tag 'net-next-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking changes from Jakub Kicinski: "WiFi 7 and sendpage changes are the biggest pieces of work for this release. The latter will definitely require fixes but I think that we got it to a reasonable point. Core: - Rework the sendpage & splice implementations Instead of feeding data into sockets page by page extend sendmsg handlers to support taking a reference on the data, controlled by a new flag called MSG_SPLICE_PAGES Rework the handling of unexpected-end-of-file to invoke an additional callback instead of trying to predict what the right combination of MORE/NOTLAST flags is Remove the MSG_SENDPAGE_NOTLAST flag completely - Implement SCM_PIDFD, a new type of CMSG type analogous to SCM_CREDENTIALS, but it contains pidfd instead of plain pid - Enable socket busy polling with CONFIG_RT - Improve reliability and efficiency of reporting for ref_tracker - Auto-generate a user space C library for various Netlink families Protocols: - Allow TCP to shrink the advertised window when necessary, prevent sk_rcvbuf auto-tuning from growing the window all the way up to tcp_rmem[2] - Use per-VMA locking for "page-flipping" TCP receive zerocopy - Prepare TCP for device-to-device data transfers, by making sure that payloads are always attached to skbs as page frags - Make the backoff time for the first N TCP SYN retransmissions linear. Exponential backoff is unnecessarily conservative - Create a new MPTCP getsockopt to retrieve all info (MPTCP_FULL_INFO) - Avoid waking up applications using TLS sockets until we have a full record - Allow using kernel memory for protocol ioctl callbacks, paving the way to issuing ioctls over io_uring - Add nolocalbypass option to VxLAN, forcing packets to be fully encapsulated even if they are destined for a local IP address - Make TCPv4 use consistent hash in TIME_WAIT and SYN_RECV. Ensure in-kernel ECMP implementation (e.g. Open vSwitch) select the same link for all packets. Support L4 symmetric hashing in Open vSwitch - PPPoE: make number of hash bits configurable - Allow DNS to be overwritten by DHCPACK in the in-kernel DHCP client (ipconfig) - Add layer 2 miss indication and filtering, allowing higher layers (e.g. ACL filters) to make forwarding decisions based on whether packet matched forwarding state in lower devices (bridge) - Support matching on Connectivity Fault Management (CFM) packets - Hide the "link becomes ready" IPv6 messages by demoting their printk level to debug - HSR: don't enable promiscuous mode if device offloads the proto - Support active scanning in IEEE 802.15.4 - Continue work on Multi-Link Operation for WiFi 7 BPF: - Add precision propagation for subprogs and callbacks. This allows maintaining verification efficiency when subprograms are used, or in fact passing the verifier at all for complex programs, especially those using open-coded iterators - Improve BPF's {g,s}setsockopt() length handling. Previously BPF assumed the length is always equal to the amount of written data. But some protos allow passing a NULL buffer to discover what the output buffer *should* be, without writing anything - Accept dynptr memory as memory arguments passed to helpers - Add routing table ID to bpf_fib_lookup BPF helper - Support O_PATH FDs in BPF_OBJ_PIN and BPF_OBJ_GET commands - Drop bpf_capable() check in BPF_MAP_FREEZE command (used to mark maps as read-only) - Show target_{obj,btf}_id in tracing link fdinfo - Addition of several new kfuncs (most of the names are self-explanatory): - Add a set of new dynptr kfuncs: bpf_dynptr_adjust(), bpf_dynptr_is_null(), bpf_dynptr_is_rdonly(), bpf_dynptr_size() and bpf_dynptr_clone(). - bpf_task_under_cgroup() - bpf_sock_destroy() - force closing sockets - bpf_cpumask_first_and(), rework bpf_cpumask_any*() kfuncs Netfilter: - Relax set/map validation checks in nf_tables. Allow checking presence of an entry in a map without using the value - Increase ip_vs_conn_tab_bits range for 64BIT builds - Allow updating size of a set - Improve NAT tuple selection when connection is closing Driver API: - Integrate netdev with LED subsystem, to allow configuring HW "offloaded" blinking of LEDs based on link state and activity (i.e. packets coming in and out) - Support configuring rate selection pins of SFP modules - Factor Clause 73 auto-negotiation code out of the drivers, provide common helper routines - Add more fool-proof helpers for managing lifetime of MDIO devices associated with the PCS layer - Allow drivers to report advanced statistics related to Time Aware scheduler offload (taprio) - Allow opting out of VF statistics in link dump, to allow more VFs to fit into the message - Split devlink instance and devlink port operations New hardware / drivers: - Ethernet: - Synopsys EMAC4 IP support (stmmac) - Marvell 88E6361 8 port (5x1GE + 3x2.5GE) switches - Marvell 88E6250 7 port switches - Microchip LAN8650/1 Rev.B0 PHYs - MediaTek MT7981/MT7988 built-in 1GE PHY driver - WiFi: - Realtek RTL8192FU, 2.4 GHz, b/g/n mode, 2T2R, 300 Mbps - Realtek RTL8723DS (SDIO variant) - Realtek RTL8851BE - CAN: - Fintek F81604 Drivers: - Ethernet NICs: - Intel (100G, ice): - support dynamic interrupt allocation - use meta data match instead of VF MAC addr on slow-path - nVidia/Mellanox: - extend link aggregation to handle 4, rather than just 2 ports - spawn sub-functions without any features by default - OcteonTX2: - support HTB (Tx scheduling/QoS) offload - make RSS hash generation configurable - support selecting Rx queue using TC filters - Wangxun (ngbe/txgbe): - add basic Tx/Rx packet offloads - add phylink support (SFP/PCS control) - Freescale/NXP (enetc): - report TAPRIO packet statistics - Solarflare/AMD: - support matching on IP ToS and UDP source port of outer header - VxLAN and GENEVE tunnel encapsulation over IPv4 or IPv6 - add devlink dev info support for EF10 - Virtual NICs: - Microsoft vNIC: - size the Rx indirection table based on requested configuration - support VLAN tagging - Amazon vNIC: - try to reuse Rx buffers if not fully consumed, useful for ARM servers running with 16kB pages - Google vNIC: - support TCP segmentation of >64kB frames - Ethernet embedded switches: - Marvell (mv88e6xxx): - enable USXGMII (88E6191X) - Microchip: - lan966x: add support for Egress Stage 0 ACL engine - lan966x: support mapping packet priority to internal switch priority (based on PCP or DSCP) - Ethernet PHYs: - Broadcom PHYs: - support for Wake-on-LAN for BCM54210E/B50212E - report LPI counter - Microsemi PHYs: support RGMII delay configuration (VSC85xx) - Micrel PHYs: receive timestamp in the frame (LAN8841) - Realtek PHYs: support optional external PHY clock - Altera TSE PCS: merge the driver into Lynx PCS which it is a variant of - CAN: Kvaser PCIEcan: - support packet timestamping - WiFi: - Intel (iwlwifi): - major update for new firmware and Multi-Link Operation (MLO) - configuration rework to drop test devices and split the different families - support for segmented PNVM images and power tables - new vendor entries for PPAG (platform antenna gain) feature - Qualcomm 802.11ax (ath11k): - Multiple Basic Service Set Identifier (MBSSID) and Enhanced MBSSID Advertisement (EMA) support in AP mode - support factory test mode - RealTek (rtw89): - add RSSI based antenna diversity - support U-NII-4 channels on 5 GHz band - RealTek (rtl8xxxu): - AP mode support for 8188f - support USB RX aggregation for the newer chips" * tag 'net-next-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1602 commits) net: scm: introduce and use scm_recv_unix helper af_unix: Skip SCM_PIDFD if scm->pid is NULL. net: lan743x: Simplify comparison netlink: Add __sock_i_ino() for __netlink_diag_dump(). net: dsa: avoid suspicious RCU usage for synced VLAN-aware MAC addresses Revert "af_unix: Call scm_recv() only after scm_set_cred()." phylink: ReST-ify the phylink_pcs_neg_mode() kdoc libceph: Partially revert changes to support MSG_SPLICE_PAGES net: phy: mscc: fix packet loss due to RGMII delays net: mana: use vmalloc_array and vcalloc net: enetc: use vmalloc_array and vcalloc ionic: use vmalloc_array and vcalloc pds_core: use vmalloc_array and vcalloc gve: use vmalloc_array and vcalloc octeon_ep: use vmalloc_array and vcalloc net: usb: qmi_wwan: add u-blox 0x1312 composition perf trace: fix MSG_SPLICE_PAGES build error ipvlan: Fix return value of ipvlan_queue_xmit() netfilter: nf_tables: fix underflow in chain reference counter netfilter: nf_tables: unbind non-anonymous set if rule construction fails ...
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile2
-rw-r--r--net/core/dev.c158
-rw-r--r--net/core/filter.c212
-rw-r--r--net/core/flow_dissector.c40
-rw-r--r--net/core/gro.c114
-rw-r--r--net/core/gso.c273
-rw-r--r--net/core/net_namespace.c4
-rw-r--r--net/core/netdev-genl-gen.c2
-rw-r--r--net/core/netdev-genl-gen.h2
-rw-r--r--net/core/netpoll.c5
-rw-r--r--net/core/pktgen.c13
-rw-r--r--net/core/rtnetlink.c187
-rw-r--r--net/core/skbuff.c308
-rw-r--r--net/core/sock.c160
-rw-r--r--net/core/sock_map.c4
15 files changed, 950 insertions, 534 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 8f367813bc68..731db2eaa610 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -13,7 +13,7 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
fib_notifier.o xdp.o flow_offload.o gro.o \
- netdev-genl.o netdev-genl-gen.o
+ netdev-genl.o netdev-genl-gen.o gso.o
obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
diff --git a/net/core/dev.c b/net/core/dev.c
index c29f3e1db3ca..69a3e544676c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -758,29 +758,43 @@ struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
+/* Deprecated for new users, call netdev_get_by_name() instead */
+struct net_device *dev_get_by_name(struct net *net, const char *name)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_name);
+
/**
- * dev_get_by_name - find a device by its name
+ * netdev_get_by_name() - find a device by its name
* @net: the applicable net namespace
* @name: name to find
+ * @tracker: tracking object for the acquired reference
+ * @gfp: allocation flags for the tracker
*
* Find an interface by name. This can be called from any
* context and does its own locking. The returned handle has
- * the usage count incremented and the caller must use dev_put() to
+ * the usage count incremented and the caller must use netdev_put() to
* release it when it is no longer needed. %NULL is returned if no
* matching device is found.
*/
-
-struct net_device *dev_get_by_name(struct net *net, const char *name)
+struct net_device *netdev_get_by_name(struct net *net, const char *name,
+ netdevice_tracker *tracker, gfp_t gfp)
{
struct net_device *dev;
- rcu_read_lock();
- dev = dev_get_by_name_rcu(net, name);
- dev_hold(dev);
- rcu_read_unlock();
+ dev = dev_get_by_name(net, name);
+ if (dev)
+ netdev_tracker_alloc(dev, tracker, gfp);
return dev;
}
-EXPORT_SYMBOL(dev_get_by_name);
+EXPORT_SYMBOL(netdev_get_by_name);
/**
* __dev_get_by_index - find a device by its ifindex
@@ -831,29 +845,42 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
+/* Deprecated for new users, call netdev_get_by_index() instead */
+struct net_device *dev_get_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_index);
/**
- * dev_get_by_index - find a device by its ifindex
+ * netdev_get_by_index() - find a device by its ifindex
* @net: the applicable net namespace
* @ifindex: index of device
+ * @tracker: tracking object for the acquired reference
+ * @gfp: allocation flags for the tracker
*
* Search for an interface by index. Returns NULL if the device
* is not found or a pointer to the device. The device returned has
* had a reference added and the pointer is safe until the user calls
- * dev_put to indicate they have finished with it.
+ * netdev_put() to indicate they have finished with it.
*/
-
-struct net_device *dev_get_by_index(struct net *net, int ifindex)
+struct net_device *netdev_get_by_index(struct net *net, int ifindex,
+ netdevice_tracker *tracker, gfp_t gfp)
{
struct net_device *dev;
- rcu_read_lock();
- dev = dev_get_by_index_rcu(net, ifindex);
- dev_hold(dev);
- rcu_read_unlock();
+ dev = dev_get_by_index(net, ifindex);
+ if (dev)
+ netdev_tracker_alloc(dev, tracker, gfp);
return dev;
}
-EXPORT_SYMBOL(dev_get_by_index);
+EXPORT_SYMBOL(netdev_get_by_index);
/**
* dev_get_by_napi_id - find a device by napi_id
@@ -3209,7 +3236,7 @@ static u16 skb_tx_hash(const struct net_device *dev,
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
-static void skb_warn_bad_offload(const struct sk_buff *skb)
+void skb_warn_bad_offload(const struct sk_buff *skb)
{
static const netdev_features_t null_features;
struct net_device *dev = skb->dev;
@@ -3338,74 +3365,6 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
return vlan_get_protocol_and_depth(skb, type, depth);
}
-/* openvswitch calls this on rx path, so we need a different check.
- */
-static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
-{
- if (tx_path)
- return skb->ip_summed != CHECKSUM_PARTIAL &&
- skb->ip_summed != CHECKSUM_UNNECESSARY;
-
- return skb->ip_summed == CHECKSUM_NONE;
-}
-
-/**
- * __skb_gso_segment - Perform segmentation on skb.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- * @tx_path: whether it is called in TX path
- *
- * This function segments the given skb and returns a list of segments.
- *
- * It may return NULL if the skb requires no segmentation. This is
- * only possible when GSO is used for verifying header integrity.
- *
- * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
- */
-struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
- netdev_features_t features, bool tx_path)
-{
- struct sk_buff *segs;
-
- if (unlikely(skb_needs_check(skb, tx_path))) {
- int err;
-
- /* We're going to init ->check field in TCP or UDP header */
- err = skb_cow_head(skb, 0);
- if (err < 0)
- return ERR_PTR(err);
- }
-
- /* Only report GSO partial support if it will enable us to
- * support segmentation on this frame without needing additional
- * work.
- */
- if (features & NETIF_F_GSO_PARTIAL) {
- netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
- struct net_device *dev = skb->dev;
-
- partial_features |= dev->features & dev->gso_partial_features;
- if (!skb_gso_ok(skb, features | partial_features))
- features &= ~NETIF_F_GSO_PARTIAL;
- }
-
- BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
- sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
-
- SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
- SKB_GSO_CB(skb)->encap_level = 0;
-
- skb_reset_mac_header(skb);
- skb_reset_mac_len(skb);
-
- segs = skb_mac_gso_segment(skb, features);
-
- if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
- skb_warn_bad_offload(skb);
-
- return segs;
-}
-EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
@@ -6199,7 +6158,8 @@ restart:
if (!napi)
goto out;
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
for (;;) {
int work = 0;
@@ -6241,7 +6201,8 @@ count:
if (unlikely(need_resched())) {
if (napi_poll)
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
rcu_read_unlock();
cond_resched();
if (loop_end(loop_end_arg, start_time))
@@ -6252,7 +6213,8 @@ count:
}
if (napi_poll)
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
- preempt_enable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
out:
rcu_read_unlock();
}
@@ -8822,9 +8784,11 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
if (err)
return err;
- err = ops->ndo_set_mac_address(dev, sa);
- if (err)
- return err;
+ if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
+ err = ops->ndo_set_mac_address(dev, sa);
+ if (err)
+ return err;
+ }
dev->addr_assign_type = NET_ADDR_SET;
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -10570,8 +10534,10 @@ void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
{
WARN_ON(dev->reg_state == NETREG_REGISTERED);
- dev->gro_flush_timeout = 20000;
- dev->napi_defer_hard_irqs = 1;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ dev->gro_flush_timeout = 20000;
+ dev->napi_defer_hard_irqs = 1;
+ }
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
@@ -10632,7 +10598,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
- ref_tracker_dir_init(&dev->refcnt_tracker, 128);
+ ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
#ifdef CONFIG_PCPU_DEV_REFCNT
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
diff --git a/net/core/filter.c b/net/core/filter.c
index d9ce04ca22ce..06ba0e56e369 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3948,20 +3948,21 @@ void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
{
- struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
u32 size = xdp->data_end - xdp->data;
+ struct skb_shared_info *sinfo;
void *addr = xdp->data;
int i;
if (unlikely(offset > 0xffff || len > 0xffff))
return ERR_PTR(-EFAULT);
- if (offset + len > xdp_get_buff_len(xdp))
+ if (unlikely(offset + len > xdp_get_buff_len(xdp)))
return ERR_PTR(-EINVAL);
- if (offset < size) /* linear area */
+ if (likely(offset < size)) /* linear area */
goto out;
+ sinfo = xdp_get_shared_info_from_buff(xdp);
offset -= size;
for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */
u32 frag_size = skb_frag_size(&sinfo->frags[i]);
@@ -5803,6 +5804,12 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
struct fib_table *tb;
+ if (flags & BPF_FIB_LOOKUP_TBID) {
+ tbid = params->tbid;
+ /* zero out for vlan output */
+ params->tbid = 0;
+ }
+
tb = fib_get_table(net, tbid);
if (unlikely(!tb))
return BPF_FIB_LKUP_RET_NOT_FWDED;
@@ -5936,6 +5943,12 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
struct fib6_table *tb;
+ if (flags & BPF_FIB_LOOKUP_TBID) {
+ tbid = params->tbid;
+ /* zero out for vlan output */
+ params->tbid = 0;
+ }
+
tb = ipv6_stub->fib6_get_table(net, tbid);
if (unlikely(!tb))
return BPF_FIB_LKUP_RET_NOT_FWDED;
@@ -6008,7 +6021,7 @@ set_fwd_params:
#endif
#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
- BPF_FIB_LOOKUP_SKIP_NEIGH)
+ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID)
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
struct bpf_fib_lookup *, params, int, plen, u32, flags)
@@ -6555,12 +6568,11 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
static struct sock *
__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
- u64 flags)
+ u64 flags, int sdif)
{
struct sock *sk = NULL;
struct net *net;
u8 family;
- int sdif;
if (len == sizeof(tuple->ipv4))
family = AF_INET;
@@ -6572,10 +6584,12 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX)))
goto out;
- if (family == AF_INET)
- sdif = inet_sdif(skb);
- else
- sdif = inet6_sdif(skb);
+ if (sdif < 0) {
+ if (family == AF_INET)
+ sdif = inet_sdif(skb);
+ else
+ sdif = inet6_sdif(skb);
+ }
if ((s32)netns_id < 0) {
net = caller_net;
@@ -6595,10 +6609,11 @@ out:
static struct sock *
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
- u64 flags)
+ u64 flags, int sdif)
{
struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
- ifindex, proto, netns_id, flags);
+ ifindex, proto, netns_id, flags,
+ sdif);
if (sk) {
struct sock *sk2 = sk_to_full_sk(sk);
@@ -6638,7 +6653,7 @@ bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
}
return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
- netns_id, flags);
+ netns_id, flags, -1);
}
static struct sock *
@@ -6727,6 +6742,78 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
.arg5_type = ARG_ANYTHING,
};
+BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ struct net_device *dev = skb->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
+ .func = bpf_tc_skc_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ struct net_device *dev = skb->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
+ .func = bpf_tc_sk_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ struct net_device *dev = skb->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
+ ifindex, IPPROTO_UDP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
+ .func = bpf_tc_sk_lookup_udp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
if (sk && sk_is_refcounted(sk))
@@ -6744,12 +6831,13 @@ static const struct bpf_func_proto bpf_sk_release_proto = {
BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
- struct net *caller_net = dev_net(ctx->rxq->dev);
- int ifindex = ctx->rxq->dev->ifindex;
+ struct net_device *dev = ctx->rxq->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
ifindex, IPPROTO_UDP, netns_id,
- flags);
+ flags, sdif);
}
static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
@@ -6767,12 +6855,13 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
- struct net *caller_net = dev_net(ctx->rxq->dev);
- int ifindex = ctx->rxq->dev->ifindex;
+ struct net_device *dev = ctx->rxq->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
ifindex, IPPROTO_TCP, netns_id,
- flags);
+ flags, sdif);
}
static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
@@ -6790,12 +6879,13 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
- struct net *caller_net = dev_net(ctx->rxq->dev);
- int ifindex = ctx->rxq->dev->ifindex;
+ struct net_device *dev = ctx->rxq->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
ifindex, IPPROTO_TCP, netns_id,
- flags);
+ flags, sdif);
}
static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
@@ -6815,7 +6905,8 @@ BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
{
return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
sock_net(ctx->sk), 0,
- IPPROTO_TCP, netns_id, flags);
+ IPPROTO_TCP, netns_id, flags,
+ -1);
}
static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
@@ -6834,7 +6925,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
{
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
sock_net(ctx->sk), 0, IPPROTO_TCP,
- netns_id, flags);
+ netns_id, flags, -1);
}
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
@@ -6853,7 +6944,7 @@ BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
{
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
sock_net(ctx->sk), 0, IPPROTO_UDP,
- netns_id, flags);
+ netns_id, flags, -1);
}
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
@@ -6916,6 +7007,8 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
FIELD)); \
} while (0)
+ BTF_TYPE_EMIT(struct bpf_tcp_sock);
+
switch (si->off) {
case offsetof(struct bpf_tcp_sock, rtt_min):
BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
@@ -7980,9 +8073,9 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
#endif
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
- return &bpf_sk_lookup_tcp_proto;
+ return &bpf_tc_sk_lookup_tcp_proto;
case BPF_FUNC_sk_lookup_udp:
- return &bpf_sk_lookup_udp_proto;
+ return &bpf_tc_sk_lookup_udp_proto;
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
case BPF_FUNC_tcp_sock:
@@ -7990,7 +8083,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_listener_sock:
return &bpf_get_listener_sock_proto;
case BPF_FUNC_skc_lookup_tcp:
- return &bpf_skc_lookup_tcp_proto;
+ return &bpf_tc_skc_lookup_tcp_proto;
case BPF_FUNC_tcp_check_syncookie:
return &bpf_tcp_check_syncookie_proto;
case BPF_FUNC_skb_ecn_set_ce:
@@ -11721,3 +11814,66 @@ static int __init bpf_kfunc_init(void)
return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
}
late_initcall(bpf_kfunc_init);
+
+/* Disables missing prototype warnings */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
+ *
+ * The function expects a non-NULL pointer to a socket, and invokes the
+ * protocol specific socket destroy handlers.
+ *
+ * The helper can only be called from BPF contexts that have acquired the socket
+ * locks.
+ *
+ * Parameters:
+ * @sock: Pointer to socket to be destroyed
+ *
+ * Return:
+ * On error, may return EPROTONOSUPPORT, EINVAL.
+ * EPROTONOSUPPORT if protocol specific destroy handler is not supported.
+ * 0 otherwise
+ */
+__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
+{
+ struct sock *sk = (struct sock *)sock;
+
+ /* The locking semantics that allow for synchronous execution of the
+ * destroy handlers are only supported for TCP and UDP.
+ * Supporting protocols will need to acquire sock lock in the BPF context
+ * prior to invoking this kfunc.
+ */
+ if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP &&
+ sk->sk_protocol != IPPROTO_UDP))
+ return -EOPNOTSUPP;
+
+ return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
+}
+
+__diag_pop()
+
+BTF_SET8_START(bpf_sk_iter_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
+BTF_SET8_END(bpf_sk_iter_kfunc_ids)
+
+static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) &&
+ prog->expected_attach_type != BPF_TRACE_ITER)
+ return -EACCES;
+ return 0;
+}
+
+static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_sk_iter_kfunc_ids,
+ .filter = tracing_iter_filter,
+};
+
+static int init_subsystem(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 25fb0bbc310f..85a2d0d9bd39 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -27,6 +27,7 @@
#include <linux/tcp.h>
#include <linux/ptp_classify.h>
#include <net/flow_dissector.h>
+#include <net/pkt_cls.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
#include <linux/bpf.h>
@@ -241,6 +242,15 @@ void skb_flow_dissect_meta(const struct sk_buff *skb,
FLOW_DISSECTOR_KEY_META,
target_container);
meta->ingress_ifindex = skb->skb_iif;
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ if (tc_skb_ext_tc_enabled()) {
+ struct tc_skb_ext *ext;
+
+ ext = skb_ext_find(skb, TC_SKB_EXT);
+ if (ext)
+ meta->l2_miss = ext->l2_miss;
+ }
+#endif
}
EXPORT_SYMBOL(skb_flow_dissect_meta);
@@ -548,6 +558,30 @@ __skb_flow_dissect_arp(const struct sk_buff *skb,
}
static enum flow_dissect_ret
+__skb_flow_dissect_cfm(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_cfm *key, *hdr, _hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM))
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr);
+ if (!hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM,
+ target_container);
+
+ key->mdl_ver = hdr->mdl_ver;
+ key->opcode = hdr->opcode;
+
+ return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
+static enum flow_dissect_ret
__skb_flow_dissect_gre(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
struct flow_dissector *flow_dissector,
@@ -1390,6 +1424,12 @@ proto_again:
break;
}
+ case htons(ETH_P_CFM):
+ fdret = __skb_flow_dissect_cfm(skb, flow_dissector,
+ target_container, data,
+ nhoff, hlen);
+ break;
+
default:
fdret = FLOW_DISSECT_RET_OUT_BAD;
break;
diff --git a/net/core/gro.c b/net/core/gro.c
index 2d84165cb4f1..0759277dc14e 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -10,7 +10,7 @@
#define GRO_MAX_HEAD (MAX_HEADER + 128)
static DEFINE_SPINLOCK(offload_lock);
-static struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base);
+struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base);
/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
int gro_normal_batch __read_mostly = 8;
@@ -92,63 +92,6 @@ void dev_remove_offload(struct packet_offload *po)
}
EXPORT_SYMBOL(dev_remove_offload);
-/**
- * skb_eth_gso_segment - segmentation handler for ethernet protocols.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- * @type: Ethernet Protocol ID
- */
-struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
- netdev_features_t features, __be16 type)
-{
- struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
- struct packet_offload *ptype;
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, &offload_base, list) {
- if (ptype->type == type && ptype->callbacks.gso_segment) {
- segs = ptype->callbacks.gso_segment(skb, features);
- break;
- }
- }
- rcu_read_unlock();
-
- return segs;
-}
-EXPORT_SYMBOL(skb_eth_gso_segment);
-
-/**
- * skb_mac_gso_segment - mac layer segmentation handler.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- */
-struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
-{
- struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
- struct packet_offload *ptype;
- int vlan_depth = skb->mac_len;
- __be16 type = skb_network_protocol(skb, &vlan_depth);
-
- if (unlikely(!type))
- return ERR_PTR(-EINVAL);
-
- __skb_pull(skb, vlan_depth);
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, &offload_base, list) {
- if (ptype->type == type && ptype->callbacks.gso_segment) {
- segs = ptype->callbacks.gso_segment(skb, features);
- break;
- }
- }
- rcu_read_unlock();
-
- __skb_push(skb, skb->data - skb_mac_header(skb));
-
- return segs;
-}
-EXPORT_SYMBOL(skb_mac_gso_segment);
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
{
@@ -239,9 +182,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
- __skb_frag_set_page(frag, page);
- skb_frag_off_set(frag, first_offset);
- skb_frag_size_set(frag, first_size);
+ skb_frag_fill_page_desc(frag, page, first_offset, first_size);
memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
/* We dont need to clear skbinfo->nr_frags here */
@@ -363,6 +304,24 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
}
EXPORT_SYMBOL(napi_gro_flush);
+static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb,
+ const struct sk_buff *p,
+ unsigned long diffs)
+{
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ struct tc_skb_ext *skb_ext;
+ struct tc_skb_ext *p_ext;
+
+ skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+ p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+ diffs |= (!!p_ext) ^ (!!skb_ext);
+ if (!diffs && unlikely(skb_ext))
+ diffs |= p_ext->chain ^ skb_ext->chain;
+#endif
+ return diffs;
+}
+
static void gro_list_prepare(const struct list_head *head,
const struct sk_buff *skb)
{
@@ -397,23 +356,11 @@ static void gro_list_prepare(const struct list_head *head,
* avoid trying too hard to skip each of them individually
*/
if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
-#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- struct tc_skb_ext *skb_ext;
- struct tc_skb_ext *p_ext;
-#endif
-
diffs |= p->sk != skb->sk;
diffs |= skb_metadata_dst_cmp(p, skb);
diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
-#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- skb_ext = skb_ext_find(skb, TC_SKB_EXT);
- p_ext = skb_ext_find(p, TC_SKB_EXT);
-
- diffs |= (!!p_ext) ^ (!!skb_ext);
- if (!diffs && unlikely(skb_ext))
- diffs |= p_ext->chain ^ skb_ext->chain;
-#endif
+ diffs |= gro_list_prepare_tc_ext(skb, p, diffs);
}
NAPI_GRO_CB(p)->same_flow = !diffs;
@@ -460,6 +407,14 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
}
}
+static void gro_try_pull_from_frag0(struct sk_buff *skb)
+{
+ int grow = skb_gro_offset(skb) - skb_headlen(skb);
+
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
+}
+
static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
{
struct sk_buff *oldest;
@@ -489,7 +444,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
struct sk_buff *pp = NULL;
enum gro_result ret;
int same_flow;
- int grow;
if (netif_elide_gro(skb->dev))
goto normal;
@@ -564,17 +518,14 @@ found_ptype:
else
gro_list->count++;
+ /* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */
+ gro_try_pull_from_frag0(skb);
NAPI_GRO_CB(skb)->age = jiffies;
NAPI_GRO_CB(skb)->last = skb;
if (!skb_is_gso(skb))
skb_shinfo(skb)->gso_size = skb_gro_len(skb);
list_add(&skb->list, &gro_list->list);
ret = GRO_HELD;
-
-pull:
- grow = skb_gro_offset(skb) - skb_headlen(skb);
- if (grow > 0)
- gro_pull_from_frag0(skb, grow);
ok:
if (gro_list->count) {
if (!test_bit(bucket, &napi->gro_bitmask))
@@ -587,7 +538,8 @@ ok:
normal:
ret = GRO_NORMAL;
- goto pull;
+ gro_try_pull_from_frag0(skb);
+ goto ok;
}
struct packet_offload *gro_find_receive_by_type(__be16 type)
diff --git a/net/core/gso.c b/net/core/gso.c
new file mode 100644
index 000000000000..9e1803bfc9c6
--- /dev/null
+++ b/net/core/gso.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/skbuff.h>
+#include <linux/sctp.h>
+#include <net/gso.h>
+#include <net/gro.h>
+
+/**
+ * skb_eth_gso_segment - segmentation handler for ethernet protocols.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @type: Ethernet Protocol ID
+ */
+struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, __be16 type)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_eth_gso_segment);
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, vlan_depth);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static bool skb_needs_check(const struct sk_buff *skb, bool tx_path)
+{
+ if (tx_path)
+ return skb->ip_summed != CHECKSUM_PARTIAL &&
+ skb->ip_summed != CHECKSUM_UNNECESSARY;
+
+ return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ *
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, bool tx_path)
+{
+ struct sk_buff *segs;
+
+ if (unlikely(skb_needs_check(skb, tx_path))) {
+ int err;
+
+ /* We're going to init ->check field in TCP or UDP header */
+ err = skb_cow_head(skb, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ /* Only report GSO partial support if it will enable us to
+ * support segmentation on this frame without needing additional
+ * work.
+ */
+ if (features & NETIF_F_GSO_PARTIAL) {
+ netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+ struct net_device *dev = skb->dev;
+
+ partial_features |= dev->features & dev->gso_partial_features;
+ if (!skb_gso_ok(skb, features | partial_features))
+ features &= ~NETIF_F_GSO_PARTIAL;
+ }
+
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
+ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
+
+ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+ SKB_GSO_CB(skb)->encap_level = 0;
+
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ segs = skb_mac_gso_segment(skb, features);
+
+ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
+ skb_warn_bad_offload(skb);
+
+ return segs;
+}
+EXPORT_SYMBOL(__skb_gso_segment);
+
+/**
+ * skb_gso_transport_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_transport_seglen is used to determine the real size of the
+ * individual segments, including Layer4 headers (TCP/UDP).
+ *
+ * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
+ */
+static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ unsigned int thlen = 0;
+
+ if (skb->encapsulation) {
+ thlen = skb_inner_transport_header(skb) -
+ skb_transport_header(skb);
+
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ thlen += inner_tcp_hdrlen(skb);
+ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ thlen = tcp_hdrlen(skb);
+ } else if (unlikely(skb_is_gso_sctp(skb))) {
+ thlen = sizeof(struct sctphdr);
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+ thlen = sizeof(struct udphdr);
+ }
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return thlen + shinfo->gso_size;
+}
+
+/**
+ * skb_gso_network_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_network_seglen is used to determine the real size of the
+ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
+ *
+ * The MAC/L2 header is not accounted for.
+ */
+static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) -
+ skb_network_header(skb);
+
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_mac_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_mac_seglen is used to determine the real size of the
+ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
+ * headers (TCP/UDP).
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
+ *
+ * There are a couple of instances where we have a GSO skb, and we
+ * want to determine what size it would be after it is segmented.
+ *
+ * We might want to check:
+ * - L3+L4+payload size (e.g. IP forwarding)
+ * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
+ *
+ * This is a helper to do that correctly considering GSO_BY_FRAGS.
+ *
+ * @skb: GSO skb
+ *
+ * @seg_len: The segmented length (from skb_gso_*_seglen). In the
+ * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
+ *
+ * @max_len: The maximum permissible length.
+ *
+ * Returns true if the segmented length <= max length.
+ */
+static inline bool skb_gso_size_check(const struct sk_buff *skb,
+ unsigned int seg_len,
+ unsigned int max_len) {
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ const struct sk_buff *iter;
+
+ if (shinfo->gso_size != GSO_BY_FRAGS)
+ return seg_len <= max_len;
+
+ /* Undo this so we can re-use header sizes */
+ seg_len -= GSO_BY_FRAGS;
+
+ skb_walk_frags(skb, iter) {
+ if (seg_len + skb_headlen(iter) > max_len)
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
+ *
+ * @skb: GSO skb
+ * @mtu: MTU to validate against
+ *
+ * skb_gso_validate_network_len validates if a given skb will fit a
+ * wanted MTU once split. It considers L3 headers, L4 headers, and the
+ * payload.
+ */
+bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
+{
+ return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
+
+/**
+ * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
+ *
+ * @skb: GSO skb
+ * @len: length to validate against
+ *
+ * skb_gso_validate_mac_len validates if a given skb will fit a wanted
+ * length once split, including L2, L3 and L4 headers and the payload.
+ */
+bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
+{
+ return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
+
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3e3598cd49f2..f4183c4c1ec8 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -308,7 +308,7 @@ EXPORT_SYMBOL_GPL(get_net_ns_by_id);
/* init code that must occur even if setup_net() is not called. */
static __net_init void preinit_net(struct net *net)
{
- ref_tracker_dir_init(&net->notrefcnt_tracker, 128);
+ ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
}
/*
@@ -322,7 +322,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
LIST_HEAD(net_exit_list);
refcount_set(&net->ns.count, 1);
- ref_tracker_dir_init(&net->refcnt_tracker, 128);
+ ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
refcount_set(&net->passive, 1);
get_random_bytes(&net->hash_mix, sizeof(u32));
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index de17ca2f7dbf..ea9231378aa6 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -8,7 +8,7 @@
#include "netdev-genl-gen.h"
-#include <linux/netdev.h>
+#include <uapi/linux/netdev.h>
/* NETDEV_CMD_DEV_GET - do */
static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = {
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index 74d74fc23167..7b370c073e7d 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -9,7 +9,7 @@
#include <net/netlink.h>
#include <net/genetlink.h>
-#include <linux/netdev.h>
+#include <uapi/linux/netdev.h>
int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e6a739b1afa9..543007f159f9 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -690,7 +690,7 @@ int netpoll_setup(struct netpoll *np)
err = -ENODEV;
goto unlock;
}
- dev_hold(ndev);
+ netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL);
if (netdev_master_upper_dev_get(ndev)) {
np_err(np, "%s is a slave device, aborting\n", np->dev_name);
@@ -783,12 +783,11 @@ put_noaddr:
err = __netpoll_setup(np, ndev);
if (err)
goto put;
- netdev_tracker_alloc(ndev, &np->dev_tracker, GFP_KERNEL);
rtnl_unlock();
return 0;
put:
- dev_put(ndev);
+ netdev_put(ndev, &np->dev_tracker);
unlock:
rtnl_unlock();
return err;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 760238196db1..f56b8d697014 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2785,14 +2785,17 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
break;
}
get_page(pkt_dev->page);
- skb_frag_set_page(skb, i, pkt_dev->page);
- skb_frag_off_set(&skb_shinfo(skb)->frags[i], 0);
+
/*last fragment, fill rest of data*/
if (i == (frags - 1))
- skb_frag_size_set(&skb_shinfo(skb)->frags[i],
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE));
+ skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
+ pkt_dev->page, 0,
+ (datalen < PAGE_SIZE ?
+ datalen : PAGE_SIZE));
else
- skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len);
+ skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
+ pkt_dev->page, 0, frag_len);
+
datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 41de3a2f29e1..3ad4e030846d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -961,24 +961,27 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
nla_total_size(sizeof(struct ifla_vf_rate)) +
nla_total_size(sizeof(struct ifla_vf_link_state)) +
nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
- nla_total_size(0) + /* nest IFLA_VF_STATS */
- /* IFLA_VF_STATS_RX_PACKETS */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_TX_PACKETS */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_RX_BYTES */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_TX_BYTES */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_BROADCAST */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_MULTICAST */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_RX_DROPPED */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_TX_DROPPED */
- nla_total_size_64bit(sizeof(__u64)) +
nla_total_size(sizeof(struct ifla_vf_trust)));
+ if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
+ size += num_vfs *
+ (nla_total_size(0) + /* nest IFLA_VF_STATS */
+ /* IFLA_VF_STATS_RX_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_BROADCAST */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_MULTICAST */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)));
+ }
return size;
} else
return 0;
@@ -1270,7 +1273,8 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct net_device *dev,
int vfs_num,
- struct nlattr *vfinfo)
+ struct nlattr *vfinfo,
+ u32 ext_filter_mask)
{
struct ifla_vf_rss_query_en vf_rss_query_en;
struct nlattr *vf, *vfstats, *vfvlanlist;
@@ -1376,33 +1380,35 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
goto nla_put_vf_failure;
}
nla_nest_end(skb, vfvlanlist);
- memset(&vf_stats, 0, sizeof(vf_stats));
- if (dev->netdev_ops->ndo_get_vf_stats)
- dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
- &vf_stats);
- vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS);
- if (!vfstats)
- goto nla_put_vf_failure;
- if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
- vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
- vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
- vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
- vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
- vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
- vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
- vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
- nla_nest_cancel(skb, vfstats);
- goto nla_put_vf_failure;
+ if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
+ memset(&vf_stats, 0, sizeof(vf_stats));
+ if (dev->netdev_ops->ndo_get_vf_stats)
+ dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
+ &vf_stats);
+ vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS);
+ if (!vfstats)
+ goto nla_put_vf_failure;
+ if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
+ vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
+ vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
+ vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
+ vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
+ vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
+ vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+ vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+ vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
+ nla_nest_cancel(skb, vfstats);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfstats);
}
- nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
return 0;
@@ -1435,7 +1441,7 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
return -EMSGSIZE;
for (i = 0; i < num_vfs; i++) {
- if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
+ if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask))
return -EMSGSIZE;
}
@@ -2377,45 +2383,43 @@ static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
- if (dev) {
- if (tb[IFLA_ADDRESS] &&
- nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
- return -EINVAL;
+ if (tb[IFLA_ADDRESS] &&
+ nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
+ return -EINVAL;
- if (tb[IFLA_BROADCAST] &&
- nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
- return -EINVAL;
+ if (tb[IFLA_BROADCAST] &&
+ nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
+ return -EINVAL;
- if (tb[IFLA_GSO_MAX_SIZE] &&
- nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) {
- NL_SET_ERR_MSG(extack, "too big gso_max_size");
- return -EINVAL;
- }
+ if (tb[IFLA_GSO_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) {
+ NL_SET_ERR_MSG(extack, "too big gso_max_size");
+ return -EINVAL;
+ }
- if (tb[IFLA_GSO_MAX_SEGS] &&
- (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS ||
- nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) {
- NL_SET_ERR_MSG(extack, "too big gso_max_segs");
- return -EINVAL;
- }
+ if (tb[IFLA_GSO_MAX_SEGS] &&
+ (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS ||
+ nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) {
+ NL_SET_ERR_MSG(extack, "too big gso_max_segs");
+ return -EINVAL;
+ }
- if (tb[IFLA_GRO_MAX_SIZE] &&
- nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) {
- NL_SET_ERR_MSG(extack, "too big gro_max_size");
- return -EINVAL;
- }
+ if (tb[IFLA_GRO_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) {
+ NL_SET_ERR_MSG(extack, "too big gro_max_size");
+ return -EINVAL;
+ }
- if (tb[IFLA_GSO_IPV4_MAX_SIZE] &&
- nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) {
- NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size");
- return -EINVAL;
- }
+ if (tb[IFLA_GSO_IPV4_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) {
+ NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size");
+ return -EINVAL;
+ }
- if (tb[IFLA_GRO_IPV4_MAX_SIZE] &&
- nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) {
- NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size");
- return -EINVAL;
- }
+ if (tb[IFLA_GRO_IPV4_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) {
+ NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size");
+ return -EINVAL;
}
if (tb[IFLA_AF_SPEC]) {
@@ -2736,10 +2740,6 @@ static int do_setlink(const struct sk_buff *skb,
char ifname[IFNAMSIZ];
int err;
- err = validate_linkmsg(dev, tb, extack);
- if (err < 0)
- return err;
-
if (tb[IFLA_IFNAME])
nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
else
@@ -3156,6 +3156,10 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
+ err = validate_linkmsg(dev, tb, extack);
+ if (err < 0)
+ goto errout;
+
err = do_setlink(skb, dev, ifm, extack, tb, 0);
errout:
return err;
@@ -3399,6 +3403,9 @@ static int rtnl_group_changelink(const struct sk_buff *skb,
for_each_netdev_safe(net, dev, aux) {
if (dev->group == group) {
+ err = validate_linkmsg(dev, tb, extack);
+ if (err < 0)
+ return err;
err = do_setlink(skb, dev, ifm, extack, tb, 0);
if (err < 0)
return err;
@@ -3556,10 +3563,6 @@ replay:
m_ops = master_dev->rtnl_link_ops;
}
- err = validate_linkmsg(dev, tb, extack);
- if (err < 0)
- return err;
-
if (tb[IFLA_LINKINFO]) {
err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
tb[IFLA_LINKINFO],
@@ -3623,6 +3626,10 @@ replay:
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
+ err = validate_linkmsg(dev, tb, extack);
+ if (err < 0)
+ return err;
+
if (linkinfo[IFLA_INFO_DATA]) {
if (!ops || ops != dev->rtnl_link_ops ||
!ops->changelink)
@@ -4090,7 +4097,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
ndm->ndm_ifindex = dev->ifindex;
ndm->ndm_state = ndm_state;
- if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
+ if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr))
goto nla_put_failure;
if (vid)
if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid))
@@ -4104,10 +4111,10 @@ nla_put_failure:
return -EMSGSIZE;
}
-static inline size_t rtnl_fdb_nlmsg_size(void)
+static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev)
{
return NLMSG_ALIGN(sizeof(struct ndmsg)) +
- nla_total_size(ETH_ALEN) + /* NDA_LLADDR */
+ nla_total_size(dev->addr_len) + /* NDA_LLADDR */
nla_total_size(sizeof(u16)) + /* NDA_VLAN */
0;
}
@@ -4119,7 +4126,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
struct sk_buff *skb;
int err = -ENOBUFS;
- skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC);
+ skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC);
if (!skb)
goto errout;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cea28d30abb5..6c5915efbc17 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -67,6 +67,7 @@
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/gso.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
@@ -92,15 +93,7 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
-/* skb_small_head_cache and related code is only supported
- * for CONFIG_SLAB and CONFIG_SLUB.
- * As soon as SLOB is removed from the kernel, we can clean up this.
- */
-#if !defined(CONFIG_SLOB)
-# define HAVE_SKB_SMALL_HEAD_CACHE 1
-#endif
-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
static struct kmem_cache *skb_small_head_cache __ro_after_init;
#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
@@ -117,7 +110,6 @@ static struct kmem_cache *skb_small_head_cache __ro_after_init;
#define SKB_SMALL_HEAD_HEADROOM \
SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
-#endif /* HAVE_SKB_SMALL_HEAD_CACHE */
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
@@ -562,7 +554,6 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
void *obj;
obj_size = SKB_HEAD_ALIGN(*size);
-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
!(flags & KMALLOC_NOT_NORMAL_BITS)) {
obj = kmem_cache_alloc_node(skb_small_head_cache,
@@ -576,7 +567,6 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node);
goto out;
}
-#endif
*size = obj_size = kmalloc_size_roundup(obj_size);
/*
* Try a regular allocation, when that fails and we're not entitled
@@ -898,11 +888,9 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
static void skb_kfree_head(void *head, unsigned int end_offset)
{
-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
if (end_offset == SKB_SMALL_HEAD_HEADROOM)
kmem_cache_free(skb_small_head_cache, head);
else
-#endif
kfree(head);
}
@@ -2160,7 +2148,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
if (likely(skb_end_offset(skb) == saved_end_offset))
return 0;
-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
/* We can not change skb->end if the original or new value
* is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head().
*/
@@ -2174,7 +2161,6 @@ int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
WARN_ON_ONCE(1);
return 0;
}
-#endif
shinfo = skb_shinfo(skb);
@@ -3003,32 +2989,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
}
EXPORT_SYMBOL_GPL(skb_splice_bits);
-static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
- struct kvec *vec, size_t num, size_t size)
+static int sendmsg_locked(struct sock *sk, struct msghdr *msg)
{
struct socket *sock = sk->sk_socket;
+ size_t size = msg_data_left(msg);
if (!sock)
return -EINVAL;
- return kernel_sendmsg(sock, msg, vec, num, size);
+
+ if (!sock->ops->sendmsg_locked)
+ return sock_no_sendmsg_locked(sk, msg, size);
+
+ return sock->ops->sendmsg_locked(sk, msg, size);
}
-static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
{
struct socket *sock = sk->sk_socket;
if (!sock)
return -EINVAL;
- return kernel_sendpage(sock, page, offset, size, flags);
+ return sock_sendmsg(sock, msg);
}
-typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
- struct kvec *vec, size_t num, size_t size);
-typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
- size_t size, int flags);
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
- int len, sendmsg_func sendmsg, sendpage_func sendpage)
+ int len, sendmsg_func sendmsg)
{
unsigned int orig_len = len;
struct sk_buff *head = skb;
@@ -3048,8 +3034,9 @@ do_frag_list:
memset(&msg, 0, sizeof(msg));
msg.msg_flags = MSG_DONTWAIT;
- ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
- sendmsg_unlocked, sk, &msg, &kv, 1, slen);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
+ ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
+ sendmsg_unlocked, sk, &msg);
if (ret <= 0)
goto error;
@@ -3080,11 +3067,18 @@ do_frag_list:
slen = min_t(size_t, len, skb_frag_size(frag) - offset);
while (slen) {
- ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
- sendpage_unlocked, sk,
- skb_frag_page(frag),
- skb_frag_off(frag) + offset,
- slen, MSG_DONTWAIT);
+ struct bio_vec bvec;
+ struct msghdr msg = {
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
+ };
+
+ bvec_set_page(&bvec, skb_frag_page(frag), slen,
+ skb_frag_off(frag) + offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
+ slen);
+
+ ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
+ sendmsg_unlocked, sk, &msg);
if (ret <= 0)
goto error;
@@ -3121,16 +3115,14 @@ error:
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len)
{
- return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked,
- kernel_sendpage_locked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
/* Send skb data on a socket. Socket must be unlocked. */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
- sendpage_unlocked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
}
/**
@@ -4203,13 +4195,13 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
EXPORT_SYMBOL(skb_find_text);
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
- int offset, size_t size)
+ int offset, size_t size, size_t max_frags)
{
int i = skb_shinfo(skb)->nr_frags;
if (skb_can_coalesce(skb, i, page, offset)) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
- } else if (i < MAX_SKB_FRAGS) {
+ } else if (i < max_frags) {
skb_zcopy_downgrade_managed(skb);
get_page(page);
skb_fill_page_desc_noacc(skb, i, page, offset, size);
@@ -4249,10 +4241,9 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
struct page *page;
page = virt_to_head_page(frag_skb->head);
- __skb_frag_set_page(&head_frag, page);
- skb_frag_off_set(&head_frag, frag_skb->data -
- (unsigned char *)page_address(page));
- skb_frag_size_set(&head_frag, skb_headlen(frag_skb));
+ skb_frag_fill_page_desc(&head_frag, page, frag_skb->data -
+ (unsigned char *)page_address(page),
+ skb_headlen(frag_skb));
return head_frag;
}
@@ -4768,7 +4759,6 @@ void __init skb_init(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
-#ifdef HAVE_SKB_SMALL_HEAD_CACHE
/* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.
* struct skb_shared_info is located at the end of skb->head,
* and should not be copied to/from user.
@@ -4780,7 +4770,6 @@ void __init skb_init(void)
0,
SKB_SMALL_HEAD_HEADROOM,
NULL);
-#endif
skb_extensions_init();
}
@@ -5784,147 +5773,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
-/**
- * skb_gso_transport_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_transport_seglen is used to determine the real size of the
- * individual segments, including Layer4 headers (TCP/UDP).
- *
- * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
- */
-static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
-{
- const struct skb_shared_info *shinfo = skb_shinfo(skb);
- unsigned int thlen = 0;
-
- if (skb->encapsulation) {
- thlen = skb_inner_transport_header(skb) -
- skb_transport_header(skb);
-
- if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
- thlen += inner_tcp_hdrlen(skb);
- } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
- thlen = tcp_hdrlen(skb);
- } else if (unlikely(skb_is_gso_sctp(skb))) {
- thlen = sizeof(struct sctphdr);
- } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
- thlen = sizeof(struct udphdr);
- }
- /* UFO sets gso_size to the size of the fragmentation
- * payload, i.e. the size of the L4 (UDP) header is already
- * accounted for.
- */
- return thlen + shinfo->gso_size;
-}
-
-/**
- * skb_gso_network_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_network_seglen is used to determine the real size of the
- * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
- *
- * The MAC/L2 header is not accounted for.
- */
-static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
-{
- unsigned int hdr_len = skb_transport_header(skb) -
- skb_network_header(skb);
-
- return hdr_len + skb_gso_transport_seglen(skb);
-}
-
-/**
- * skb_gso_mac_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_mac_seglen is used to determine the real size of the
- * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
- * headers (TCP/UDP).
- */
-static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
-{
- unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
-
- return hdr_len + skb_gso_transport_seglen(skb);
-}
-
-/**
- * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
- *
- * There are a couple of instances where we have a GSO skb, and we
- * want to determine what size it would be after it is segmented.
- *
- * We might want to check:
- * - L3+L4+payload size (e.g. IP forwarding)
- * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
- *
- * This is a helper to do that correctly considering GSO_BY_FRAGS.
- *
- * @skb: GSO skb
- *
- * @seg_len: The segmented length (from skb_gso_*_seglen). In the
- * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
- *
- * @max_len: The maximum permissible length.
- *
- * Returns true if the segmented length <= max length.
- */
-static inline bool skb_gso_size_check(const struct sk_buff *skb,
- unsigned int seg_len,
- unsigned int max_len) {
- const struct skb_shared_info *shinfo = skb_shinfo(skb);
- const struct sk_buff *iter;
-
- if (shinfo->gso_size != GSO_BY_FRAGS)
- return seg_len <= max_len;
-
- /* Undo this so we can re-use header sizes */
- seg_len -= GSO_BY_FRAGS;
-
- skb_walk_frags(skb, iter) {
- if (seg_len + skb_headlen(iter) > max_len)
- return false;
- }
-
- return true;
-}
-
-/**
- * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
- *
- * @skb: GSO skb
- * @mtu: MTU to validate against
- *
- * skb_gso_validate_network_len validates if a given skb will fit a
- * wanted MTU once split. It considers L3 headers, L4 headers, and the
- * payload.
- */
-bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
-{
- return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
-}
-EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
-
-/**
- * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
- *
- * @skb: GSO skb
- * @len: length to validate against
- *
- * skb_gso_validate_mac_len validates if a given skb will fit a wanted
- * length once split, including L2, L3 and L4 headers and the payload.
- */
-bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
-{
- return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
-}
-EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
-
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
int mac_len, meta_len;
@@ -6912,3 +6760,91 @@ nodefer: __kfree_skb(skb);
if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
smp_call_function_single_async(cpu, &sd->defer_csd);
}
+
+static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
+ size_t offset, size_t len)
+{
+ const char *kaddr;
+ __wsum csum;
+
+ kaddr = kmap_local_page(page);
+ csum = csum_partial(kaddr + offset, len, 0);
+ kunmap_local(kaddr);
+ skb->csum = csum_block_add(skb->csum, csum, skb->len);
+}
+
+/**
+ * skb_splice_from_iter - Splice (or copy) pages to skbuff
+ * @skb: The buffer to add pages to
+ * @iter: Iterator representing the pages to be added
+ * @maxsize: Maximum amount of pages to be added
+ * @gfp: Allocation flags
+ *
+ * This is a common helper function for supporting MSG_SPLICE_PAGES. It
+ * extracts pages from an iterator and adds them to the socket buffer if
+ * possible, copying them to fragments if not possible (such as if they're slab
+ * pages).
+ *
+ * Returns the amount of data spliced/copied or -EMSGSIZE if there's
+ * insufficient space in the buffer to transfer anything.
+ */
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+ ssize_t maxsize, gfp_t gfp)
+{
+ size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+ struct page *pages[8], **ppages = pages;
+ ssize_t spliced = 0, ret = 0;
+ unsigned int i;
+
+ while (iter->count > 0) {
+ ssize_t space, nr, len;
+ size_t off;
+
+ ret = -EMSGSIZE;
+ space = frag_limit - skb_shinfo(skb)->nr_frags;
+ if (space < 0)
+ break;
+
+ /* We might be able to coalesce without increasing nr_frags */
+ nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
+
+ len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
+ if (len <= 0) {
+ ret = len ?: -EIO;
+ break;
+ }
+
+ i = 0;
+ do {
+ struct page *page = pages[i++];
+ size_t part = min_t(size_t, PAGE_SIZE - off, len);
+
+ ret = -EIO;
+ if (WARN_ON_ONCE(!sendpage_ok(page)))
+ goto out;
+
+ ret = skb_append_pagefrags(skb, page, off, part,
+ frag_limit);
+ if (ret < 0) {
+ iov_iter_revert(iter, len);
+ goto out;
+ }
+
+ if (skb->ip_summed == CHECKSUM_NONE)
+ skb_splice_csum_page(skb, page, off, part);
+
+ off = 0;
+ spliced += part;
+ maxsize -= part;
+ len -= part;
+ } while (len > 0);
+
+ if (maxsize <= 0)
+ break;
+ }
+
+out:
+ skb_len_add(skb, spliced);
+ return spliced ?: ret;
+}
+EXPORT_SYMBOL(skb_splice_from_iter);
diff --git a/net/core/sock.c b/net/core/sock.c
index 6e5662ca00fe..9370fd50aa2c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -114,6 +114,9 @@
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
+#include <linux/mroute.h>
+#include <linux/mroute6.h>
+#include <linux/icmpv6.h>
#include <linux/uaccess.h>
@@ -138,6 +141,7 @@
#include <net/tcp.h>
#include <net/busy_poll.h>
+#include <net/phonet/phonet.h>
#include <linux/ethtool.h>
@@ -1246,6 +1250,13 @@ set_sndbuf:
clear_bit(SOCK_PASSCRED, &sock->flags);
break;
+ case SO_PASSPIDFD:
+ if (valbool)
+ set_bit(SOCK_PASSPIDFD, &sock->flags);
+ else
+ clear_bit(SOCK_PASSPIDFD, &sock->flags);
+ break;
+
case SO_TIMESTAMP_OLD:
case SO_TIMESTAMP_NEW:
case SO_TIMESTAMPNS_OLD:
@@ -1726,6 +1737,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
break;
+ case SO_PASSPIDFD:
+ v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
+ break;
+
case SO_PEERCRED:
{
struct ucred peercred;
@@ -1741,6 +1756,39 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
goto lenout;
}
+ case SO_PEERPIDFD:
+ {
+ struct pid *peer_pid;
+ struct file *pidfd_file = NULL;
+ int pidfd;
+
+ if (len > sizeof(pidfd))
+ len = sizeof(pidfd);
+
+ spin_lock(&sk->sk_peer_lock);
+ peer_pid = get_pid(sk->sk_peer_pid);
+ spin_unlock(&sk->sk_peer_lock);
+
+ if (!peer_pid)
+ return -ESRCH;
+
+ pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
+ put_pid(peer_pid);
+ if (pidfd < 0)
+ return pidfd;
+
+ if (copy_to_sockptr(optval, &pidfd, len) ||
+ copy_to_sockptr(optlen, &len, sizeof(int))) {
+ put_unused_fd(pidfd);
+ fput(pidfd_file);
+
+ return -EFAULT;
+ }
+
+ fd_install(pidfd, pidfd_file);
+ return 0;
+ }
+
case SO_PEERGROUPS:
{
const struct cred *cred;
@@ -2550,13 +2598,24 @@ kuid_t sock_i_uid(struct sock *sk)
}
EXPORT_SYMBOL(sock_i_uid);
-unsigned long sock_i_ino(struct sock *sk)
+unsigned long __sock_i_ino(struct sock *sk)
{
unsigned long ino;
- read_lock_bh(&sk->sk_callback_lock);
+ read_lock(&sk->sk_callback_lock);
ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
- read_unlock_bh(&sk->sk_callback_lock);
+ read_unlock(&sk->sk_callback_lock);
+ return ino;
+}
+EXPORT_SYMBOL(__sock_i_ino);
+
+unsigned long sock_i_ino(struct sock *sk)
+{
+ unsigned long ino;
+
+ local_bh_disable();
+ ino = __sock_i_ino(sk);
+ local_bh_enable();
return ino;
}
EXPORT_SYMBOL(sock_i_ino);
@@ -3213,36 +3272,6 @@ void __receive_sock(struct file *file)
}
}
-ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
-{
- ssize_t res;
- struct msghdr msg = {.msg_flags = flags};
- struct kvec iov;
- char *kaddr = kmap(page);
- iov.iov_base = kaddr + offset;
- iov.iov_len = size;
- res = kernel_sendmsg(sock, &msg, &iov, 1, size);
- kunmap(page);
- return res;
-}
-EXPORT_SYMBOL(sock_no_sendpage);
-
-ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
- int offset, size_t size, int flags)
-{
- ssize_t res;
- struct msghdr msg = {.msg_flags = flags};
- struct kvec iov;
- char *kaddr = kmap(page);
-
- iov.iov_base = kaddr + offset;
- iov.iov_len = size;
- res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
- kunmap(page);
- return res;
-}
-EXPORT_SYMBOL(sock_no_sendpage_locked);
-
/*
* Default Socket Callbacks
*/
@@ -3998,7 +4027,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
- "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
+ "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
sock_prot_inuse_get(seq_file_net(seq), proto),
@@ -4019,7 +4048,6 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
proto_method_implemented(proto->getsockopt),
proto_method_implemented(proto->sendmsg),
proto_method_implemented(proto->recvmsg),
- proto_method_implemented(proto->sendpage),
proto_method_implemented(proto->bind),
proto_method_implemented(proto->backlog_rcv),
proto_method_implemented(proto->hash),
@@ -4040,7 +4068,7 @@ static int proto_seq_show(struct seq_file *seq, void *v)
"maxhdr",
"slab",
"module",
- "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
+ "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
else
proto_seq_printf(seq, list_entry(v, struct proto, node));
return 0;
@@ -4100,3 +4128,63 @@ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);
+
+/* Copy 'size' bytes from userspace and return `size` back to userspace */
+int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
+ void __user *arg, void *karg, size_t size)
+{
+ int ret;
+
+ if (copy_from_user(karg, arg, size))
+ return -EFAULT;
+
+ ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
+ if (ret)
+ return ret;
+
+ if (copy_to_user(arg, karg, size))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL(sock_ioctl_inout);
+
+/* This is the most common ioctl prep function, where the result (4 bytes) is
+ * copied back to userspace if the ioctl() returns successfully. No input is
+ * copied from userspace as input argument.
+ */
+static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ int ret, karg = 0;
+
+ ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
+ if (ret)
+ return ret;
+
+ return put_user(karg, (int __user *)arg);
+}
+
+/* A wrapper around sock ioctls, which copies the data from userspace
+ * (depending on the protocol/ioctl), and copies back the result to userspace.
+ * The main motivation for this function is to pass kernel memory to the
+ * protocol ioctl callbacks, instead of userspace memory.
+ */
+int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ int rc = 1;
+
+ if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
+ rc = ipmr_sk_ioctl(sk, cmd, arg);
+ else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
+ rc = ip6mr_sk_ioctl(sk, cmd, arg);
+ else if (sk_is_phonet(sk))
+ rc = phonet_sk_ioctl(sk, cmd, arg);
+
+ /* If ioctl was processed, returns its value */
+ if (rc <= 0)
+ return rc;
+
+ /* Otherwise call the default handler */
+ return sock_ioctl_out(sk, cmd, arg);
+}
+EXPORT_SYMBOL(sk_ioctl);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 00afb66cd095..19538d628714 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -32,8 +32,6 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
struct bpf_stab *stab;
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
if (attr->max_entries == 0 ||
attr->key_size != 4 ||
(attr->value_size != sizeof(u32) &&
@@ -1085,8 +1083,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
struct bpf_shtab *htab;
int i, err;
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
if (attr->max_entries == 0 ||
attr->key_size == 0 ||
(attr->value_size != sizeof(u32) &&