summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/bpf_sk_storage.c14
-rw-r--r--net/core/dev.c23
-rw-r--r--net/core/devmem.c11
-rw-r--r--net/core/failover.c6
-rw-r--r--net/core/filter.c55
-rw-r--r--net/core/gro.c7
-rw-r--r--net/core/netmem_priv.h23
-rw-r--r--net/core/netpoll.c25
-rw-r--r--net/core/page_pool.c24
-rw-r--r--net/core/rtnetlink.c6
-rw-r--r--net/core/skbuff.c46
-rw-r--r--net/core/skmsg.c9
-rw-r--r--net/core/sock_map.c39
13 files changed, 199 insertions, 89 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 14eb7812bda4..ecd659f79fd4 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -172,7 +172,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
struct bpf_map *map;
smap = rcu_dereference(SDATA(selem)->smap);
- if (!(smap->map.map_flags & BPF_F_CLONE))
+ if (!smap || !(smap->map.map_flags & BPF_F_CLONE))
continue;
/* Note that for lockless listeners adding new element
@@ -531,10 +531,10 @@ err_free:
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
-static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
+static int diag_get(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_data *sdata, struct sk_buff *skb)
{
struct nlattr *nla_stg, *nla_value;
- struct bpf_local_storage_map *smap;
/* It cannot exceed max nlattr's payload */
BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE);
@@ -543,7 +543,6 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
if (!nla_stg)
return -EMSGSIZE;
- smap = rcu_dereference(sdata->smap);
if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
goto errout;
@@ -558,6 +557,7 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
sdata->data, true);
else
copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
+ check_and_init_map_value(&smap->map, nla_data(nla_value));
nla_nest_end(skb, nla_stg);
return 0;
@@ -596,9 +596,11 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
saved_len = skb->len;
hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
smap = rcu_dereference(SDATA(selem)->smap);
+ if (!smap)
+ continue;
diag_size += nla_value_size(smap->map.value_size);
- if (nla_stgs && diag_get(SDATA(selem), skb))
+ if (nla_stgs && diag_get(smap, SDATA(selem), skb))
/* Continue to learn diag_size */
err = -EMSGSIZE;
}
@@ -665,7 +667,7 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
diag_size += nla_value_size(diag->maps[i]->value_size);
- if (nla_stgs && diag_get(sdata, skb))
+ if (nla_stgs && diag_get((struct bpf_local_storage_map *)diag->maps[i], sdata, skb))
/* Continue to learn diag_size */
err = -EMSGSIZE;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 06c195906231..0c6c270d9f7d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -371,7 +371,7 @@ static void netdev_name_node_alt_free(struct rcu_head *head)
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
netdev_name_node_del(name_node);
- list_del(&name_node->list);
+ list_del_rcu(&name_node->list);
call_rcu(&name_node->rcu, netdev_name_node_alt_free);
}
@@ -6862,9 +6862,9 @@ static void skb_defer_free_flush(void)
#if defined(CONFIG_NET_RX_BUSY_POLL)
-static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
+static void __busy_poll_stop(struct napi_struct *napi, unsigned long timeout)
{
- if (!skip_schedule) {
+ if (!timeout) {
gro_normal_list(&napi->gro);
__napi_schedule(napi);
return;
@@ -6874,6 +6874,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
gro_flush_normal(&napi->gro, HZ >= 1000);
clear_bit(NAPI_STATE_SCHED, &napi->state);
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
}
enum {
@@ -6885,8 +6887,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
unsigned flags, u16 budget)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
- bool skip_schedule = false;
- unsigned long timeout;
+ unsigned long timeout = 0;
int rc;
/* Busy polling means there is a high chance device driver hard irq
@@ -6906,10 +6907,12 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
if (flags & NAPI_F_PREFER_BUSY_POLL) {
napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
- timeout = napi_get_gro_flush_timeout(napi);
- if (napi->defer_hard_irqs_count && timeout) {
- hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
- skip_schedule = true;
+ if (napi->defer_hard_irqs_count) {
+ /* A short enough gro flush timeout and long enough
+ * poll can result in timer firing too early.
+ * Timer will be armed later if necessary.
+ */
+ timeout = napi_get_gro_flush_timeout(napi);
}
}
@@ -6924,7 +6927,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock);
if (rc == budget)
- __busy_poll_stop(napi, skip_schedule);
+ __busy_poll_stop(napi, timeout);
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
}
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 468344739db2..4f71de44c0fb 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -241,6 +241,11 @@ net_devmem_bind_dmabuf(struct net_device *dev,
}
if (direction == DMA_TO_DEVICE) {
+ if (!IS_ALIGNED(dmabuf->size, PAGE_SIZE)) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG(extack, "TX dma-buf size must be a multiple of PAGE_SIZE");
+ goto err_unmap;
+ }
binding->tx_vec = kvmalloc_objs(struct net_iov *,
dmabuf->size / PAGE_SIZE);
if (!binding->tx_vec) {
@@ -267,6 +272,12 @@ net_devmem_bind_dmabuf(struct net_device *dev,
size_t len = sg_dma_len(sg);
struct net_iov *niov;
+ if (!IS_ALIGNED(len, PAGE_SIZE)) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG(extack, "dma-buf SG length must be PAGE_SIZE aligned");
+ goto err_free_chunks;
+ }
+
owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
dev_to_node(&dev->dev));
if (!owner) {
diff --git a/net/core/failover.c b/net/core/failover.c
index 11bb183c7a1b..e43c59cd6868 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -12,6 +12,7 @@
#include <uapi/linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/if_vlan.h>
+#include <net/netdev_lock.h>
#include <net/failover.h>
static LIST_HEAD(failover_list);
@@ -221,8 +222,11 @@ failover_existing_slave_register(struct net_device *failover_dev)
for_each_netdev(net, dev) {
if (netif_is_failover(dev))
continue;
- if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr))
+ if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) {
+ netdev_lock_ops(dev);
failover_slave_register(dev);
+ netdev_unlock_ops(dev);
+ }
}
rtnl_unlock();
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 80a3b702a2d4..9590877b0714 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1654,15 +1654,24 @@ err_prog_put:
return err;
}
+static void sk_reuseport_prog_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+ struct bpf_prog *prog = aux->prog;
+
+ bpf_release_orig_filter(prog);
+ bpf_prog_free(prog);
+}
+
void sk_reuseport_prog_free(struct bpf_prog *prog)
{
if (!prog)
return;
- if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
- bpf_prog_put(prog);
+ if (bpf_prog_was_classic(prog))
+ call_rcu(&prog->aux->rcu, sk_reuseport_prog_free_rcu);
else
- bpf_prog_destroy(prog);
+ bpf_prog_put(prog);
}
static inline int __bpf_try_make_writable(struct sk_buff *skb,
@@ -5481,7 +5490,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
{
- if (sk->sk_protocol != IPPROTO_TCP)
+ if (!sk_is_tcp(sk))
return -EINVAL;
switch (optname) {
@@ -5688,6 +5697,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ /*
+ * TCP_NODELAY triggers tcp_push_pending_frames() and re-enters
+ * CA_EVENT_TX_START in bpf_tcp_cc.
+ */
+ if (level == SOL_TCP && optname == TCP_NODELAY)
+ return -EOPNOTSUPP;
+
+ return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = {
+ .func = bpf_sk_setsockopt_nodelay,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
int, optname, char *, optval, int, optlen)
{
@@ -5833,6 +5866,12 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
if (!is_locked_tcp_sock_ops(bpf_sock))
return -EOPNOTSUPP;
+ /* TCP_NODELAY triggers tcp_push_pending_frames() and re-enters these callbacks. */
+ if ((bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB ||
+ bpf_sock->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB) &&
+ level == SOL_TCP && optname == TCP_NODELAY)
+ return -EOPNOTSUPP;
+
return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
@@ -6443,6 +6482,8 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
* against MTU of FIB lookup resulting net_device
*/
dev = dev_get_by_index_rcu(net, params->ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
if (!is_skb_forwardable(dev, skb))
rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
@@ -7443,7 +7484,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
{
- if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+ if (sk_fullsock(sk) && sk_is_tcp(sk))
return (unsigned long)sk;
return (unsigned long)NULL;
@@ -11915,7 +11956,7 @@ BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
*/
BTF_TYPE_EMIT(struct tcp6_sock);
if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_family == AF_INET6)
+ sk->sk_type == SOCK_STREAM && sk->sk_family == AF_INET6)
return (unsigned long)sk;
return (unsigned long)NULL;
@@ -11931,7 +11972,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
{
- if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+ if (sk && sk_fullsock(sk) && sk_is_tcp(sk))
return (unsigned long)sk;
return (unsigned long)NULL;
diff --git a/net/core/gro.c b/net/core/gro.c
index 31d21de5b15a..a84753983467 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -109,6 +109,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
if (p->pp_recycle != skb->pp_recycle)
return -ETOOMANYREFS;
+ if (skb_zcopy(p) || skb_zcopy(skb))
+ return -ETOOMANYREFS;
+
if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) ||
NAPI_GRO_CB(skb)->flush))
return -E2BIG;
@@ -213,10 +216,12 @@ done:
p->data_len += len;
p->truesize += delta_truesize;
p->len += len;
+ skb_shinfo(p)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG;
if (lp != p) {
lp->data_len += len;
lp->truesize += delta_truesize;
lp->len += len;
+ skb_shinfo(lp)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG;
}
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
@@ -244,6 +249,8 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
p->truesize += skb->truesize;
p->len += skb->len;
+ skb_shinfo(p)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
+
NAPI_GRO_CB(skb)->same_flow = 1;
return 0;
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
index 3e6fde8f1726..23175cb2bd86 100644
--- a/net/core/netmem_priv.h
+++ b/net/core/netmem_priv.h
@@ -8,18 +8,21 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
}
-static inline bool netmem_is_pp(netmem_ref netmem)
+static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
+{
+ netmem_to_nmdesc(netmem)->pp_magic |= pp_magic;
+}
+
+static inline void netmem_clear_pp_magic(netmem_ref netmem)
{
- struct page *page;
+ WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK);
- /* XXX: Now that the offset of page_type is shared between
- * struct page and net_iov, just cast the netmem to struct page
- * unconditionally by clearing NET_IOV if any, no matter whether
- * it comes from struct net_iov or struct page. This should be
- * adjusted once the offset is no longer shared.
- */
- page = (struct page *)((__force unsigned long)netmem & ~NET_IOV);
- return PageNetpp(page);
+ netmem_to_nmdesc(netmem)->pp_magic = 0;
+}
+
+static inline bool netmem_is_pp(netmem_ref netmem)
+{
+ return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE;
}
static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 4381e0fc25bf..3f4a17fa5713 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -319,6 +319,8 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
lockdep_assert_irqs_disabled();
dev = np->dev;
+ /* npinfo->txq belongs to np->dev, so retries must stay bound to it. */
+ skb->dev = dev;
rcu_read_lock();
npinfo = rcu_dereference_bh(dev->npinfo);
@@ -608,14 +610,16 @@ EXPORT_SYMBOL_GPL(__netpoll_setup);
/*
* Returns a pointer to a string representation of the identifier used
* to select the egress interface for the given netpoll instance. buf
- * must be a buffer of length at least MAC_ADDR_STR_LEN + 1.
+ * is used to format np->dev_mac when np->dev_name is empty; bufsz must
+ * be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address
+ * and its NUL terminator.
*/
-static char *egress_dev(struct netpoll *np, char *buf)
+static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz)
{
if (np->dev_name[0])
return np->dev_name;
- snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac);
+ snprintf(buf, bufsz, "%pM", np->dev_mac);
return buf;
}
@@ -645,7 +649,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
if (!IS_ENABLED(CONFIG_IPV6)) {
np_err(np, "IPv6 is not supported %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return -EINVAL;
}
@@ -667,7 +671,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
}
if (err) {
np_err(np, "no IPv6 address for %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return err;
}
@@ -687,14 +691,14 @@ static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev)
in_dev = __in_dev_get_rtnl(ndev);
if (!in_dev) {
np_err(np, "no IP address for %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return -EDESTADDRREQ;
}
ifa = rtnl_dereference(in_dev->ifa_list);
if (!ifa) {
np_err(np, "no IP address for %s, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
return -EDESTADDRREQ;
}
@@ -736,7 +740,8 @@ int netpoll_setup(struct netpoll *np)
ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac);
if (!ndev) {
- np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf));
+ np_err(np, "%s doesn't exist, aborting\n",
+ egress_dev(np, buf, sizeof(buf)));
err = -ENODEV;
goto unlock;
}
@@ -744,14 +749,14 @@ int netpoll_setup(struct netpoll *np)
if (netdev_master_upper_dev_get(ndev)) {
np_err(np, "%s is a slave device, aborting\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
err = -EBUSY;
goto put;
}
if (!netif_running(ndev)) {
np_info(np, "device %s not up yet, forcing it\n",
- egress_dev(np, buf));
+ egress_dev(np, buf, sizeof(buf)));
err = dev_open(ndev, NULL);
if (err) {
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 6e576dec80db..8171d1173221 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -707,18 +707,8 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict)
void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
- struct page *page;
-
netmem_set_pp(netmem, pool);
-
- /* XXX: Now that the offset of page_type is shared between
- * struct page and net_iov, just cast the netmem to struct page
- * unconditionally by clearing NET_IOV if any, no matter whether
- * it comes from struct net_iov or struct page. This should be
- * adjusted once the offset is no longer shared.
- */
- page = (struct page *)((__force unsigned long)netmem & ~NET_IOV);
- __SetPageNetpp(page);
+ netmem_or_pp_magic(netmem, PP_SIGNATURE);
/* Ensuring all pages have been split into one fragment initially:
* page_pool_set_pp_info() is only called once for every page when it
@@ -733,17 +723,7 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
void page_pool_clear_pp_info(netmem_ref netmem)
{
- struct page *page;
-
- /* XXX: Now that the offset of page_type is shared between
- * struct page and net_iov, just cast the netmem to struct page
- * unconditionally by clearing NET_IOV if any, no matter whether
- * it comes from struct net_iov or struct page. This should be
- * adjusted once the offset is no longer shared.
- */
- page = (struct page *)((__force unsigned long)netmem & ~NET_IOV);
- __ClearPageNetpp(page);
-
+ netmem_clear_pp_magic(netmem);
netmem_set_pp(netmem, NULL);
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b613bb6e07df..511c25bf6f2a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1572,6 +1572,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
port_guid.vf = ivi.vf;
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ memset(&vf_broadcast, 0, sizeof(vf_broadcast));
memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
@@ -6327,8 +6328,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
0, &filters, &idxattr, &prividx, extack);
if (err < 0) {
- /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
- WARN_ON(err == -EMSGSIZE);
+ /* -EMSGSIZE implies BUG in if_nlmsg_stats_size
+ * or a too big nested attribute.
+ */
kfree_skb(nskb);
} else {
err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7dad68e3b518..0d3cc115f2e7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2248,6 +2248,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
skb_frag_ref(skb, i);
}
skb_shinfo(n)->nr_frags = i;
+ skb_shinfo(n)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
}
if (skb_has_frag_list(skb)) {
@@ -2786,6 +2787,8 @@ done:
skb->data_len = 0;
skb_set_tail_pointer(skb, len);
}
+ if (!skb_shinfo(skb)->nr_frags && !skb_has_frag_list(skb))
+ skb->unreadable = 0;
if (!skb->sk || skb->destructor == sock_edemux)
skb_condense(skb);
@@ -2793,16 +2796,37 @@ done:
}
EXPORT_SYMBOL(___pskb_trim);
+static int pskb_trim_rcsum_complete(struct sk_buff *skb, unsigned int len)
+{
+ int delta = skb->len - len;
+
+ if (skb_frags_readable(skb)) {
+ skb->csum = csum_block_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0),
+ len);
+ return 0;
+ }
+
+ if (len > skb_headlen(skb))
+ return -EFAULT;
+
+ /* The trimmed bytes are unreadable, but the remaining packet can be
+ * checksummed by software after trimming.
+ */
+ skb->ip_summed = CHECKSUM_NONE;
+ return 0;
+}
+
/* Note : use pskb_trim_rcsum() instead of calling this directly
*/
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
{
if (skb->ip_summed == CHECKSUM_COMPLETE) {
- int delta = skb->len - len;
+ int err;
- skb->csum = csum_block_sub(skb->csum,
- skb_checksum(skb, len, delta, 0),
- len);
+ err = pskb_trim_rcsum_complete(skb, len);
+ if (err)
+ return err;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
@@ -4349,6 +4373,8 @@ onlymerged:
tgt->ip_summed = CHECKSUM_PARTIAL;
skb->ip_summed = CHECKSUM_PARTIAL;
+ skb_shinfo(tgt)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
+
skb_len_add(skb, -shiftlen);
skb_len_add(tgt, shiftlen);
@@ -4959,7 +4985,8 @@ normal:
skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
- skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
+ skb_shinfo(nskb)->flags |= (skb_shinfo(head_skb)->flags |
+ skb_shinfo(frag_skb)->flags) &
SKBFL_SHARED_FRAG;
if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
@@ -4976,6 +5003,9 @@ normal:
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
frag_skb = list_skb;
+
+ skb_shinfo(nskb)->flags |= skb_shinfo(frag_skb)->flags & SKBFL_SHARED_FRAG;
+
if (!skb_headlen(list_skb)) {
BUG_ON(!nfrags);
} else {
@@ -6200,6 +6230,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
from_shinfo->frags,
from_shinfo->nr_frags * sizeof(skb_frag_t));
to_shinfo->nr_frags += from_shinfo->nr_frags;
+ if (from_shinfo->nr_frags)
+ to_shinfo->flags |= from_shinfo->flags & SKBFL_SHARED_FRAG;
if (!skb_cloned(from))
from_shinfo->nr_frags = 0;
@@ -6801,6 +6833,8 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_kfree_head(data);
return -ENOMEM;
}
+ if (skb_zcopy(skb))
+ net_zcopy_get(skb_zcopy(skb));
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
@@ -6944,6 +6978,8 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_kfree_head(data);
return -ENOMEM;
}
+ if (skb_zcopy(skb))
+ net_zcopy_get(skb_zcopy(skb));
skb_release_data(skb, SKB_CONSUMED);
skb->head = data;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 6187a83bd741..e1850caf1a71 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -1268,12 +1268,19 @@ out:
static void sk_psock_verdict_data_ready(struct sock *sk)
{
const struct proto_ops *ops = NULL;
+ struct sk_psock *psock;
struct socket *sock;
int copied;
trace_sk_data_ready(sk);
rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock && tls_sw_has_ctx_rx(sk)) {
+ psock->saved_data_ready(sk);
+ rcu_read_unlock();
+ return;
+ }
sock = READ_ONCE(sk->sk_socket);
if (likely(sock))
ops = READ_ONCE(sock->ops);
@@ -1283,8 +1290,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
copied = ops->read_skb(sk, sk_psock_verdict_recv);
if (copied >= 0) {
- struct sk_psock *psock;
-
rcu_read_lock();
psock = sk_psock(sk);
if (psock)
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 02a68be3002a..99e3789492a0 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1630,18 +1630,23 @@ void sock_map_unhash(struct sock *sk)
void (*saved_unhash)(struct sock *sk);
struct sk_psock *psock;
+retry:
rcu_read_lock();
psock = sk_psock(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
saved_unhash = READ_ONCE(sk->sk_prot)->unhash;
+ if (unlikely(saved_unhash == sock_map_unhash))
+ goto retry;
} else {
saved_unhash = psock->saved_unhash;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
+
+ if (WARN_ON_ONCE(saved_unhash == sock_map_unhash))
+ return;
}
- if (WARN_ON_ONCE(saved_unhash == sock_map_unhash))
- return;
+
if (saved_unhash)
saved_unhash(sk);
}
@@ -1652,20 +1657,25 @@ void sock_map_destroy(struct sock *sk)
void (*saved_destroy)(struct sock *sk);
struct sk_psock *psock;
+retry:
rcu_read_lock();
psock = sk_psock_get(sk);
if (unlikely(!psock)) {
rcu_read_unlock();
saved_destroy = READ_ONCE(sk->sk_prot)->destroy;
+ if (unlikely(saved_destroy == sock_map_destroy))
+ goto retry;
} else {
saved_destroy = psock->saved_destroy;
sock_map_remove_links(sk, psock);
rcu_read_unlock();
sk_psock_stop(psock);
sk_psock_put(sk, psock);
+
+ if (WARN_ON_ONCE(saved_destroy == sock_map_destroy))
+ return;
}
- if (WARN_ON_ONCE(saved_destroy == sock_map_destroy))
- return;
+
if (saved_destroy)
saved_destroy(sk);
}
@@ -1676,32 +1686,33 @@ void sock_map_close(struct sock *sk, long timeout)
void (*saved_close)(struct sock *sk, long timeout);
struct sk_psock *psock;
+retry:
lock_sock(sk);
rcu_read_lock();
- psock = sk_psock(sk);
+ psock = sk_psock_get(sk);
if (likely(psock)) {
saved_close = psock->saved_close;
sock_map_remove_links(sk, psock);
- psock = sk_psock_get(sk);
- if (unlikely(!psock))
- goto no_psock;
rcu_read_unlock();
sk_psock_stop(psock);
release_sock(sk);
cancel_delayed_work_sync(&psock->work);
sk_psock_put(sk, psock);
+
+ /* Make sure we do not recurse. This is a bug.
+ * Leak the socket instead of crashing on a stack overflow.
+ */
+ if (WARN_ON_ONCE(saved_close == sock_map_close))
+ return;
} else {
saved_close = READ_ONCE(sk->sk_prot)->close;
-no_psock:
rcu_read_unlock();
release_sock(sk);
+
+ if (unlikely(saved_close == sock_map_close))
+ goto retry;
}
- /* Make sure we do not recurse. This is a bug.
- * Leak the socket instead of crashing on a stack overflow.
- */
- if (WARN_ON_ONCE(saved_close == sock_map_close))
- return;
saved_close(sk, timeout);
}
EXPORT_SYMBOL_GPL(sock_map_close);