diff options
Diffstat (limited to 'net/core')
| -rw-r--r-- | net/core/bpf_sk_storage.c | 14 | ||||
| -rw-r--r-- | net/core/dev.c | 23 | ||||
| -rw-r--r-- | net/core/devmem.c | 11 | ||||
| -rw-r--r-- | net/core/failover.c | 6 | ||||
| -rw-r--r-- | net/core/filter.c | 55 | ||||
| -rw-r--r-- | net/core/gro.c | 7 | ||||
| -rw-r--r-- | net/core/netmem_priv.h | 23 | ||||
| -rw-r--r-- | net/core/netpoll.c | 25 | ||||
| -rw-r--r-- | net/core/page_pool.c | 24 | ||||
| -rw-r--r-- | net/core/rtnetlink.c | 6 | ||||
| -rw-r--r-- | net/core/skbuff.c | 46 | ||||
| -rw-r--r-- | net/core/skmsg.c | 9 | ||||
| -rw-r--r-- | net/core/sock_map.c | 39 |
13 files changed, 199 insertions, 89 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 14eb7812bda4..ecd659f79fd4 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -172,7 +172,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) struct bpf_map *map; smap = rcu_dereference(SDATA(selem)->smap); - if (!(smap->map.map_flags & BPF_F_CLONE)) + if (!smap || !(smap->map.map_flags & BPF_F_CLONE)) continue; /* Note that for lockless listeners adding new element @@ -531,10 +531,10 @@ err_free: } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc); -static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) +static int diag_get(struct bpf_local_storage_map *smap, + struct bpf_local_storage_data *sdata, struct sk_buff *skb) { struct nlattr *nla_stg, *nla_value; - struct bpf_local_storage_map *smap; /* It cannot exceed max nlattr's payload */ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE); @@ -543,7 +543,6 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) if (!nla_stg) return -EMSGSIZE; - smap = rcu_dereference(sdata->smap); if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id)) goto errout; @@ -558,6 +557,7 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) sdata->data, true); else copy_map_value(&smap->map, nla_data(nla_value), sdata->data); + check_and_init_map_value(&smap->map, nla_data(nla_value)); nla_nest_end(skb, nla_stg); return 0; @@ -596,9 +596,11 @@ static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb, saved_len = skb->len; hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) { smap = rcu_dereference(SDATA(selem)->smap); + if (!smap) + continue; diag_size += nla_value_size(smap->map.value_size); - if (nla_stgs && diag_get(SDATA(selem), skb)) + if (nla_stgs && diag_get(smap, SDATA(selem), skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } @@ -665,7 +667,7 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, diag_size += nla_value_size(diag->maps[i]->value_size); - if (nla_stgs && diag_get(sdata, skb)) + if (nla_stgs && diag_get((struct bpf_local_storage_map *)diag->maps[i], sdata, skb)) /* Continue to learn diag_size */ err = -EMSGSIZE; } diff --git a/net/core/dev.c b/net/core/dev.c index 06c195906231..0c6c270d9f7d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,7 +371,7 @@ static void netdev_name_node_alt_free(struct rcu_head *head) static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) { netdev_name_node_del(name_node); - list_del(&name_node->list); + list_del_rcu(&name_node->list); call_rcu(&name_node->rcu, netdev_name_node_alt_free); } @@ -6862,9 +6862,9 @@ static void skb_defer_free_flush(void) #if defined(CONFIG_NET_RX_BUSY_POLL) -static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) +static void __busy_poll_stop(struct napi_struct *napi, unsigned long timeout) { - if (!skip_schedule) { + if (!timeout) { gro_normal_list(&napi->gro); __napi_schedule(napi); return; @@ -6874,6 +6874,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); + hrtimer_start(&napi->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); } enum { @@ -6885,8 +6887,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, unsigned flags, u16 budget) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; - bool skip_schedule = false; - unsigned long timeout; + unsigned long timeout = 0; int rc; /* Busy polling means there is a high chance device driver hard irq @@ -6906,10 +6907,12 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi); - timeout = napi_get_gro_flush_timeout(napi); - if (napi->defer_hard_irqs_count && timeout) { - hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED); - skip_schedule = true; + if (napi->defer_hard_irqs_count) { + /* A short enough gro flush timeout and long enough + * poll can result in timer firing too early. + * Timer will be armed later if necessary. + */ + timeout = napi_get_gro_flush_timeout(napi); } } @@ -6924,7 +6927,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, trace_napi_poll(napi, rc, budget); netpoll_poll_unlock(have_poll_lock); if (rc == budget) - __busy_poll_stop(napi, skip_schedule); + __busy_poll_stop(napi, timeout); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } diff --git a/net/core/devmem.c b/net/core/devmem.c index 468344739db2..4f71de44c0fb 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -241,6 +241,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, } if (direction == DMA_TO_DEVICE) { + if (!IS_ALIGNED(dmabuf->size, PAGE_SIZE)) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "TX dma-buf size must be a multiple of PAGE_SIZE"); + goto err_unmap; + } binding->tx_vec = kvmalloc_objs(struct net_iov *, dmabuf->size / PAGE_SIZE); if (!binding->tx_vec) { @@ -267,6 +272,12 @@ net_devmem_bind_dmabuf(struct net_device *dev, size_t len = sg_dma_len(sg); struct net_iov *niov; + if (!IS_ALIGNED(len, PAGE_SIZE)) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "dma-buf SG length must be PAGE_SIZE aligned"); + goto err_free_chunks; + } + owner = kzalloc_node(sizeof(*owner), GFP_KERNEL, dev_to_node(&dev->dev)); if (!owner) { diff --git a/net/core/failover.c b/net/core/failover.c index 11bb183c7a1b..e43c59cd6868 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -12,6 +12,7 @@ #include <uapi/linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> +#include <net/netdev_lock.h> #include <net/failover.h> static LIST_HEAD(failover_list); @@ -221,8 +222,11 @@ failover_existing_slave_register(struct net_device *failover_dev) for_each_netdev(net, dev) { if (netif_is_failover(dev)) continue; - if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) + if (ether_addr_equal(failover_dev->perm_addr, dev->perm_addr)) { + netdev_lock_ops(dev); failover_slave_register(dev); + netdev_unlock_ops(dev); + } } rtnl_unlock(); } diff --git a/net/core/filter.c b/net/core/filter.c index 80a3b702a2d4..9590877b0714 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1654,15 +1654,24 @@ err_prog_put: return err; } +static void sk_reuseport_prog_free_rcu(struct rcu_head *rcu) +{ + struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); + struct bpf_prog *prog = aux->prog; + + bpf_release_orig_filter(prog); + bpf_prog_free(prog); +} + void sk_reuseport_prog_free(struct bpf_prog *prog) { if (!prog) return; - if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) - bpf_prog_put(prog); + if (bpf_prog_was_classic(prog)) + call_rcu(&prog->aux->rcu, sk_reuseport_prog_free_rcu); else - bpf_prog_destroy(prog); + bpf_prog_put(prog); } static inline int __bpf_try_make_writable(struct sk_buff *skb, @@ -5481,7 +5490,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) { - if (sk->sk_protocol != IPPROTO_TCP) + if (!sk_is_tcp(sk)) return -EINVAL; switch (optname) { @@ -5688,6 +5697,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + /* + * TCP_NODELAY triggers tcp_push_pending_frames() and re-enters + * CA_EVENT_TX_START in bpf_tcp_cc. + */ + if (level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = { + .func = bpf_sk_setsockopt_nodelay, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { @@ -5833,6 +5866,12 @@ BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, if (!is_locked_tcp_sock_ops(bpf_sock)) return -EOPNOTSUPP; + /* TCP_NODELAY triggers tcp_push_pending_frames() and re-enters these callbacks. */ + if ((bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB || + bpf_sock->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB) && + level == SOL_TCP && optname == TCP_NODELAY) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -6443,6 +6482,8 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, * against MTU of FIB lookup resulting net_device */ dev = dev_get_by_index_rcu(net, params->ifindex); + if (unlikely(!dev)) + return -ENODEV; if (!is_skb_forwardable(dev, skb)) rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; @@ -7443,7 +7484,7 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) { - if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; @@ -11915,7 +11956,7 @@ BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) */ BTF_TYPE_EMIT(struct tcp6_sock); if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP && - sk->sk_family == AF_INET6) + sk->sk_type == SOCK_STREAM && sk->sk_family == AF_INET6) return (unsigned long)sk; return (unsigned long)NULL; @@ -11931,7 +11972,7 @@ const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = { BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk) { - if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + if (sk && sk_fullsock(sk) && sk_is_tcp(sk)) return (unsigned long)sk; return (unsigned long)NULL; diff --git a/net/core/gro.c b/net/core/gro.c index 31d21de5b15a..a84753983467 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -109,6 +109,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (p->pp_recycle != skb->pp_recycle) return -ETOOMANYREFS; + if (skb_zcopy(p) || skb_zcopy(skb)) + return -ETOOMANYREFS; + if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) || NAPI_GRO_CB(skb)->flush)) return -E2BIG; @@ -213,10 +216,12 @@ done: p->data_len += len; p->truesize += delta_truesize; p->len += len; + skb_shinfo(p)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG; if (lp != p) { lp->data_len += len; lp->truesize += delta_truesize; lp->len += len; + skb_shinfo(lp)->flags |= skbinfo->flags & SKBFL_SHARED_FRAG; } NAPI_GRO_CB(skb)->same_flow = 1; return 0; @@ -244,6 +249,8 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) p->truesize += skb->truesize; p->len += skb->len; + skb_shinfo(p)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + NAPI_GRO_CB(skb)->same_flow = 1; return 0; diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 3e6fde8f1726..23175cb2bd86 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -8,18 +8,21 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } -static inline bool netmem_is_pp(netmem_ref netmem) +static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) +{ + netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; +} + +static inline void netmem_clear_pp_magic(netmem_ref netmem) { - struct page *page; + WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - return PageNetpp(page); + netmem_to_nmdesc(netmem)->pp_magic = 0; +} + +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; } static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4381e0fc25bf..3f4a17fa5713 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -319,6 +319,8 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) lockdep_assert_irqs_disabled(); dev = np->dev; + /* npinfo->txq belongs to np->dev, so retries must stay bound to it. */ + skb->dev = dev; rcu_read_lock(); npinfo = rcu_dereference_bh(dev->npinfo); @@ -608,14 +610,16 @@ EXPORT_SYMBOL_GPL(__netpoll_setup); /* * Returns a pointer to a string representation of the identifier used * to select the egress interface for the given netpoll instance. buf - * must be a buffer of length at least MAC_ADDR_STR_LEN + 1. + * is used to format np->dev_mac when np->dev_name is empty; bufsz must + * be at least MAC_ADDR_STR_LEN + 1 to fit the formatted MAC address + * and its NUL terminator. */ -static char *egress_dev(struct netpoll *np, char *buf) +static char *egress_dev(struct netpoll *np, char *buf, size_t bufsz) { if (np->dev_name[0]) return np->dev_name; - snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac); + snprintf(buf, bufsz, "%pM", np->dev_mac); return buf; } @@ -645,7 +649,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) if (!IS_ENABLED(CONFIG_IPV6)) { np_err(np, "IPv6 is not supported %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EINVAL; } @@ -667,7 +671,7 @@ static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev) } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return err; } @@ -687,14 +691,14 @@ static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev) in_dev = __in_dev_get_rtnl(ndev); if (!in_dev) { np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EDESTADDRREQ; } ifa = rtnl_dereference(in_dev->ifa_list); if (!ifa) { np_err(np, "no IP address for %s, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); return -EDESTADDRREQ; } @@ -736,7 +740,8 @@ int netpoll_setup(struct netpoll *np) ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac); if (!ndev) { - np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf)); + np_err(np, "%s doesn't exist, aborting\n", + egress_dev(np, buf, sizeof(buf))); err = -ENODEV; goto unlock; } @@ -744,14 +749,14 @@ int netpoll_setup(struct netpoll *np) if (netdev_master_upper_dev_get(ndev)) { np_err(np, "%s is a slave device, aborting\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); err = -EBUSY; goto put; } if (!netif_running(ndev)) { np_info(np, "device %s not up yet, forcing it\n", - egress_dev(np, buf)); + egress_dev(np, buf, sizeof(buf))); err = dev_open(ndev, NULL); if (err) { diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 6e576dec80db..8171d1173221 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -707,18 +707,8 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { - struct page *page; - netmem_set_pp(netmem, pool); - - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - __SetPageNetpp(page); + netmem_or_pp_magic(netmem, PP_SIGNATURE); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it @@ -733,17 +723,7 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) void page_pool_clear_pp_info(netmem_ref netmem) { - struct page *page; - - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - __ClearPageNetpp(page); - + netmem_clear_pp_magic(netmem); netmem_set_pp(netmem, NULL); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b613bb6e07df..511c25bf6f2a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1572,6 +1572,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); + memset(&vf_broadcast, 0, sizeof(vf_broadcast)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; @@ -6327,8 +6328,9 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, &filters, &idxattr, &prividx, extack); if (err < 0) { - /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ - WARN_ON(err == -EMSGSIZE); + /* -EMSGSIZE implies BUG in if_nlmsg_stats_size + * or a too big nested attribute. + */ kfree_skb(nskb); } else { err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7dad68e3b518..0d3cc115f2e7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2248,6 +2248,7 @@ struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom, skb_frag_ref(skb, i); } skb_shinfo(n)->nr_frags = i; + skb_shinfo(n)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; } if (skb_has_frag_list(skb)) { @@ -2786,6 +2787,8 @@ done: skb->data_len = 0; skb_set_tail_pointer(skb, len); } + if (!skb_shinfo(skb)->nr_frags && !skb_has_frag_list(skb)) + skb->unreadable = 0; if (!skb->sk || skb->destructor == sock_edemux) skb_condense(skb); @@ -2793,16 +2796,37 @@ done: } EXPORT_SYMBOL(___pskb_trim); +static int pskb_trim_rcsum_complete(struct sk_buff *skb, unsigned int len) +{ + int delta = skb->len - len; + + if (skb_frags_readable(skb)) { + skb->csum = csum_block_sub(skb->csum, + skb_checksum(skb, len, delta, 0), + len); + return 0; + } + + if (len > skb_headlen(skb)) + return -EFAULT; + + /* The trimmed bytes are unreadable, but the remaining packet can be + * checksummed by software after trimming. + */ + skb->ip_summed = CHECKSUM_NONE; + return 0; +} + /* Note : use pskb_trim_rcsum() instead of calling this directly */ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len) { if (skb->ip_summed == CHECKSUM_COMPLETE) { - int delta = skb->len - len; + int err; - skb->csum = csum_block_sub(skb->csum, - skb_checksum(skb, len, delta, 0), - len); + err = pskb_trim_rcsum_complete(skb, len); + if (err) + return err; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; int offset = skb_checksum_start_offset(skb) + skb->csum_offset; @@ -4349,6 +4373,8 @@ onlymerged: tgt->ip_summed = CHECKSUM_PARTIAL; skb->ip_summed = CHECKSUM_PARTIAL; + skb_shinfo(tgt)->flags |= skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG; + skb_len_add(skb, -shiftlen); skb_len_add(tgt, shiftlen); @@ -4959,7 +4985,8 @@ normal: skb_copy_from_linear_data_offset(head_skb, offset, skb_put(nskb, hsize), hsize); - skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & + skb_shinfo(nskb)->flags |= (skb_shinfo(head_skb)->flags | + skb_shinfo(frag_skb)->flags) & SKBFL_SHARED_FRAG; if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) @@ -4976,6 +5003,9 @@ normal: nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; frag_skb = list_skb; + + skb_shinfo(nskb)->flags |= skb_shinfo(frag_skb)->flags & SKBFL_SHARED_FRAG; + if (!skb_headlen(list_skb)) { BUG_ON(!nfrags); } else { @@ -6200,6 +6230,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, from_shinfo->frags, from_shinfo->nr_frags * sizeof(skb_frag_t)); to_shinfo->nr_frags += from_shinfo->nr_frags; + if (from_shinfo->nr_frags) + to_shinfo->flags |= from_shinfo->flags & SKBFL_SHARED_FRAG; if (!skb_cloned(from)) from_shinfo->nr_frags = 0; @@ -6801,6 +6833,8 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_kfree_head(data); return -ENOMEM; } + if (skb_zcopy(skb)) + net_zcopy_get(skb_zcopy(skb)); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) @@ -6944,6 +6978,8 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, skb_kfree_head(data); return -ENOMEM; } + if (skb_zcopy(skb)) + net_zcopy_get(skb_zcopy(skb)); skb_release_data(skb, SKB_CONSUMED); skb->head = data; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 6187a83bd741..e1850caf1a71 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1268,12 +1268,19 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { const struct proto_ops *ops = NULL; + struct sk_psock *psock; struct socket *sock; int copied; trace_sk_data_ready(sk); rcu_read_lock(); + psock = sk_psock(sk); + if (psock && tls_sw_has_ctx_rx(sk)) { + psock->saved_data_ready(sk); + rcu_read_unlock(); + return; + } sock = READ_ONCE(sk->sk_socket); if (likely(sock)) ops = READ_ONCE(sock->ops); @@ -1283,8 +1290,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk) copied = ops->read_skb(sk, sk_psock_verdict_recv); if (copied >= 0) { - struct sk_psock *psock; - rcu_read_lock(); psock = sk_psock(sk); if (psock) diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 02a68be3002a..99e3789492a0 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1630,18 +1630,23 @@ void sock_map_unhash(struct sock *sk) void (*saved_unhash)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_unhash = READ_ONCE(sk->sk_prot)->unhash; + if (unlikely(saved_unhash == sock_map_unhash)) + goto retry; } else { saved_unhash = psock->saved_unhash; sock_map_remove_links(sk, psock); rcu_read_unlock(); + + if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) + return; } - if (WARN_ON_ONCE(saved_unhash == sock_map_unhash)) - return; + if (saved_unhash) saved_unhash(sk); } @@ -1652,20 +1657,25 @@ void sock_map_destroy(struct sock *sk) void (*saved_destroy)(struct sock *sk); struct sk_psock *psock; +retry: rcu_read_lock(); psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); saved_destroy = READ_ONCE(sk->sk_prot)->destroy; + if (unlikely(saved_destroy == sock_map_destroy)) + goto retry; } else { saved_destroy = psock->saved_destroy; sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock); sk_psock_put(sk, psock); + + if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) + return; } - if (WARN_ON_ONCE(saved_destroy == sock_map_destroy)) - return; + if (saved_destroy) saved_destroy(sk); } @@ -1676,32 +1686,33 @@ void sock_map_close(struct sock *sk, long timeout) void (*saved_close)(struct sock *sk, long timeout); struct sk_psock *psock; +retry: lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (likely(psock)) { saved_close = psock->saved_close; sock_map_remove_links(sk, psock); - psock = sk_psock_get(sk); - if (unlikely(!psock)) - goto no_psock; rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); + + /* Make sure we do not recurse. This is a bug. + * Leak the socket instead of crashing on a stack overflow. + */ + if (WARN_ON_ONCE(saved_close == sock_map_close)) + return; } else { saved_close = READ_ONCE(sk->sk_prot)->close; -no_psock: rcu_read_unlock(); release_sock(sk); + + if (unlikely(saved_close == sock_map_close)) + goto retry; } - /* Make sure we do not recurse. This is a bug. - * Leak the socket instead of crashing on a stack overflow. - */ - if (WARN_ON_ONCE(saved_close == sock_map_close)) - return; saved_close(sk, timeout); } EXPORT_SYMBOL_GPL(sock_map_close); |
