summary | refs | log | tree | commit | diff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r-- net/core/dev.c 59
-rw-r--r-- net/core/dev.h 35
-rw-r--r-- net/core/devmem.c 6
-rw-r--r-- net/core/filter.c 13
-rw-r--r-- net/core/neighbour.c 3
-rw-r--r-- net/core/netpoll.c 2
-rw-r--r-- net/core/page_pool_user.c 4
-rw-r--r-- net/core/secure_seq.c 80
-rw-r--r-- net/core/skbuff.c 23
-rw-r--r-- net/core/skmsg.c 14
10 files changed, 110 insertions, 129 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 6ff4256700e6..14a83f2035b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3987,7 +3987,7 @@ static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
if (shinfo->nr_frags > 0) {
niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
if (net_is_devmem_iov(niov) &&
- net_devmem_iov_binding(niov)->dev != dev)
+ READ_ONCE(net_devmem_iov_binding(niov)->dev) != dev)
goto out_free;
}
@@ -4818,10 +4818,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
- /* Other cpus might concurrently change txq->xmit_lock_owner
- * to -1 or to their cpu id, but not to our id.
- */
- if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
+ if (!netif_tx_owned(txq, cpu)) {
+ bool is_list = false;
+
if (dev_xmit_recursion())
goto recursion_alert;
@@ -4832,17 +4831,28 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
+ is_list = !!skb->next;
+
dev_xmit_recursion_inc();
skb = dev_hard_start_xmit(skb, dev, txq, &rc);
dev_xmit_recursion_dec();
- if (dev_xmit_complete(rc)) {
- HARD_TX_UNLOCK(dev, txq);
- goto out;
- }
+
+ /* GSO segments a single SKB into
+ * a list of frames. TCP expects error
+ * to mean none of the data was sent.
+ */
+ if (is_list)
+ rc = NETDEV_TX_OK;
}
HARD_TX_UNLOCK(dev, txq);
+ if (!skb) /* xmit completed */
+ goto out;
+
net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
dev->name);
+ /* NETDEV_TX_BUSY or queue was stopped */
+ if (!is_list)
+ rc = -ENETDOWN;
} else {
/* Recursion is detected! It is possible,
* unfortunately
@@ -4850,10 +4860,10 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
recursion_alert:
net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
dev->name);
+ rc = -ENETDOWN;
}
}
- rc = -ENETDOWN;
rcu_read_unlock_bh();
dev_core_stats_tx_dropped_inc(dev);
@@ -4992,8 +5002,7 @@ static bool rps_flow_is_active(struct rps_dev_flow *rflow,
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
- u32 flow_id)
+ struct rps_dev_flow *rflow, u16 next_cpu, u32 hash)
{
if (next_cpu < nr_cpu_ids) {
u32 head;
@@ -5004,6 +5013,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *tmp_rflow;
unsigned int tmp_cpu;
u16 rxq_index;
+ u32 flow_id;
int rc;
/* Should we steer this flow to a different hardware queue? */
@@ -5019,6 +5029,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (!flow_table)
goto out;
+ flow_id = rfs_slot(hash, flow_table);
tmp_rflow = &flow_table->flows[flow_id];
tmp_cpu = READ_ONCE(tmp_rflow->cpu);
@@ -5066,7 +5077,6 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
- u32 flow_id;
u32 tcpu;
u32 hash;
@@ -5113,8 +5123,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- flow_id = rfs_slot(hash, flow_table);
- rflow = &flow_table->flows[flow_id];
+ rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
tcpu = rflow->cpu;
/*
@@ -5133,8 +5142,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
- rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
- flow_id);
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
@@ -7783,11 +7791,12 @@ static int napi_thread_wait(struct napi_struct *napi)
return -1;
}
-static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
+static void napi_threaded_poll_loop(struct napi_struct *napi,
+ unsigned long *busy_poll_last_qs)
{
+ unsigned long last_qs = busy_poll_last_qs ? *busy_poll_last_qs : jiffies;
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
struct softnet_data *sd;
- unsigned long last_qs = jiffies;
for (;;) {
bool repoll = false;
@@ -7816,12 +7825,12 @@ static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
/* When busy poll is enabled, the old packets are not flushed in
* napi_complete_done. So flush them here.
*/
- if (busy_poll)
+ if (busy_poll_last_qs)
gro_flush_normal(&napi->gro, HZ >= 1000);
local_bh_enable();
/* Call cond_resched here to avoid watchdog warnings. */
- if (repoll || busy_poll) {
+ if (repoll || busy_poll_last_qs) {
rcu_softirq_qs_periodic(last_qs);
cond_resched();
}
@@ -7829,11 +7838,15 @@ static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
if (!repoll)
break;
}
+
+ if (busy_poll_last_qs)
+ *busy_poll_last_qs = last_qs;
}
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
+ unsigned long last_qs = jiffies;
bool want_busy_poll;
bool in_busy_poll;
unsigned long val;
@@ -7851,7 +7864,7 @@ static int napi_threaded_poll(void *data)
assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
want_busy_poll);
- napi_threaded_poll_loop(napi, want_busy_poll);
+ napi_threaded_poll_loop(napi, want_busy_poll ? &last_qs : NULL);
}
return 0;
@@ -13164,7 +13177,7 @@ static void run_backlog_napi(unsigned int cpu)
{
struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
- napi_threaded_poll_loop(&sd->backlog, false);
+ napi_threaded_poll_loop(&sd->backlog, NULL);
}
static void backlog_napi_setup(unsigned int cpu)
diff --git a/net/core/dev.h b/net/core/dev.h
index 98793a738f43..781619e76b3e 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -366,41 +366,6 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi)
void kick_defer_list_purge(unsigned int cpu);
-#define XMIT_RECURSION_LIMIT 8
-
-#ifndef CONFIG_PREEMPT_RT
-static inline bool dev_xmit_recursion(void)
-{
- return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
- XMIT_RECURSION_LIMIT);
-}
-
-static inline void dev_xmit_recursion_inc(void)
-{
- __this_cpu_inc(softnet_data.xmit.recursion);
-}
-
-static inline void dev_xmit_recursion_dec(void)
-{
- __this_cpu_dec(softnet_data.xmit.recursion);
-}
-#else
-static inline bool dev_xmit_recursion(void)
-{
- return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
-}
-
-static inline void dev_xmit_recursion_inc(void)
-{
- current->net_xmit.recursion++;
-}
-
-static inline void dev_xmit_recursion_dec(void)
-{
- current->net_xmit.recursion--;
-}
-#endif
-
int dev_set_hwtstamp_phylib(struct net_device *dev,
struct kernel_hwtstamp_config *cfg,
struct netlink_ext_ack *extack);
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 8c9aad776bf4..69d79aee07ef 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -396,7 +396,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
* net_device.
*/
dst_dev = dst_dev_rcu(dst);
- if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) {
+ if (unlikely(!dst_dev) ||
+ unlikely(dst_dev != READ_ONCE(binding->dev))) {
err = -ENODEV;
goto out_unlock;
}
@@ -513,7 +514,8 @@ static void mp_dmabuf_devmem_uninstall(void *mp_priv,
xa_erase(&binding->bound_rxqs, xa_idx);
if (xa_empty(&binding->bound_rxqs)) {
mutex_lock(&binding->lock);
- binding->dev = NULL;
+ ASSERT_EXCLUSIVE_WRITER(binding->dev);
+ WRITE_ONCE(binding->dev, NULL);
mutex_unlock(&binding->lock);
}
break;
diff --git a/net/core/filter.c b/net/core/filter.c
index 0d5d5a17acb2..78b548158fb0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2228,6 +2228,9 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
return -ENOMEM;
}
+ if (unlikely(!ipv6_mod_enabled()))
+ goto out_drop;
+
rcu_read_lock();
if (!nh) {
dst = skb_dst(skb);
@@ -2335,6 +2338,10 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
} else if (nh->nh_family == AF_INET6) {
+ if (unlikely(!ipv6_mod_enabled())) {
+ rcu_read_unlock();
+ goto out_drop;
+ }
neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
is_v6gw = true;
} else if (nh->nh_family == AF_INET) {
@@ -4150,12 +4157,14 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
struct xdp_rxq_info *rxq = xdp->rxq;
- unsigned int tailroom;
+ int tailroom;
if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
return -EOPNOTSUPP;
- tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
+ tailroom = rxq->frag_size - skb_frag_size(frag) -
+ skb_frag_off(frag) % rxq->frag_size;
+ WARN_ON_ONCE(tailroom < 0);
if (unlikely(offset > tailroom))
return -EINVAL;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a95cfe77f7f0..c56a4e7bf790 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -820,7 +820,8 @@ int pneigh_create(struct neigh_table *tbl, struct net *net,
update:
WRITE_ONCE(n->flags, flags);
n->permanent = permanent;
- WRITE_ONCE(n->protocol, protocol);
+ if (protocol)
+ WRITE_ONCE(n->protocol, protocol);
out:
mutex_unlock(&tbl->phash_lock);
return err;
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a8558a52884f..cd74beffd209 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -132,7 +132,7 @@ static int netif_local_xmit_active(struct net_device *dev)
for (i = 0; i < dev->num_tx_queues; i++) {
struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
- if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id())
+ if (netif_tx_owned(txq, smp_processor_id()))
return 1;
}
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
index c82a95beceff..ee5060d8eec0 100644
--- a/net/core/page_pool_user.c
+++ b/net/core/page_pool_user.c
@@ -245,7 +245,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
goto err_cancel;
if (pool->user.detach_time &&
nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME,
- pool->user.detach_time))
+ ktime_divns(pool->user.detach_time, NSEC_PER_SEC)))
goto err_cancel;
if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL))
@@ -337,7 +337,7 @@ err_unlock:
void page_pool_detached(struct page_pool *pool)
{
mutex_lock(&page_pools_lock);
- pool->user.detach_time = ktime_get_boottime_seconds();
+ pool->user.detach_time = ktime_get_boottime();
netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF);
mutex_unlock(&page_pools_lock);
}
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 9a3965680451..6a6f2cda5aae 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -20,7 +20,6 @@
#include <net/tcp.h>
static siphash_aligned_key_t net_secret;
-static siphash_aligned_key_t ts_secret;
#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)
@@ -28,11 +27,6 @@ static __always_inline void net_secret_init(void)
{
net_get_random_once(&net_secret, sizeof(net_secret));
}
-
-static __always_inline void ts_secret_init(void)
-{
- net_get_random_once(&ts_secret, sizeof(ts_secret));
-}
#endif
#ifdef CONFIG_INET
@@ -53,28 +47,9 @@ static u32 seq_scale(u32 seq)
#endif
#if IS_ENABLED(CONFIG_IPV6)
-u32 secure_tcpv6_ts_off(const struct net *net,
- const __be32 *saddr, const __be32 *daddr)
-{
- const struct {
- struct in6_addr saddr;
- struct in6_addr daddr;
- } __aligned(SIPHASH_ALIGNMENT) combined = {
- .saddr = *(struct in6_addr *)saddr,
- .daddr = *(struct in6_addr *)daddr,
- };
-
- if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
- return 0;
-
- ts_secret_init();
- return siphash(&combined, offsetofend(typeof(combined), daddr),
- &ts_secret);
-}
-EXPORT_IPV6_MOD(secure_tcpv6_ts_off);
-
-u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
- __be16 sport, __be16 dport)
+union tcp_seq_and_ts_off
+secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr,
+ const __be32 *daddr, __be16 sport, __be16 dport)
{
const struct {
struct in6_addr saddr;
@@ -87,14 +62,20 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
.sport = sport,
.dport = dport
};
- u32 hash;
+ union tcp_seq_and_ts_off st;
net_secret_init();
- hash = siphash(&combined, offsetofend(typeof(combined), dport),
- &net_secret);
- return seq_scale(hash);
+
+ st.hash64 = siphash(&combined, offsetofend(typeof(combined), dport),
+ &net_secret);
+
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
+ st.ts_off = 0;
+
+ st.seq = seq_scale(st.seq);
+ return st;
}
-EXPORT_SYMBOL(secure_tcpv6_seq);
+EXPORT_SYMBOL(secure_tcpv6_seq_and_ts_off);
u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport)
@@ -118,33 +99,30 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#endif
#ifdef CONFIG_INET
-u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
-{
- if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
- return 0;
-
- ts_secret_init();
- return siphash_2u32((__force u32)saddr, (__force u32)daddr,
- &ts_secret);
-}
-
/* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d),
* but fortunately, `sport' cannot be 0 in any circumstances. If this changes,
* it would be easy enough to have the former function use siphash_4u32, passing
* the arguments as separate u32.
*/
-u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
+union tcp_seq_and_ts_off
+secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
{
- u32 hash;
+ u32 ports = (__force u32)sport << 16 | (__force u32)dport;
+ union tcp_seq_and_ts_off st;
net_secret_init();
- hash = siphash_3u32((__force u32)saddr, (__force u32)daddr,
- (__force u32)sport << 16 | (__force u32)dport,
- &net_secret);
- return seq_scale(hash);
+
+ st.hash64 = siphash_3u32((__force u32)saddr, (__force u32)daddr,
+ ports, &net_secret);
+
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
+ st.ts_off = 0;
+
+ st.seq = seq_scale(st.seq);
+ return st;
}
-EXPORT_SYMBOL_GPL(secure_tcp_seq);
+EXPORT_SYMBOL_GPL(secure_tcp_seq_and_ts_off);
u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dc47d3efc72e..0e217041958a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5590,15 +5590,28 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
- bool ret;
+ struct socket *sock;
+ struct file *file;
+ bool ret = false;
if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
return true;
- read_lock_bh(&sk->sk_callback_lock);
- ret = sk->sk_socket && sk->sk_socket->file &&
- file_ns_capable(sk->sk_socket->file, &init_user_ns, CAP_NET_RAW);
- read_unlock_bh(&sk->sk_callback_lock);
+ /* The sk pointer remains valid as long as the skb is. The sk_socket and
+ * file pointer may become NULL if the socket is closed. Both structures
+ * (including file->cred) are RCU freed which means they can be accessed
+ * within a RCU read section.
+ */
+ rcu_read_lock();
+ sock = READ_ONCE(sk->sk_socket);
+ if (!sock)
+ goto out;
+ file = READ_ONCE(sock->file);
+ if (!file)
+ goto out;
+ ret = file_ns_capable(file, &init_user_ns, CAP_NET_RAW);
+out:
+ rcu_read_unlock();
return ret;
}
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 2e26174c9919..3261793abe83 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -1205,8 +1205,8 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
return;
psock->saved_data_ready = sk->sk_data_ready;
- sk->sk_data_ready = sk_psock_strp_data_ready;
- sk->sk_write_space = sk_psock_write_space;
+ WRITE_ONCE(sk->sk_data_ready, sk_psock_strp_data_ready);
+ WRITE_ONCE(sk->sk_write_space, sk_psock_write_space);
}
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
@@ -1216,8 +1216,8 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
if (!psock->saved_data_ready)
return;
- sk->sk_data_ready = psock->saved_data_ready;
- psock->saved_data_ready = NULL;
+ WRITE_ONCE(sk->sk_data_ready, psock->saved_data_ready);
+ WRITE_ONCE(psock->saved_data_ready, NULL);
strp_stop(&psock->strp);
}
@@ -1296,8 +1296,8 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
return;
psock->saved_data_ready = sk->sk_data_ready;
- sk->sk_data_ready = sk_psock_verdict_data_ready;
- sk->sk_write_space = sk_psock_write_space;
+ WRITE_ONCE(sk->sk_data_ready, sk_psock_verdict_data_ready);
+ WRITE_ONCE(sk->sk_write_space, sk_psock_write_space);
}
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
@@ -1308,6 +1308,6 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
if (!psock->saved_data_ready)
return;
- sk->sk_data_ready = psock->saved_data_ready;
+ WRITE_ONCE(sk->sk_data_ready, psock->saved_data_ready);
psock->saved_data_ready = NULL;
}