From 545cd5e5ec5477c325e4098b6fd21213dceda408 Mon Sep 17 00:00:00 2001
From: Alexander Duyck
Date: Fri, 24 Mar 2017 10:07:53 -0700
Subject: net: Busy polling should ignore sender CPUs

This patch is a cleanup/fix for NAPI IDs following the changes that made
it so that sender_cpu and napi_id do a better job of sharing the same
location in the sk_buff.

One issue I found is that we weren't validating the napi_id before we
started trying to set up busy polling. This change corrects that by using
the MIN_NAPI_ID value that is now used both when allocating the NAPI IDs
and when validating them.

Signed-off-by: Alexander Duyck
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 7869ae3837ca..ab337bf5bbf4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5066,15 +5066,20 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
 	int (*napi_poll)(struct napi_struct *napi, int budget);
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
+	unsigned int napi_id;
 	int rc;
 
 restart:
+	napi_id = READ_ONCE(sk->sk_napi_id);
+	if (napi_id < MIN_NAPI_ID)
+		return 0;
+
 	rc = false;
 	napi_poll = NULL;
 
 	rcu_read_lock();
 
-	napi = napi_by_id(sk->sk_napi_id);
+	napi = napi_by_id(napi_id);
 	if (!napi)
 		goto out;
 
@@ -5143,10 +5148,10 @@ static void napi_hash_add(struct napi_struct *napi)
 
 	spin_lock(&napi_hash_lock);
 
-	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
+	/* 0..NR_CPUS range is reserved for sender_cpu use */
 	do {
-		if (unlikely(++napi_gen_id < NR_CPUS + 1))
-			napi_gen_id = NR_CPUS + 1;
+		if (unlikely(++napi_gen_id < MIN_NAPI_ID))
+			napi_gen_id = MIN_NAPI_ID;
 	} while (napi_by_id(napi_gen_id));
 
 	napi->napi_id = napi_gen_id;
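The validation above works because NAPI IDs and sender_cpu values now occupy
disjoint ranges in the shared sk_buff field. As a rough sketch of the
resulting range split (not part of the patch; it assumes MIN_NAPI_ID is
defined as NR_CPUS + 1 in include/net/busy_poll.h, a header change that
falls outside this 'net'-limited listing):

/* Hypothetical helper illustrating the check introduced above: values
 * 0..NR_CPUS in the shared field are sender_cpu values (or simply unset),
 * so only values >= MIN_NAPI_ID name a NAPI instance that napi_by_id()
 * could ever find.
 */
static bool sk_napi_id_is_pollable(const struct sock *sk)
{
	unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

	return napi_id >= MIN_NAPI_ID;
}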
From e5907459ce7e2b6bc397007865ad492f10c2aeac Mon Sep 17 00:00:00 2001
From: Alexander Duyck
Date: Fri, 24 Mar 2017 10:08:00 -0700
Subject: tcp: Record Rx hash and NAPI ID in tcp_child_process

While working on some recent busy poll changes we found that child sockets
were being instantiated without the NAPI ID being set. In our first attempt
to fix it, it was suggested that we just pull programming of the NAPI ID
into the function itself, since all callers will need to have it set.

In addition to the NAPI ID change, I have dropped the code that was
populating the Rx hash, since it was actually being populated in
tcp_get_cookie_sock.

Reported-by: Sridhar Samudrala
Signed-off-by: Alexander Duyck
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/ipv4/tcp_ipv4.c      | 2 --
 net/ipv4/tcp_minisocks.c | 4 ++++
 net/ipv6/tcp_ipv6.c      | 2 --
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7482b5d11861..20cbd2f07f28 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1409,8 +1409,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		if (!nsk)
 			goto discard;
 		if (nsk != sk) {
-			sock_rps_save_rxhash(nsk, skb);
-			sk_mark_napi_id(nsk, skb);
 			if (tcp_child_process(sk, nsk, skb)) {
 				rsk = nsk;
 				goto reset;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1e217948be62..8f6373b0cd77 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -26,6 +26,7 @@
 #include <net/tcp.h>
 #include <net/inet_common.h>
 #include <net/xfrm.h>
+#include <net/busy_poll.h>
 
 int sysctl_tcp_abort_on_overflow __read_mostly;
 
@@ -799,6 +800,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 	int ret = 0;
 	int state = child->sk_state;
 
+	/* record NAPI ID of child */
+	sk_mark_napi_id(child, skb);
+
 	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
 		ret = tcp_rcv_state_process(child, skb);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 031a8c019f7a..8e42e8f54b70 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1293,8 +1293,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 			goto discard;
 
 		if (nsk != sk) {
-			sock_rps_save_rxhash(nsk, skb);
-			sk_mark_napi_id(nsk, skb);
 			if (tcp_child_process(sk, nsk, skb))
 				goto reset;
 			if (opt_skb)
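The fix works because tcp_child_process() is the one place every child
socket passes through when its first segment is handled, so marking the
NAPI ID there covers all callers at once. For reference, a simplified
sketch of what sk_mark_napi_id() roughly looks like in this era's
include/net/busy_poll.h (paraphrased, not the verbatim source):

/* Copy the NAPI ID that the driver recorded in the skb at Rx time into
 * the socket; with CONFIG_NET_RX_BUSY_POLL disabled this compiles away
 * to nothing.
 */
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = skb->napi_id;
#endif
}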
From 2b5cd0dfa384242f78a396b90087368c9440cc9a Mon Sep 17 00:00:00 2001
From: Alexander Duyck
Date: Fri, 24 Mar 2017 10:08:12 -0700
Subject: net: Change return type of sk_busy_loop from bool to void

From what I can tell, there are only a couple of spots where we are
actually checking the return value of sk_busy_loop. As there are only a
few consumers of that data, and the data being checked for can be replaced
with a check for !skb_queue_empty(), we might as well just pull the code
out of sk_busy_loop and place it in the spots that actually need it.

Signed-off-by: Alexander Duyck
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/datagram.c |  8 ++++++--
 net/core/dev.c      | 25 +++++++++++--------------
 net/sctp/socket.c   |  9 ++++++---
 3 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index ea633342ab0d..4608aa245410 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 		}
 
 		spin_unlock_irqrestore(&queue->lock, cpu_flags);
-	} while (sk_can_busy_loop(sk) &&
-		 sk_busy_loop(sk, flags & MSG_DONTWAIT));
+
+		if (!sk_can_busy_loop(sk))
+			break;
+
+		sk_busy_loop(sk, flags & MSG_DONTWAIT);
+	} while (!skb_queue_empty(&sk->sk_receive_queue));
 
 	error = -EAGAIN;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ab337bf5bbf4..af70eb6ba682 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5060,21 +5060,19 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 	do_softirq();
 }
 
-bool sk_busy_loop(struct sock *sk, int nonblock)
+void sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
 	int (*napi_poll)(struct napi_struct *napi, int budget);
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
 	unsigned int napi_id;
-	int rc;
 
 restart:
 	napi_id = READ_ONCE(sk->sk_napi_id);
 	if (napi_id < MIN_NAPI_ID)
-		return 0;
+		return;
 
-	rc = false;
 	napi_poll = NULL;
 
 	rcu_read_lock();
@@ -5085,7 +5083,8 @@ restart:
 
 	preempt_disable();
 	for (;;) {
-		rc = 0;
+		int work = 0;
+
 		local_bh_disable();
 		if (!napi_poll) {
 			unsigned long val = READ_ONCE(napi->state);
@@ -5103,12 +5102,12 @@
 			have_poll_lock = netpoll_poll_lock(napi);
 			napi_poll = napi->poll;
 		}
-		rc = napi_poll(napi, BUSY_POLL_BUDGET);
-		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+		work = napi_poll(napi, BUSY_POLL_BUDGET);
+		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
 count:
-		if (rc > 0)
+		if (work > 0)
 			__NET_ADD_STATS(sock_net(sk),
-					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
+					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 		local_bh_enable();
 
 		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
@@ -5121,9 +5120,9 @@ count:
 			preempt_enable();
 			rcu_read_unlock();
 			cond_resched();
-			rc = !skb_queue_empty(&sk->sk_receive_queue);
-			if (rc || busy_loop_timeout(end_time))
-				return rc;
+			if (!skb_queue_empty(&sk->sk_receive_queue) ||
+			    busy_loop_timeout(end_time))
+				return;
 			goto restart;
 		}
 		cpu_relax();
@@ -5131,10 +5130,8 @@ count:
 	if (napi_poll)
 		busy_poll_stop(napi, have_poll_lock);
 	preempt_enable();
-	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
 	rcu_read_unlock();
-	return rc;
 }
 EXPORT_SYMBOL(sk_busy_loop);
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 72cc3ecf6516..ccc08fc39722 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7518,9 +7518,12 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
 		if (sk->sk_shutdown & RCV_SHUTDOWN)
 			break;
 
-		if (sk_can_busy_loop(sk) &&
-		    sk_busy_loop(sk, noblock))
-			continue;
+		if (sk_can_busy_loop(sk)) {
+			sk_busy_loop(sk, noblock);
+
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				continue;
+		}
 
 		/* User doesn't want to wait.  */
 		error = -EAGAIN;
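With the return value gone, every busy-poll caller takes the same shape, as
the datagram and SCTP hunks above show: poll, then re-check the receive
queue directly. A condensed sketch of that caller pattern (an illustrative
helper, not a specific call site from the patch):

/* Busy poll if the socket supports it, then report whether any packets
 * are now queued; sk_busy_loop() itself no longer says.
 */
static bool busy_poll_for_data(struct sock *sk, int nonblock)
{
	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, nonblock);

	/* packets may have been delivered while we were polling */
	return !skb_queue_empty(&sk->sk_receive_queue);
}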
From 37056719bba500d0d2b8216fdf641e5507ec9a0e Mon Sep 17 00:00:00 2001
From: Alexander Duyck
Date: Fri, 24 Mar 2017 10:08:18 -0700
Subject: net: Track start of busy loop instead of when it should end

This patch flips the logic we were using to determine if the busy polling
has timed out. The main motivation for this is that we will need to
support two different possible timeout values in the future, and by
recording the start time rather than the end time we can focus on making
the end time specific to the task, be it epoll or socket based polling.

Signed-off-by: Alexander Duyck
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index af70eb6ba682..2d1b5613b7fd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5062,7 +5062,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 
 void sk_busy_loop(struct sock *sk, int nonblock)
 {
-	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+	unsigned long start_time = nonblock ? 0 : busy_loop_current_time();
 	int (*napi_poll)(struct napi_struct *napi, int budget);
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
@@ -5111,7 +5111,7 @@ count:
 		local_bh_enable();
 
 		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
-		    busy_loop_timeout(end_time))
+		    sk_busy_loop_timeout(sk, start_time))
 			break;
 
 		if (unlikely(need_resched())) {
@@ -5121,7 +5121,7 @@ count:
 			rcu_read_unlock();
 			cond_resched();
 			if (!skb_queue_empty(&sk->sk_receive_queue) ||
-			    busy_loop_timeout(end_time))
+			    sk_busy_loop_timeout(sk, start_time))
 				return;
 			goto restart;
 		}

From 7db6b048da3b9f84fe1d22fb29ff7e7c2ec6c0e5 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala
Date: Fri, 24 Mar 2017 10:08:24 -0700
Subject: net: Commonize busy polling code to focus on napi_id instead of socket

Move the core functionality in sk_busy_loop() to napi_busy_loop() and
make it independent of the socket. This enables reusing the function in
the epoll busy loop implementation.

Signed-off-by: Sridhar Samudrala
Signed-off-by: Alexander Duyck
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c  | 21 ++++++++-------------
 net/core/sock.c | 11 +++++++++++
 2 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 2d1b5613b7fd..ef9fe60ee294 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5060,19 +5060,16 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 	do_softirq();
 }
 
-void sk_busy_loop(struct sock *sk, int nonblock)
+void napi_busy_loop(unsigned int napi_id,
+		    bool (*loop_end)(void *, unsigned long),
+		    void *loop_end_arg)
 {
-	unsigned long start_time = nonblock ? 0 : busy_loop_current_time();
+	unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
 	int (*napi_poll)(struct napi_struct *napi, int budget);
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
-	unsigned int napi_id;
 
 restart:
-	napi_id = READ_ONCE(sk->sk_napi_id);
-	if (napi_id < MIN_NAPI_ID)
-		return;
-
 	napi_poll = NULL;
 
 	rcu_read_lock();
@@ -5106,12 +5103,11 @@ restart:
 		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
 count:
 		if (work > 0)
-			__NET_ADD_STATS(sock_net(sk),
+			__NET_ADD_STATS(dev_net(napi->dev),
 					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 		local_bh_enable();
 
-		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
-		    sk_busy_loop_timeout(sk, start_time))
+		if (!loop_end || loop_end(loop_end_arg, start_time))
 			break;
 
 		if (unlikely(need_resched())) {
@@ -5120,8 +5116,7 @@ count:
 			preempt_enable();
 			rcu_read_unlock();
 			cond_resched();
-			if (!skb_queue_empty(&sk->sk_receive_queue) ||
-			    sk_busy_loop_timeout(sk, start_time))
+			if (loop_end(loop_end_arg, start_time))
 				return;
 			goto restart;
 		}
@@ -5133,7 +5128,7 @@ count:
 out:
 	rcu_read_unlock();
 }
-EXPORT_SYMBOL(sk_busy_loop);
+EXPORT_SYMBOL(napi_busy_loop);
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 1b9030ee6f4b..4b762f2a3552 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3237,3 +3237,14 @@ static int __init proto_init(void)
 subsys_initcall(proto_init);
 
 #endif /* PROC_FS */
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+bool sk_busy_loop_end(void *p, unsigned long start_time)
+{
+	struct sock *sk = p;
+
+	return !skb_queue_empty(&sk->sk_receive_queue) ||
+	       sk_busy_loop_timeout(sk, start_time);
+}
+EXPORT_SYMBOL(sk_busy_loop_end);
+#endif /* CONFIG_NET_RX_BUSY_POLL */
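With the loop keyed on a napi_id and parameterized by a loop_end callback,
sk_busy_loop() can be rebuilt as a thin wrapper over napi_busy_loop() plus
the sk_busy_loop_end() helper added above. A sketch of how that wrapper
plausibly looks (the real one lives in include/net/busy_poll.h, outside
this 'net'-limited listing):

/* Sketch: socket busy polling expressed via napi_busy_loop(). Passing a
 * NULL loop_end yields a single polling pass, matching the old nonblock
 * behavior, since napi_busy_loop() breaks out of its loop right after
 * the first poll when no callback is supplied.
 */
static inline void sk_busy_loop(struct sock *sk, int nonblock)
{
	unsigned int napi_id = READ_ONCE(sk->sk_napi_id);

	if (napi_id >= MIN_NAPI_ID)
		napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk);
}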
From 6d4339028b350efbf87c61e6d9e113e5373545c9 Mon Sep 17 00:00:00 2001
From: Sridhar Samudrala
Date: Fri, 24 Mar 2017 10:08:36 -0700
Subject: net: Introduce SO_INCOMING_NAPI_ID

This socket option returns the NAPI ID associated with the queue on which
the last frame was received. This information can be used by applications
to split incoming flows among threads based on the Rx queue on which the
flows are received.

If the NAPI ID actually represents a sender_cpu, the value is ignored and
0 is returned.

Signed-off-by: Sridhar Samudrala
Signed-off-by: Alexander Duyck
Acked-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/sock.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 4b762f2a3552..1a58a9dc6888 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1328,6 +1328,18 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		goto lenout;
 	}
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	case SO_INCOMING_NAPI_ID:
+		v.val = READ_ONCE(sk->sk_napi_id);
+
+		/* aggregate non-NAPI IDs down to 0 */
+		if (v.val < MIN_NAPI_ID)
+			v.val = 0;
+
+		break;
+#endif
+
 	default:
 		/* We implement the SO_SNDLOWAT etc to not be settable
 		 * (1003.1g 7).
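A hypothetical userspace sketch of how an application might consume this
option to shard accepted connections across worker threads by Rx queue
(SO_INCOMING_NAPI_ID is 56 per asm-generic/socket.h; the fallback define
below is an assumption for toolchain headers that predate this option):

#include <sys/socket.h>

#ifndef SO_INCOMING_NAPI_ID
#define SO_INCOMING_NAPI_ID 56
#endif

/* Return the NAPI ID of the queue that delivered the last frame on fd,
 * or 0 if none was recorded (e.g. loopback traffic, or a kernel built
 * without CONFIG_NET_RX_BUSY_POLL, where the getsockopt() call fails).
 */
static unsigned int get_incoming_napi_id(int fd)
{
	unsigned int napi_id = 0;
	socklen_t len = sizeof(napi_id);

	if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len) < 0)
		return 0;

	return napi_id;
}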