Diffstat (limited to 'net')
-rw-r--r--  net/core/dev.c              111
-rw-r--r--  net/core/net-sysfs.c         94
-rw-r--r--  net/core/sysctl_net_core.c   68
-rw-r--r--  net/ipv4/af_inet.c           29
-rw-r--r--  net/ipv4/tcp_ipv4.c           2
-rw-r--r--  net/ipv4/udp.c                7
6 files changed, 283 insertions, 28 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index e8041eb76ac1..d7107ac835fa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2203,19 +2203,28 @@ int weight_p __read_mostly = 64; /* old backlog weight */
DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
* CPU from the RPS map of the receiving queue for a given skb.
* rcu_read_lock must be held on entry.
*/
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow **rflowp)
{
struct ipv6hdr *ip6;
struct iphdr *ip;
struct netdev_rx_queue *rxqueue;
struct rps_map *map;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_sock_flow_table *sock_flow_table;
int cpu = -1;
u8 ip_proto;
+ u16 tcpu;
u32 addr1, addr2, ports, ihl;
if (skb_rx_queue_recorded(skb)) {
@@ -2232,7 +2241,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
} else
rxqueue = dev->_rx;
- if (!rxqueue->rps_map)
+ if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
goto done;
if (skb->rxhash)
@@ -2284,9 +2293,48 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
skb->rxhash = 1;
got_hash:
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ if (flow_table && sock_flow_table) {
+ u16 next_cpu;
+ struct rps_dev_flow *rflow;
+
+ rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
+ tcpu = rflow->cpu;
+
+ next_cpu = sock_flow_table->ents[skb->rxhash &
+ sock_flow_table->mask];
+
+ /*
+ * If the desired CPU (where last recvmsg was done) is
+ * different from current CPU (one in the rx-queue flow
+ * table entry), switch if one of the following holds:
+ * - Current CPU is unset (equal to RPS_NO_CPU).
+ * - Current CPU is offline.
+ * - The current CPU's queue tail has advanced beyond the
+ * last packet that was enqueued using this table entry.
+ * This guarantees that all previous packets for the flow
+ * have been dequeued, thus preserving in order delivery.
+ */
+ if (unlikely(tcpu != next_cpu) &&
+ (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
+ ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+ rflow->last_qtail)) >= 0)) {
+ tcpu = rflow->cpu = next_cpu;
+ if (tcpu != RPS_NO_CPU)
+ rflow->last_qtail = per_cpu(softnet_data,
+ tcpu).input_queue_head;
+ }
+ if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
+ *rflowp = rflow;
+ cpu = tcpu;
+ goto done;
+ }
+ }
+
map = rcu_dereference(rxqueue->rps_map);
if (map) {
- u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+ tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
if (cpu_online(tcpu)) {
cpu = tcpu;
@@ -2320,13 +2368,14 @@ static void trigger_softirq(void *data)
__napi_schedule(&queue->backlog);
__get_cpu_var(netdev_rx_stat).received_rps++;
}
-#endif /* CONFIG_SMP */
+#endif /* CONFIG_RPS */
/*
* enqueue_to_backlog is called to queue an skb to a per CPU backlog
* queue (may be a remote CPU queue).
*/
-static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
+ unsigned int *qtail)
{
struct softnet_data *queue;
unsigned long flags;
@@ -2341,6 +2390,10 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
if (queue->input_pkt_queue.qlen) {
enqueue:
__skb_queue_tail(&queue->input_pkt_queue, skb);
+#ifdef CONFIG_RPS
+ *qtail = queue->input_queue_head +
+ queue->input_pkt_queue.qlen;
+#endif
rps_unlock(queue);
local_irq_restore(flags);
return NET_RX_SUCCESS;
@@ -2355,11 +2408,10 @@ enqueue:
cpu_set(cpu, rcpus->mask[rcpus->select]);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
- } else
- __napi_schedule(&queue->backlog);
-#else
- __napi_schedule(&queue->backlog);
+ goto enqueue;
+ }
#endif
+ __napi_schedule(&queue->backlog);
}
goto enqueue;
}
@@ -2401,18 +2453,25 @@ int netif_rx(struct sk_buff *skb)
#ifdef CONFIG_RPS
{
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
rcu_read_lock();
- cpu = get_rps_cpu(skb->dev, skb);
+
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu < 0)
cpu = smp_processor_id();
- ret = enqueue_to_backlog(skb, cpu);
+
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+
rcu_read_unlock();
}
#else
- ret = enqueue_to_backlog(skb, get_cpu());
- put_cpu();
+ {
+ unsigned int qtail;
+ ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
+ put_cpu();
+ }
#endif
return ret;
}
@@ -2830,14 +2889,22 @@ out:
int netif_receive_skb(struct sk_buff *skb)
{
#ifdef CONFIG_RPS
- int cpu;
+ struct rps_dev_flow voidflow, *rflow = &voidflow;
+ int cpu, ret;
+
+ rcu_read_lock();
- cpu = get_rps_cpu(skb->dev, skb);
+ cpu = get_rps_cpu(skb->dev, skb, &rflow);
- if (cpu < 0)
- return __netif_receive_skb(skb);
- else
- return enqueue_to_backlog(skb, cpu);
+ if (cpu >= 0) {
+ ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ ret = __netif_receive_skb(skb);
+ }
+
+ return ret;
#else
return __netif_receive_skb(skb);
#endif
@@ -2856,6 +2923,7 @@ static void flush_backlog(void *arg)
if (skb->dev == dev) {
__skb_unlink(skb, &queue->input_pkt_queue);
kfree_skb(skb);
+ incr_input_queue_head(queue);
}
rps_unlock(queue);
}
@@ -3179,6 +3247,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
local_irq_enable();
break;
}
+ incr_input_queue_head(queue);
rps_unlock(queue);
local_irq_enable();
@@ -5542,8 +5611,10 @@ static int dev_cpu_callback(struct notifier_block *nfb,
local_irq_enable();
/* Process offline CPU's input_pkt_queue */
- while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
+ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
netif_rx(skb);
+ incr_input_queue_head(oldsd);
+ }
return NOTIFY_OK;
}
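Note on the dev.c hunks above: the core of the change is the new flow-steering decision in get_rps_cpu(). The per-queue rps_dev_flow_table remembers which CPU a flow was last steered to, the global rps_sock_flow_table records the CPU on which the owning application last ran, and the flow is only migrated to the application's CPU once the old CPU's backlog has drained past the last packet queued for this flow, so packets cannot be delivered out of order. Below is a minimal userspace model of that decision; the struct layout and helpers are simplified stand-ins (the real definitions live in include/linux/netdevice.h, outside this 'net'-limited diff), only the comparison logic is taken from the hunk above.

/*
 * Userspace sketch (not kernel code) of the RFS CPU-selection rule added to
 * get_rps_cpu() above.
 */
#include <stdio.h>
#include <stdbool.h>

#define RPS_NO_CPU 0xffff

struct flow_entry {              /* models one rps_dev_flow slot */
	unsigned short cpu;      /* CPU the flow was last steered to */
	unsigned int last_qtail; /* backlog tail when the last packet was queued */
};

/* Per-CPU state needed for the decision: how far each backlog has drained. */
static unsigned int input_queue_head[4];

static bool cpu_online_stub(unsigned short cpu) { return cpu < 4; }

/*
 * desired_cpu is what the global sock-flow table says (where the application
 * last ran); flow->cpu is where this flow is currently being processed.
 * Switching is only allowed once every packet previously queued for the flow
 * has been dequeued, preserving in-order delivery.
 */
static unsigned short pick_cpu(struct flow_entry *flow,
			       unsigned short desired_cpu)
{
	unsigned short tcpu = flow->cpu;

	if (tcpu != desired_cpu &&
	    (tcpu == RPS_NO_CPU || !cpu_online_stub(tcpu) ||
	     (int)(input_queue_head[tcpu] - flow->last_qtail) >= 0)) {
		tcpu = flow->cpu = desired_cpu;
		if (tcpu != RPS_NO_CPU)
			flow->last_qtail = input_queue_head[tcpu];
	}
	return tcpu;
}

int main(void)
{
	struct flow_entry flow = { .cpu = 1, .last_qtail = 10 };

	input_queue_head[1] = 5;	/* old CPU still has this flow's packets queued */
	printf("keep old cpu: %u\n", pick_cpu(&flow, 3));	/* stays on 1 */

	input_queue_head[1] = 12;	/* old CPU's backlog has drained past last_qtail */
	printf("switch cpu:   %u\n", pick_cpu(&flow, 3));	/* moves to 3 */
	return 0;
}

The signed cast on the unsigned subtraction mirrors the wrap-safe comparison of input_queue_head against last_qtail in the patch; input_queue_head itself is advanced by the new incr_input_queue_head() calls as packets are dequeued.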
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 96ed6905b823..143052a22b9b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -17,6 +17,7 @@
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/wireless.h>
+#include <linux/vmalloc.h>
#include <net/wext.h>
#include "net-sysfs.h"
@@ -601,22 +602,109 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
return len;
}
+static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ char *buf)
+{
+ struct rps_dev_flow_table *flow_table;
+ unsigned int val = 0;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(queue->rps_flow_table);
+ if (flow_table)
+ val = flow_table->mask + 1;
+ rcu_read_unlock();
+
+ return sprintf(buf, "%u\n", val);
+}
+
+static void rps_dev_flow_table_release_work(struct work_struct *work)
+{
+ struct rps_dev_flow_table *table = container_of(work,
+ struct rps_dev_flow_table, free_work);
+
+ vfree(table);
+}
+
+static void rps_dev_flow_table_release(struct rcu_head *rcu)
+{
+ struct rps_dev_flow_table *table = container_of(rcu,
+ struct rps_dev_flow_table, rcu);
+
+ INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
+ schedule_work(&table->free_work);
+}
+
+ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int count;
+ char *endp;
+ struct rps_dev_flow_table *table, *old_table;
+ static DEFINE_SPINLOCK(rps_dev_flow_lock);
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ count = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EINVAL;
+
+ if (count) {
+ int i;
+
+ if (count > 1<<30) {
+ /* Enforce a limit to prevent overflow */
+ return -EINVAL;
+ }
+ count = roundup_pow_of_two(count);
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+ if (!table)
+ return -ENOMEM;
+
+ table->mask = count - 1;
+ for (i = 0; i < count; i++)
+ table->flows[i].cpu = RPS_NO_CPU;
+ } else
+ table = NULL;
+
+ spin_lock(&rps_dev_flow_lock);
+ old_table = queue->rps_flow_table;
+ rcu_assign_pointer(queue->rps_flow_table, table);
+ spin_unlock(&rps_dev_flow_lock);
+
+ if (old_table)
+ call_rcu(&old_table->rcu, rps_dev_flow_table_release);
+
+ return len;
+}
+
static struct rx_queue_attribute rps_cpus_attribute =
__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+
+static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute =
+ __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+ show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
+
static struct attribute *rx_queue_default_attrs[] = {
&rps_cpus_attribute.attr,
+ &rps_dev_flow_table_cnt_attribute.attr,
NULL
};
static void rx_queue_release(struct kobject *kobj)
{
struct netdev_rx_queue *queue = to_rx_queue(kobj);
- struct rps_map *map = queue->rps_map;
struct netdev_rx_queue *first = queue->first;
- if (map)
- call_rcu(&map->rcu, rps_map_release);
+ if (queue->rps_map)
+ call_rcu(&queue->rps_map->rcu, rps_map_release);
+
+ if (queue->rps_flow_table)
+ call_rcu(&queue->rps_flow_table->rcu,
+ rps_dev_flow_table_release);
if (atomic_dec_and_test(&first->count))
kfree(first);
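Note on the net-sysfs.c hunks above: writing a non-zero value to the new per-rx-queue rps_flow_cnt attribute allocates a flow table whose size is rounded up to a power of two, with table->mask set to count - 1 so packets can index it with rxhash & mask; writing 0 removes the table. Because the table is vmalloc'd, it cannot be vfree'd from the RCU callback (softirq context), hence the detour through a work item in rps_dev_flow_table_release(). The sketch below only models the sizing and indexing math; roundup_pow_of_two() is the kernel helper used above, and next_pow2() is a hand-rolled stand-in for it.

/*
 * Userspace sketch of how a written rps_flow_cnt value becomes a table size
 * and how packets index the resulting table.
 */
#include <stdio.h>

static unsigned int next_pow2(unsigned int v)
{
	unsigned int p = 1;

	while (p < v)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int requested = 300;			/* value written to rps_flow_cnt */
	unsigned int count = next_pow2(requested);	/* 512 */
	unsigned int mask = count - 1;			/* 511, as stored in table->mask */
	unsigned int rxhash = 0xdeadbeef;		/* hash computed in get_rps_cpu() */

	printf("count=%u mask=%#x slot=%u\n", count, mask, rxhash & mask);
	return 0;
}

Note that the read side (show_rps_dev_flow_table_cnt) reports mask + 1, i.e. the rounded-up count, not the value originally written.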
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b7b6b8208f75..dcc7d25996ab 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -11,12 +11,72 @@
#include <linux/socket.h>
#include <linux/netdevice.h>
#include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/sock.h>
+#ifdef CONFIG_RPS
+static int rps_sock_flow_sysctl(ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ unsigned int orig_size, size;
+ int ret, i;
+ ctl_table tmp = {
+ .data = &size,
+ .maxlen = sizeof(size),
+ .mode = table->mode
+ };
+ struct rps_sock_flow_table *orig_sock_table, *sock_table;
+ static DEFINE_MUTEX(sock_flow_mutex);
+
+ mutex_lock(&sock_flow_mutex);
+
+ orig_sock_table = rps_sock_flow_table;
+ size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
+
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+
+ if (write) {
+ if (size) {
+ if (size > 1<<30) {
+ /* Enforce limit to prevent overflow */
+ mutex_unlock(&sock_flow_mutex);
+ return -EINVAL;
+ }
+ size = roundup_pow_of_two(size);
+ if (size != orig_size) {
+ sock_table =
+ vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
+ if (!sock_table) {
+ mutex_unlock(&sock_flow_mutex);
+ return -ENOMEM;
+ }
+
+ sock_table->mask = size - 1;
+ } else
+ sock_table = orig_sock_table;
+
+ for (i = 0; i < size; i++)
+ sock_table->ents[i] = RPS_NO_CPU;
+ } else
+ sock_table = NULL;
+
+ if (sock_table != orig_sock_table) {
+ rcu_assign_pointer(rps_sock_flow_table, sock_table);
+ synchronize_rcu();
+ vfree(orig_sock_table);
+ }
+ }
+
+ mutex_unlock(&sock_flow_mutex);
+
+ return ret;
+}
+#endif /* CONFIG_RPS */
+
static struct ctl_table net_core_table[] = {
#ifdef CONFIG_NET
{
@@ -82,6 +142,14 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+#ifdef CONFIG_RPS
+ {
+ .procname = "rps_sock_flow_entries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rps_sock_flow_sysctl
+ },
+#endif
#endif /* CONFIG_NET */
{
.procname = "netdev_budget",
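Note on the sysctl_net_core.c hunks above: the new net.core.rps_sock_flow_entries sysctl sizes the single global rps_sock_flow_table with the same power-of-two rounding and 1<<30 ceiling as the sysfs path, but since the handler runs in process context it can free the old table with synchronize_rcu() plus a direct vfree() instead of call_rcu(). Rewriting the current size still resets every entry to RPS_NO_CPU. The ceiling exists, per the in-code comment, to prevent the size computation from overflowing; the snippet below illustrates the wrap on 32-bit arithmetic, assuming 2-byte table entries (the real entries hold u16 CPU ids and are defined outside this diff).

/*
 * Illustration of the overflow the "count > 1<<30" checks guard against.
 * The 2-byte entry size and 8-byte header are assumptions for the demo.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t ok  = 1u << 30;	/* largest count the checks allow */
	uint32_t bad = 1u << 31;	/* what rounding could reach without the check */
	uint32_t hdr = 8;		/* stand-in for the struct header */

	printf("%u entries -> %u bytes\n", ok,  hdr + ok  * 2);	/* still representable */
	printf("%u entries -> %u bytes\n", bad, hdr + bad * 2);	/* wraps to 8 */
	return 0;
}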
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 193dcd6ed64f..c5376c725503 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -419,6 +419,8 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
+ inet_rps_reset_flow(sk);
+
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
@@ -720,6 +722,8 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
{
struct sock *sk = sock->sk;
+ inet_rps_record_flow(sk);
+
/* We may need to bind the socket. */
if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
@@ -728,12 +732,13 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
}
EXPORT_SYMBOL(inet_sendmsg);
-
static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
size_t size, int flags)
{
struct sock *sk = sock->sk;
+ inet_rps_record_flow(sk);
+
/* We may need to bind the socket. */
if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
@@ -743,6 +748,22 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
return sock_no_sendpage(sock, page, offset, size, flags);
}
+int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ inet_rps_record_flow(sk);
+
+ err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
int inet_shutdown(struct socket *sock, int how)
{
@@ -872,7 +893,7 @@ const struct proto_ops inet_stream_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = tcp_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage,
.splice_read = tcp_splice_read,
@@ -899,7 +920,7 @@ const struct proto_ops inet_dgram_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
@@ -929,7 +950,7 @@ static const struct proto_ops inet_sockraw_ops = {
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
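Note on the af_inet.c hunks above: these hooks wire the socket layer into the global table. inet_sendmsg(), inet_sendpage() and the new inet_recvmsg() record, on each call, the CPU the application is running on, keyed by the rxhash the receive path saved on the socket, and inet_release() drops the entry when the socket goes away; swapping sock_common_recvmsg for inet_recvmsg in the three proto_ops exists purely to get that recording hook onto the receive side. The helpers themselves (inet_rps_record_flow(), inet_rps_reset_flow()) are defined in headers outside this 'net'-limited diff; in the model below only the ents[rxhash & mask] indexing is taken from get_rps_cpu() above, the rest is a simplified reconstruction rather than the kernel code.

/*
 * Userspace model of the sock-flow bookkeeping the af_inet.c hooks rely on.
 */
#include <stdio.h>

#define RPS_NO_CPU	0xffff
#define TABLE_ENTRIES	256		/* e.g. net.core.rps_sock_flow_entries */

static unsigned short ents[TABLE_ENTRIES];	/* models rps_sock_flow_table->ents */
static const unsigned int mask = TABLE_ENTRIES - 1;

/* sendmsg/recvmsg path: remember which CPU the application runs on. */
static void record_flow(unsigned int rxhash, unsigned short cur_cpu)
{
	if (rxhash)
		ents[rxhash & mask] = cur_cpu;
}

/* inet_release() path: forget the mapping when the socket is closed. */
static void reset_flow(unsigned int rxhash)
{
	if (rxhash)
		ents[rxhash & mask] = RPS_NO_CPU;
}

int main(void)
{
	unsigned int rxhash = 0x9e3779b9;	/* hash saved by inet_rps_save_rxhash() */
	unsigned int i;

	for (i = 0; i < TABLE_ENTRIES; i++)
		ents[i] = RPS_NO_CPU;

	record_flow(rxhash, 2);			/* application called recvmsg on CPU 2 */
	printf("desired CPU: %u\n", ents[rxhash & mask]);

	reset_flow(rxhash);			/* socket closed */
	printf("after close: %#x\n", ents[rxhash & mask]);
	return 0;
}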
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a24995cdc4b6..ad08392a738c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1672,6 +1672,8 @@ process:
skb->dev = NULL;
+ inet_rps_save_rxhash(sk, skb->rxhash);
+
bh_lock_sock_nested(sk);
ret = 0;
if (!sock_owned_by_user(sk)) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8fef859db35d..666b963496ff 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1217,6 +1217,7 @@ int udp_disconnect(struct sock *sk, int flags)
sk->sk_state = TCP_CLOSE;
inet->inet_daddr = 0;
inet->inet_dport = 0;
+ inet_rps_save_rxhash(sk, 0);
sk->sk_bound_dev_if = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
@@ -1258,8 +1259,12 @@ EXPORT_SYMBOL(udp_lib_unhash);
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- int rc = sock_queue_rcv_skb(sk, skb);
+ int rc;
+
+ if (inet_sk(sk)->inet_daddr)
+ inet_rps_save_rxhash(sk, skb->rxhash);
+ rc = sock_queue_rcv_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
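Note on the tcp_ipv4.c and udp.c hunks above: these feed the hash into the machinery. TCP saves skb->rxhash for every segment delivered to an established socket, while UDP saves it only for connected sockets and clears it again in udp_disconnect(), presumably because an unconnected socket receives from many peers and has no single flow worth steering. The model below reduces inet_rps_save_rxhash() to a plain assignment (the real helper lives outside this diff) to show that asymmetry.

/*
 * Sketch of the rxhash bookkeeping implied by the tcp_ipv4.c and udp.c hunks
 * above; a simplified stand-in, not the kernel code.
 */
#include <stdio.h>

struct inet_sock_model {
	unsigned int rxhash;	/* last rx hash seen for this socket */
	unsigned int daddr;	/* non-zero once the socket is connected */
};

/* Receive path: remember the hash the RPS code computed for this socket. */
static void save_rxhash(struct inet_sock_model *sk, unsigned int rxhash)
{
	sk->rxhash = rxhash;	/* 0 (from udp_disconnect) clears it */
}

static void udp_rcv_model(struct inet_sock_model *sk, unsigned int rxhash)
{
	/* Mirrors __udp_queue_rcv_skb(): only connected sockets are tracked. */
	if (sk->daddr)
		save_rxhash(sk, rxhash);
}

int main(void)
{
	struct inet_sock_model sk = { .rxhash = 0, .daddr = 0 };

	udp_rcv_model(&sk, 0xabcd1234);
	printf("unconnected: rxhash=%#x\n", sk.rxhash);	/* stays 0 */

	sk.daddr = 0x0a000001;				/* socket now connected */
	udp_rcv_model(&sk, 0xabcd1234);
	printf("connected:   rxhash=%#x\n", sk.rxhash);	/* recorded */
	return 0;
}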