From 0a9627f2649a02bea165cfd529d7bcb625c2fcad Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 16 Mar 2010 08:03:29 +0000 Subject: rps: Receive Packet Steering This patch implements software receive side packet steering (RPS). RPS distributes the load of received packet processing across multiple CPUs. Problem statement: Protocol processing done in the NAPI context for received packets is serialized per device queue and becomes a bottleneck under high packet load. This substantially limits pps that can be achieved on a single queue NIC and provides no scaling with multiple cores. This solution queues packets early on in the receive path on the backlog queues of other CPUs. This allows protocol processing (e.g. IP and TCP) to be performed on packets in parallel. For each device (or each receive queue in a multi-queue device) a mask of CPUs is set to indicate the CPUs that can process packets. A CPU is selected on a per packet basis by hashing contents of the packet header (e.g. the TCP or UDP 4-tuple) and using the result to index into the CPU mask. The IPI mechanism is used to raise networking receive softirqs between CPUs. This effectively emulates in software what a multi-queue NIC can provide, but is generic requiring no device support. Many devices now provide a hash over the 4-tuple on a per packet basis (e.g. the Toeplitz hash). This patch allow drivers to set the HW reported hash in an skb field, and that value in turn is used to index into the RPS maps. Using the HW generated hash can avoid cache misses on the packet when steering it to a remote CPU. The CPU mask is set on a per device and per queue basis in the sysfs variable /sys/class/net//queues/rx-/rps_cpus. This is a set of canonical bit maps for receive queues in the device (numbered by ). If a device does not support multi-queue, a single variable is used for the device (rx-0). Generally, we have found this technique increases pps capabilities of a single queue device with good CPU utilization. Optimal settings for the CPU mask seem to depend on architectures and cache hierarcy. Below are some results running 500 instances of netperf TCP_RR test with 1 byte req. and resp. Results show cumulative transaction rate and system CPU utilization. e1000e on 8 core Intel Without RPS: 108K tps at 33% CPU With RPS: 311K tps at 64% CPU forcedeth on 16 core AMD Without RPS: 156K tps at 15% CPU With RPS: 404K tps at 49% CPU bnx2x on 16 core AMD Without RPS 567K tps at 61% CPU (4 HW RX queues) Without RPS 738K tps at 96% CPU (8 HW RX queues) With RPS: 854K tps at 76% CPU (4 HW RX queues) Caveats: - The benefits of this patch are dependent on architecture and cache hierarchy. Tuning the masks to get best performance is probably necessary. - This patch adds overhead in the path for processing a single packet. In a lightly loaded server this overhead may eliminate the advantages of increased parallelism, and possibly cause some relative performance degradation. We have found that masks that are cache aware (share same caches with the interrupting CPU) mitigate much of this. - The RPS masks can be changed dynamically, however whenever the mask is changed this introduces the possibility of generating out of order packets. It's probably best not change the masks too frequently. Signed-off-by: Tom Herbert include/linux/netdevice.h | 32 ++++- include/linux/skbuff.h | 3 + net/core/dev.c | 335 +++++++++++++++++++++++++++++++++++++-------- net/core/net-sysfs.c | 225 ++++++++++++++++++++++++++++++- net/core/skbuff.c | 2 + 5 files changed, 538 insertions(+), 59 deletions(-) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 225 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 224 insertions(+), 1 deletion(-) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 099c753c4213..7a46343d5ae3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -466,6 +466,216 @@ static struct attribute_group wireless_group = { }; #endif +/* + * RX queue sysfs structures and functions. + */ +struct rx_queue_attribute { + struct attribute attr; + ssize_t (*show)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, char *buf); + ssize_t (*store)(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, const char *buf, size_t len); +}; +#define to_rx_queue_attr(_attr) container_of(_attr, \ + struct rx_queue_attribute, attr) + +#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) + +static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->show) + return -EIO; + + return attribute->show(queue, attribute, buf); +} + +static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); + struct netdev_rx_queue *queue = to_rx_queue(kobj); + + if (!attribute->store) + return -EIO; + + return attribute->store(queue, attribute, buf, count); +} + +static struct sysfs_ops rx_queue_sysfs_ops = { + .show = rx_queue_attr_show, + .store = rx_queue_attr_store, +}; + +static ssize_t show_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, char *buf) +{ + struct rps_map *map; + cpumask_var_t mask; + size_t len = 0; + int i; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + rcu_read_lock(); + map = rcu_dereference(queue->rps_map); + if (map) + for (i = 0; i < map->len; i++) + cpumask_set_cpu(map->cpus[i], mask); + + len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask); + if (PAGE_SIZE - len < 3) { + rcu_read_unlock(); + free_cpumask_var(mask); + return -EINVAL; + } + rcu_read_unlock(); + + free_cpumask_var(mask); + len += sprintf(buf + len, "\n"); + return len; +} + +static void rps_map_release(struct rcu_head *rcu) +{ + struct rps_map *map = container_of(rcu, struct rps_map, rcu); + + kfree(map); +} + +ssize_t store_rps_map(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attribute, + const char *buf, size_t len) +{ + struct rps_map *old_map, *map; + cpumask_var_t mask; + int err, cpu, i; + static DEFINE_SPINLOCK(rps_map_lock); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); + if (err) { + free_cpumask_var(mask); + return err; + } + + map = kzalloc(max_t(unsigned, + RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), + GFP_KERNEL); + if (!map) { + free_cpumask_var(mask); + return -ENOMEM; + } + + i = 0; + for_each_cpu_and(cpu, mask, cpu_online_mask) + map->cpus[i++] = cpu; + + if (i) + map->len = i; + else { + kfree(map); + map = NULL; + } + + spin_lock(&rps_map_lock); + old_map = queue->rps_map; + rcu_assign_pointer(queue->rps_map, map); + spin_unlock(&rps_map_lock); + + if (old_map) + call_rcu(&old_map->rcu, rps_map_release); + + free_cpumask_var(mask); + return len; +} + +static struct rx_queue_attribute rps_cpus_attribute = + __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); + +static struct attribute *rx_queue_default_attrs[] = { + &rps_cpus_attribute.attr, + NULL +}; + +static void rx_queue_release(struct kobject *kobj) +{ + struct netdev_rx_queue *queue = to_rx_queue(kobj); + struct rps_map *map = queue->rps_map; + struct netdev_rx_queue *first = queue->first; + + if (map) + call_rcu(&map->rcu, rps_map_release); + + if (atomic_dec_and_test(&first->count)) + kfree(first); +} + +static struct kobj_type rx_queue_ktype = { + .sysfs_ops = &rx_queue_sysfs_ops, + .release = rx_queue_release, + .default_attrs = rx_queue_default_attrs, +}; + +static int rx_queue_add_kobject(struct net_device *net, int index) +{ + struct netdev_rx_queue *queue = net->_rx + index; + struct kobject *kobj = &queue->kobj; + int error = 0; + + kobj->kset = net->queues_kset; + error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, + "rx-%u", index); + if (error) { + kobject_put(kobj); + return error; + } + + kobject_uevent(kobj, KOBJ_ADD); + + return error; +} + +static int rx_queue_register_kobjects(struct net_device *net) +{ + int i; + int error = 0; + + net->queues_kset = kset_create_and_add("queues", + NULL, &net->dev.kobj); + if (!net->queues_kset) + return -ENOMEM; + for (i = 0; i < net->num_rx_queues; i++) { + error = rx_queue_add_kobject(net, i); + if (error) + break; + } + + if (error) + while (--i >= 0) + kobject_put(&net->_rx[i].kobj); + + return error; +} + +static void rx_queue_remove_kobjects(struct net_device *net) +{ + int i; + + for (i = 0; i < net->num_rx_queues; i++) + kobject_put(&net->_rx[i].kobj); + kset_unregister(net->queues_kset); +} + #endif /* CONFIG_SYSFS */ #ifdef CONFIG_HOTPLUG @@ -529,6 +739,8 @@ void netdev_unregister_kobject(struct net_device * net) if (!net_eq(dev_net(net), &init_net)) return; + rx_queue_remove_kobjects(net); + device_del(dev); } @@ -537,6 +749,7 @@ int netdev_register_kobject(struct net_device *net) { struct device *dev = &(net->dev); const struct attribute_group **groups = net->sysfs_groups; + int error = 0; dev->class = &net_class; dev->platform_data = net; @@ -563,7 +776,17 @@ int netdev_register_kobject(struct net_device *net) if (!net_eq(dev_net(net), &init_net)) return 0; - return device_add(dev); + error = device_add(dev); + if (error) + return error; + + error = rx_queue_register_kobjects(net); + if (error) { + device_del(dev); + return error; + } + + return error; } int netdev_class_create_file(struct class_attribute *class_attr) -- cgit v1.2.3 From e880eb6c5c9d98e389ffc0d8947f75d70785361a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 22 Mar 2010 18:06:47 -0700 Subject: rps: Fix build with CONFIG_SYSFS enabled Fix build with CONFIG_SYSFS not enabled. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7a46343d5ae3..f6b6bfee72ae 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -739,7 +739,9 @@ void netdev_unregister_kobject(struct net_device * net) if (!net_eq(dev_net(net), &init_net)) return; +#ifdef CONFIG_SYSFS rx_queue_remove_kobjects(net); +#endif device_del(dev); } @@ -780,11 +782,13 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; +#ifdef CONFIG_SYSFS error = rx_queue_register_kobjects(net); if (error) { device_del(dev); return error; } +#endif return error; } -- cgit v1.2.3 From 30bde1f5076a9b6bd4b6a168523930ce242c7449 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 29 Mar 2010 01:00:44 -0700 Subject: rps: fix net-sysfs build for !CONFIG_RPS Signed-off-by: Stephen Rothwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f6b6bfee72ae..1e7fdd6029a2 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -466,6 +466,7 @@ static struct attribute_group wireless_group = { }; #endif +#ifdef CONFIG_RPS /* * RX queue sysfs structures and functions. */ @@ -675,7 +676,7 @@ static void rx_queue_remove_kobjects(struct net_device *net) kobject_put(&net->_rx[i].kobj); kset_unregister(net->queues_kset); } - +#endif /* CONFIG_RPS */ #endif /* CONFIG_SYSFS */ #ifdef CONFIG_HOTPLUG @@ -739,7 +740,7 @@ void netdev_unregister_kobject(struct net_device * net) if (!net_eq(dev_net(net), &init_net)) return; -#ifdef CONFIG_SYSFS +#ifdef CONFIG_RPS rx_queue_remove_kobjects(net); #endif @@ -782,7 +783,7 @@ int netdev_register_kobject(struct net_device *net) if (error) return error; -#ifdef CONFIG_SYSFS +#ifdef CONFIG_RPS error = rx_queue_register_kobjects(net); if (error) { device_del(dev); -- cgit v1.2.3 From fec5e652e58fa6017b2c9e06466cb2a6538de5b4 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 16 Apr 2010 16:01:27 -0700 Subject: rfs: Receive Flow Steering This patch implements receive flow steering (RFS). RFS steers received packets for layer 3 and 4 processing to the CPU where the application for the corresponding flow is running. RFS is an extension of Receive Packet Steering (RPS). The basic idea of RFS is that when an application calls recvmsg (or sendmsg) the application's running CPU is stored in a hash table that is indexed by the connection's rxhash which is stored in the socket structure. The rxhash is passed in skb's received on the connection from netif_receive_skb. For each received packet, the associated rxhash is used to look up the CPU in the hash table, if a valid CPU is set then the packet is steered to that CPU using the RPS mechanisms. The convolution of the simple approach is that it would potentially allow OOO packets. If threads are thrashing around CPUs or multiple threads are trying to read from the same sockets, a quickly changing CPU value in the hash table could cause rampant OOO packets-- we consider this a non-starter. To avoid OOO packets, this solution implements two types of hash tables: rps_sock_flow_table and rps_dev_flow_table. rps_sock_table is a global hash table. Each entry is just a CPU number and it is populated in recvmsg and sendmsg as described above. This table contains the "desired" CPUs for flows. rps_dev_flow_table is specific to each device queue. Each entry contains a CPU and a tail queue counter. The CPU is the "current" CPU for a matching flow. The tail queue counter holds the value of a tail queue counter for the associated CPU's backlog queue at the time of last enqueue for a flow matching the entry. Each backlog queue has a queue head counter which is incremented on dequeue, and so a queue tail counter is computed as queue head count + queue length. When a packet is enqueued on a backlog queue, the current value of the queue tail counter is saved in the hash entry of the rps_dev_flow_table. And now the trick: when selecting the CPU for RPS (get_rps_cpu) the rps_sock_flow table and the rps_dev_flow table for the RX queue are consulted. When the desired CPU for the flow (found in the rps_sock_flow table) does not match the current CPU (found in the rps_dev_flow table), the current CPU is changed to the desired CPU if one of the following is true: - The current CPU is unset (equal to RPS_NO_CPU) - Current CPU is offline - The current CPU's queue head counter >= queue tail counter in the rps_dev_flow table. This checks if the queue tail has advanced beyond the last packet that was enqueued using this table entry. This guarantees that all packets queued using this entry have been dequeued, thus preserving in order delivery. Making each queue have its own rps_dev_flow table has two advantages: 1) the tail queue counters will be written on each receive, so keeping the table local to interrupting CPU s good for locality. 2) this allows lockless access to the table-- the CPU number and queue tail counter need to be accessed together under mutual exclusion from netif_receive_skb, we assume that this is only called from device napi_poll which is non-reentrant. This patch implements RFS for TCP and connected UDP sockets. It should be usable for other flow oriented protocols. There are two configuration parameters for RFS. The "rps_flow_entries" kernel init parameter sets the number of entries in the rps_sock_flow_table, the per rxqueue sysfs entry "rps_flow_cnt" contains the number of entries in the rps_dev_flow table for the rxqueue. Both are rounded to power of two. The obvious benefit of RFS (over just RPS) is that it achieves CPU locality between the receive processing for a flow and the applications processing; this can result in increased performance (higher pps, lower latency). The benefits of RFS are dependent on cache hierarchy, application load, and other factors. On simple benchmarks, we don't necessarily see improvement and sometimes see degradation. However, for more complex benchmarks and for applications where cache pressure is much higher this technique seems to perform very well. Below are some benchmark results which show the potential benfit of this patch. The netperf test has 500 instances of netperf TCP_RR test with 1 byte req. and resp. The RPC test is an request/response test similar in structure to netperf RR test ith 100 threads on each host, but does more work in userspace that netperf. e1000e on 8 core Intel No RFS or RPS 104K tps at 30% CPU No RFS (best RPS config): 290K tps at 63% CPU RFS 303K tps at 61% CPU RPC test tps CPU% 50/90/99% usec latency Latency StdDev No RFS/RPS 103K 48% 757/900/3185 4472.35 RPS only: 174K 73% 415/993/2468 491.66 RFS 223K 73% 379/651/1382 315.61 Signed-off-by: Tom Herbert Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 3 deletions(-) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 96ed6905b823..143052a22b9b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "net-sysfs.h" @@ -601,22 +602,109 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue, return len; } +static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + char *buf) +{ + struct rps_dev_flow_table *flow_table; + unsigned int val = 0; + + rcu_read_lock(); + flow_table = rcu_dereference(queue->rps_flow_table); + if (flow_table) + val = flow_table->mask + 1; + rcu_read_unlock(); + + return sprintf(buf, "%u\n", val); +} + +static void rps_dev_flow_table_release_work(struct work_struct *work) +{ + struct rps_dev_flow_table *table = container_of(work, + struct rps_dev_flow_table, free_work); + + vfree(table); +} + +static void rps_dev_flow_table_release(struct rcu_head *rcu) +{ + struct rps_dev_flow_table *table = container_of(rcu, + struct rps_dev_flow_table, rcu); + + INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); + schedule_work(&table->free_work); +} + +ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, + struct rx_queue_attribute *attr, + const char *buf, size_t len) +{ + unsigned int count; + char *endp; + struct rps_dev_flow_table *table, *old_table; + static DEFINE_SPINLOCK(rps_dev_flow_lock); + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + count = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EINVAL; + + if (count) { + int i; + + if (count > 1<<30) { + /* Enforce a limit to prevent overflow */ + return -EINVAL; + } + count = roundup_pow_of_two(count); + table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count)); + if (!table) + return -ENOMEM; + + table->mask = count - 1; + for (i = 0; i < count; i++) + table->flows[i].cpu = RPS_NO_CPU; + } else + table = NULL; + + spin_lock(&rps_dev_flow_lock); + old_table = queue->rps_flow_table; + rcu_assign_pointer(queue->rps_flow_table, table); + spin_unlock(&rps_dev_flow_lock); + + if (old_table) + call_rcu(&old_table->rcu, rps_dev_flow_table_release); + + return len; +} + static struct rx_queue_attribute rps_cpus_attribute = __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map); + +static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute = + __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR, + show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); + static struct attribute *rx_queue_default_attrs[] = { &rps_cpus_attribute.attr, + &rps_dev_flow_table_cnt_attribute.attr, NULL }; static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); - struct rps_map *map = queue->rps_map; struct netdev_rx_queue *first = queue->first; - if (map) - call_rcu(&map->rcu, rps_map_release); + if (queue->rps_map) + call_rcu(&queue->rps_map->rcu, rps_map_release); + + if (queue->rps_flow_table) + call_rcu(&queue->rps_flow_table->rcu, + rps_dev_flow_table_release); if (atomic_dec_and_test(&first->count)) kfree(first); -- cgit v1.2.3 From f5acb907dc24c3822f408211bad1cd6e5d0433cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Apr 2010 14:40:57 -0700 Subject: rps: static functions store_rps_map() & store_rps_dev_flow_table_cnt() are static. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/net-sysfs.c') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 143052a22b9b..c57c4b228bb5 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -550,7 +550,7 @@ static void rps_map_release(struct rcu_head *rcu) kfree(map); } -ssize_t store_rps_map(struct netdev_rx_queue *queue, +static ssize_t store_rps_map(struct netdev_rx_queue *queue, struct rx_queue_attribute *attribute, const char *buf, size_t len) { @@ -635,7 +635,7 @@ static void rps_dev_flow_table_release(struct rcu_head *rcu) schedule_work(&table->free_work); } -ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, +static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, struct rx_queue_attribute *attr, const char *buf, size_t len) { -- cgit v1.2.3