From 1059590254fa9dce9cafc4f07d1103dbec415e76 Mon Sep 17 00:00:00 2001 From: Pankaj Gupta Date: Mon, 12 Jan 2015 11:41:28 +0530 Subject: net: allow large number of rx queues netif_alloc_rx_queues() uses kcalloc() to allocate memory for "struct netdev_queue *_rx" array. If we are doing large rx queue allocation kcalloc() might fail, so this patch does a fallback to vzalloc(). Similar implementation is done for tx queue allocation in netif_alloc_netdev_queues(). We avoid failure of high order memory allocation with the help of vzalloc(), this allows us to do large rx and tx queue allocation which in turn helps us to increase the number of queues in tun. As vmalloc() adds overhead on a critical network path, __GFP_REPEAT flag is used with kzalloc() to do this fallback only when really needed. Signed-off-by: Pankaj Gupta Reviewed-by: Michael S. Tsirkin Reviewed-by: David Gibson Signed-off-by: David S. Miller --- net/core/dev.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 683d493aa1bf..805456147c30 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6172,13 +6172,16 @@ static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; + size_t sz = count * sizeof(*rx); BUG_ON(count < 1); - rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) - return -ENOMEM; - + rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!rx) { + rx = vzalloc(sz); + if (!rx) + return -ENOMEM; + } dev->_rx = rx; for (i = 0; i < count; i++) @@ -6808,7 +6811,7 @@ void free_netdev(struct net_device *dev) netif_free_tx_queues(dev); #ifdef CONFIG_SYSFS - kfree(dev->_rx); + kvfree(dev->_rx); #endif kfree(rcu_dereference_protected(dev->ingress_queue, 1)); -- cgit v1.2.3 From df8a39defad46b83694ea6dd868d332976d62cc0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 13 Jan 2015 17:13:44 +0100 Subject: net: rename vlan_tx_* helpers since "tx" is misleading there The same macros are used for rx as well. So rename it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 805456147c30..1e325adc4367 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2578,7 +2578,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (skb->encapsulation) features &= dev->hw_enc_features; - if (!vlan_tx_tag_present(skb)) { + if (!skb_vlan_tag_present(skb)) { if (unlikely(protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; @@ -2659,7 +2659,7 @@ out: static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { - if (vlan_tx_tag_present(skb) && + if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; @@ -3676,7 +3676,7 @@ ncls: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; @@ -3708,8 +3708,8 @@ ncls: } } - if (unlikely(vlan_tx_tag_present(skb))) { - if (vlan_tx_tag_get_id(skb)) + if (unlikely(skb_vlan_tag_present(skb))) { + if (skb_vlan_tag_get_id(skb)) skb->pkt_type = PACKET_OTHERHOST; /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() -- cgit v1.2.3 From 7866a621043fbaca3d7389e9b9f69dd1a2e5a855 Mon Sep 17 00:00:00 2001 From: Salam Noureddine Date: Tue, 27 Jan 2015 11:35:48 -0800 Subject: dev: add per net_device packet type chains When many pf_packet listeners are created on a lot of interfaces the current implementation using global packet type lists scales poorly. This patch adds per net_device packet type lists to fix this problem. The patch was originally written by Eric Biederman for linux-2.6.29. Tested on linux-3.16. Signed-off-by: "Eric W. Biederman" Signed-off-by: Salam Noureddine Signed-off-by: David S. Miller --- net/core/dev.c | 132 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 84 insertions(+), 48 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7f028d441e98..1d564d68e31a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,9 +371,10 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) - return &ptype_all; + return pt->dev ? &pt->dev->ptype_all : &ptype_all; else - return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + return pt->dev ? &pt->dev->ptype_specific : + &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** @@ -1734,6 +1735,23 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline void deliver_ptype_list_skb(struct sk_buff *skb, + struct packet_type **pt, + struct net_device *dev, __be16 type, + struct list_head *ptype_list) +{ + struct packet_type *ptype, *pt_prev = *pt; + + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->type != type) + continue; + if (pt_prev) + deliver_skb(skb, pt_prev, dev); + pt_prev = ptype; + } + *pt = pt_prev; +} + static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) @@ -1757,45 +1775,54 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) struct packet_type *ptype; struct sk_buff *skb2 = NULL; struct packet_type *pt_prev = NULL; + struct list_head *ptype_list = &ptype_all; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { +again: + list_for_each_entry_rcu(ptype, ptype_list, list) { /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ - if ((ptype->dev == dev || !ptype->dev) && - (!skb_loop_sk(ptype, skb))) { - if (pt_prev) { - deliver_skb(skb2, pt_prev, skb->dev); - pt_prev = ptype; - continue; - } + if (skb_loop_sk(ptype, skb)) + continue; - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - break; + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } - net_timestamp_set(skb2); + /* need to clone skb, done only once */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out_unlock; - /* skb->nh should be correctly - set by sender, so that the second statement is - just protection against buggy protocols. - */ - skb_reset_mac_header(skb2); - - if (skb_network_header(skb2) < skb2->data || - skb_network_header(skb2) > skb_tail_pointer(skb2)) { - net_crit_ratelimited("protocol %04x is buggy, dev %s\n", - ntohs(skb2->protocol), - dev->name); - skb_reset_network_header(skb2); - } + net_timestamp_set(skb2); - skb2->transport_header = skb2->network_header; - skb2->pkt_type = PACKET_OUTGOING; - pt_prev = ptype; + /* skb->nh should be correctly + * set by sender, so that the second statement is + * just protection against buggy protocols. + */ + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb_network_header(skb2) > skb_tail_pointer(skb2)) { + net_crit_ratelimited("protocol %04x is buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); } + + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + pt_prev = ptype; + } + + if (ptype_list == &ptype_all) { + ptype_list = &dev->ptype_all; + goto again; } +out_unlock: if (pt_prev) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); @@ -2617,7 +2644,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (!list_empty(&ptype_all)) + if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -3615,7 +3642,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct net_device *orig_dev; - struct net_device *null_or_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; @@ -3658,11 +3684,15 @@ another_round: goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + + list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; } skip_taps: @@ -3718,19 +3748,21 @@ ncls: skb->vlan_tci = 0; } + type = skb->protocol; + /* deliver only exact match when indicated */ - null_or_dev = deliver_exact ? skb->dev : NULL; + if (likely(!deliver_exact)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &ptype_base[ntohs(type) & + PTYPE_HASH_MASK]); + } - type = skb->protocol; - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_dev || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &orig_dev->ptype_specific); + + if (unlikely(skb->dev != orig_dev)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &skb->dev->ptype_specific); } if (pt_prev) { @@ -6579,6 +6611,8 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(!list_empty(&dev->ptype_all)); + BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); @@ -6761,6 +6795,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); + INIT_LIST_HEAD(&dev->ptype_all); + INIT_LIST_HEAD(&dev->ptype_specific); dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); -- cgit v1.2.3 From 61bd3857ff2c7daf756d49b41e6277bbdaa8f789 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:29 +0200 Subject: net/core: Add event for a change in slave state Add event which provides an indication on a change in the state of a bonding slave. The event handler should cast the pointer to the appropriate type (struct netdev_bonding_info) in order to get the full info about the slave. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- net/core/dev.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 1d564d68e31a..ede0b161b115 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5355,6 +5355,26 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); +/** + * netdev_bonding_info_change - Dispatch event about slave change + * @dev: device + * @netdev_bonding_info: info to dispatch + * + * Send NETDEV_BONDING_INFO to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info) +{ + struct netdev_notifier_bonding_info info; + + memcpy(&info.bonding_info, bonding_info, + sizeof(struct netdev_bonding_info)); + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + &info.info); +} +EXPORT_SYMBOL(netdev_bonding_info_change); + void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; -- cgit v1.2.3 From 91e83133e70ebe1572746d1ad858b4eb28ab9b53 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Feb 2015 14:58:14 -0800 Subject: net: use netif_rx_ni() from process context Hotpluging a cpu might be rare, yet we have to use proper handlers when taking over packets found in backlog queues. dev_cpu_callback() runs from process context, thus we should call netif_rx_ni() to properly invoke softirq handler. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7fe82929f509..6c1556aeec29 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7064,11 +7064,11 @@ static int dev_cpu_callback(struct notifier_block *nfb, /* Process offline CPU's input_pkt_queue */ while ((skb = __skb_dequeue(&oldsd->process_queue))) { - netif_rx_internal(skb); + netif_rx_ni(skb); input_queue_head_incr(oldsd); } while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { - netif_rx_internal(skb); + netif_rx_ni(skb); input_queue_head_incr(oldsd); } -- cgit v1.2.3 From 567e4b79731c352a17d73c483959f795d3593e03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Feb 2015 12:59:01 -0800 Subject: net: rfs: add hash collision detection Receive Flow Steering is a nice solution but suffers from hash collisions when a mix of connected and unconnected traffic is received on the host, when flow hash table is populated. Also, clearing flow in inet_release() makes RFS not very good for short lived flows, as many packets can follow close(). (FIN , ACK packets, ...) This patch extends the information stored into global hash table to not only include cpu number, but upper part of the hash value. I use a 32bit value, and dynamically split it in two parts. For host with less than 64 possible cpus, this gives 6 bits for the cpu number, and 26 (32-6) bits for the upper part of the hash. Since hash bucket selection use low order bits of the hash, we have a full hash match, if /proc/sys/net/core/rps_sock_flow_entries is big enough. If the hash found in flow table does not match, we fallback to RPS (if it is enabled for the rxqueue). This means that a packet for an non connected flow can avoid the IPI through a unrelated/victim CPU. This also means we no longer have to clear the table at socket close time, and this helps short lived flows performance. Signed-off-by: Eric Dumazet Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/dev.c | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a3a96ffc67f4..8be38675e1a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd, /* One global table that all flow-based protocols share. */ struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +u32 rps_cpu_mask __read_mostly; +EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; @@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { - struct netdev_rx_queue *rxqueue; - struct rps_map *map; + const struct rps_sock_flow_table *sock_flow_table; + struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; + struct rps_map *map; int cpu = -1; - u16 tcpu; + u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " @@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, dev->name, index, dev->real_num_rx_queues); goto done; } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; + rxqueue += index; + } + /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ + + flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); - if (map) { - if (map->len == 1 && - !rcu_access_pointer(rxqueue->rps_flow_table)) { - tcpu = map->cpus[0]; - if (cpu_online(tcpu)) - cpu = tcpu; - goto done; - } - } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { + if (!flow_table && !map) goto done; - } skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; - flow_table = rcu_dereference(rxqueue->rps_flow_table); sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) { - u16 next_cpu; struct rps_dev_flow *rflow; + u32 next_cpu; + u32 ident; + + /* First check into global flow table if there is a match */ + ident = sock_flow_table->ents[hash & sock_flow_table->mask]; + if ((ident ^ hash) & ~rps_cpu_mask) + goto try_rps; + next_cpu = ident & rps_cpu_mask; + + /* OK, now we know there is a match, + * we can look at the local (per receive queue) flow table + */ rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; - next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask]; - /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow @@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } } +try_rps: + if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { -- cgit v1.2.3