summaryrefslogtreecommitdiff
path: root/net/core/dev.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/dev.c')
-rw-r--r--net/core/dev.c1720
1 files changed, 564 insertions, 1156 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index f64e439b4a00..17bc535115d3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -97,8 +97,6 @@
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
@@ -106,12 +104,10 @@
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
-#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
-#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
@@ -132,9 +128,7 @@
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
-#include <linux/net_tstamp.h>
#include <linux/static_key.h>
-#include <net/flow_keys.h>
#include "net-sysfs.h"
@@ -144,41 +138,10 @@
/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
-/*
- * The list of packet types we will receive (as opposed to discard)
- * and the routines to invoke.
- *
- * Why 16. Because with 16 the only overlap we get on a hash of the
- * low nibble of the protocol value is RARP/SNAP/X.25.
- *
- * NOTE: That is no longer true with the addition of VLAN tags. Not
- * sure which should go first, but I bet it won't make much
- * difference if we are running VLANs. The good news is that
- * this protocol won't be in the list unless compiled in, so
- * the average user (w/out VLANs) will not be adversely affected.
- * --BLG
- *
- * 0800 IP
- * 8100 802.1Q VLAN
- * 0001 802.3
- * 0002 AX.25
- * 0004 802.2
- * 8035 RARP
- * 0005 SNAP
- * 0805 X.25
- * 0806 ARP
- * 8137 IPX
- * 0009 Localtalk
- * 86DD IPv6
- */
-
-#define PTYPE_HASH_SIZE (16)
-#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
-
static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
-static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-static struct list_head ptype_all __read_mostly; /* Taps */
+struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
+struct list_head ptype_all __read_mostly; /* Taps */
static struct list_head offload_base __read_mostly;
/*
@@ -1227,36 +1190,6 @@ void netdev_notify_peers(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_notify_peers);
-/**
- * dev_load - load a network module
- * @net: the applicable net namespace
- * @name: name of interface
- *
- * If a network interface is not present and the process has suitable
- * privileges this function loads the module. If module loading is not
- * available in this kernel then it becomes a nop.
- */
-
-void dev_load(struct net *net, const char *name)
-{
- struct net_device *dev;
- int no_module;
-
- rcu_read_lock();
- dev = dev_get_by_name_rcu(net, name);
- rcu_read_unlock();
-
- no_module = !dev;
- if (no_module && capable(CAP_NET_ADMIN))
- no_module = request_module("netdev-%s", name);
- if (no_module && capable(CAP_SYS_MODULE)) {
- if (!request_module("%s", name))
- pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
- name);
- }
-}
-EXPORT_SYMBOL(dev_load);
-
static int __dev_open(struct net_device *dev)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -1267,6 +1200,14 @@ static int __dev_open(struct net_device *dev)
if (!netif_device_present(dev))
return -ENODEV;
+ /* Block netpoll from trying to do any rx path servicing.
+ * If we don't do this there is a chance ndo_poll_controller
+ * or ndo_poll may be running while we open the device
+ */
+ ret = netpoll_rx_disable(dev);
+ if (ret)
+ return ret;
+
ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
ret = notifier_to_errno(ret);
if (ret)
@@ -1280,6 +1221,8 @@ static int __dev_open(struct net_device *dev)
if (!ret && ops->ndo_open)
ret = ops->ndo_open(dev);
+ netpoll_rx_enable(dev);
+
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
@@ -1371,9 +1314,16 @@ static int __dev_close(struct net_device *dev)
int retval;
LIST_HEAD(single);
+ /* Temporarily disable netpoll until the interface is down */
+ retval = netpoll_rx_disable(dev);
+ if (retval)
+ return retval;
+
list_add(&dev->unreg_list, &single);
retval = __dev_close_many(&single);
list_del(&single);
+
+ netpoll_rx_enable(dev);
return retval;
}
@@ -1409,14 +1359,22 @@ static int dev_close_many(struct list_head *head)
*/
int dev_close(struct net_device *dev)
{
+ int ret = 0;
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
+ /* Block netpoll rx while the interface is going down */
+ ret = netpoll_rx_disable(dev);
+ if (ret)
+ return ret;
+
list_add(&dev->unreg_list, &single);
dev_close_many(&single);
list_del(&single);
+
+ netpoll_rx_enable(dev);
}
- return 0;
+ return ret;
}
EXPORT_SYMBOL(dev_close);
@@ -1621,57 +1579,6 @@ static inline void net_timestamp_set(struct sk_buff *skb)
__net_timestamp(SKB); \
} \
-static int net_hwtstamp_validate(struct ifreq *ifr)
-{
- struct hwtstamp_config cfg;
- enum hwtstamp_tx_types tx_type;
- enum hwtstamp_rx_filters rx_filter;
- int tx_type_valid = 0;
- int rx_filter_valid = 0;
-
- if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
- return -EFAULT;
-
- if (cfg.flags) /* reserved for future extensions */
- return -EINVAL;
-
- tx_type = cfg.tx_type;
- rx_filter = cfg.rx_filter;
-
- switch (tx_type) {
- case HWTSTAMP_TX_OFF:
- case HWTSTAMP_TX_ON:
- case HWTSTAMP_TX_ONESTEP_SYNC:
- tx_type_valid = 1;
- break;
- }
-
- switch (rx_filter) {
- case HWTSTAMP_FILTER_NONE:
- case HWTSTAMP_FILTER_ALL:
- case HWTSTAMP_FILTER_SOME:
- case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
- case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
- case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
- case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
- case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
- case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
- case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
- case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
- case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
- case HWTSTAMP_FILTER_PTP_V2_EVENT:
- case HWTSTAMP_FILTER_PTP_V2_SYNC:
- case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
- rx_filter_valid = 1;
- break;
- }
-
- if (!tx_type_valid || !rx_filter_valid)
- return -ERANGE;
-
- return 0;
-}
-
static inline bool is_skb_forwardable(struct net_device *dev,
struct sk_buff *skb)
{
@@ -1857,6 +1764,228 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
}
}
+#ifdef CONFIG_XPS
+static DEFINE_MUTEX(xps_map_mutex);
+#define xmap_dereference(P) \
+ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+
+static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
+ int cpu, u16 index)
+{
+ struct xps_map *map = NULL;
+ int pos;
+
+ if (dev_maps)
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] == index) {
+ if (map->len > 1) {
+ map->queues[pos] = map->queues[--map->len];
+ } else {
+ RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
+ kfree_rcu(map, rcu);
+ map = NULL;
+ }
+ break;
+ }
+ }
+
+ return map;
+}
+
+static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
+{
+ struct xps_dev_maps *dev_maps;
+ int cpu, i;
+ bool active = false;
+
+ mutex_lock(&xps_map_mutex);
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ for_each_possible_cpu(cpu) {
+ for (i = index; i < dev->num_tx_queues; i++) {
+ if (!remove_xps_queue(dev_maps, cpu, i))
+ break;
+ }
+ if (i == dev->num_tx_queues)
+ active = true;
+ }
+
+ if (!active) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+
+ for (i = index; i < dev->num_tx_queues; i++)
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
+ NUMA_NO_NODE);
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+}
+
+static struct xps_map *expand_xps_map(struct xps_map *map,
+ int cpu, u16 index)
+{
+ struct xps_map *new_map;
+ int alloc_len = XPS_MIN_MAP_ALLOC;
+ int i, pos;
+
+ for (pos = 0; map && pos < map->len; pos++) {
+ if (map->queues[pos] != index)
+ continue;
+ return map;
+ }
+
+ /* Need to add queue to this CPU's existing map */
+ if (map) {
+ if (pos < map->alloc_len)
+ return map;
+
+ alloc_len = map->alloc_len * 2;
+ }
+
+ /* Need to allocate new map to store queue on this CPU's map */
+ new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!new_map)
+ return NULL;
+
+ for (i = 0; i < pos; i++)
+ new_map->queues[i] = map->queues[i];
+ new_map->alloc_len = alloc_len;
+ new_map->len = pos;
+
+ return new_map;
+}
+
+int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
+{
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ struct xps_map *map, *new_map;
+ int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
+ int cpu, numa_node_id = -2;
+ bool active = false;
+
+ mutex_lock(&xps_map_mutex);
+
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ /* allocate memory for queue storage */
+ for_each_online_cpu(cpu) {
+ if (!cpumask_test_cpu(cpu, mask))
+ continue;
+
+ if (!new_dev_maps)
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps)
+ return -ENOMEM;
+
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ NULL;
+
+ map = expand_xps_map(map, cpu, index);
+ if (!map)
+ goto error;
+
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ }
+
+ if (!new_dev_maps)
+ goto out_no_new_maps;
+
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
+ /* add queue to CPU maps */
+ int pos = 0;
+
+ map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ while ((pos < map->len) && (map->queues[pos] != index))
+ pos++;
+
+ if (pos == map->len)
+ map->queues[map->len++] = index;
+#ifdef CONFIG_NUMA
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(cpu);
+ else if (numa_node_id != cpu_to_node(cpu))
+ numa_node_id = -1;
+#endif
+ } else if (dev_maps) {
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
+ }
+
+ }
+
+ rcu_assign_pointer(dev->xps_maps, new_dev_maps);
+
+ /* Cleanup old maps */
+ if (dev_maps) {
+ for_each_possible_cpu(cpu) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = xmap_dereference(dev_maps->cpu_map[cpu]);
+ if (map && map != new_map)
+ kfree_rcu(map, rcu);
+ }
+
+ kfree_rcu(dev_maps, rcu);
+ }
+
+ dev_maps = new_dev_maps;
+ active = true;
+
+out_no_new_maps:
+ /* update Tx queue numa node */
+ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
+ (numa_node_id >= 0) ? numa_node_id :
+ NUMA_NO_NODE);
+
+ if (!dev_maps)
+ goto out_no_maps;
+
+ /* removes queue from unused CPUs */
+ for_each_possible_cpu(cpu) {
+ if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
+ continue;
+
+ if (remove_xps_queue(dev_maps, cpu, index))
+ active = true;
+ }
+
+ /* free map if not active */
+ if (!active) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+
+out_no_maps:
+ mutex_unlock(&xps_map_mutex);
+
+ return 0;
+error:
+ /* remove any maps that we added */
+ for_each_possible_cpu(cpu) {
+ new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
+ map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
+ NULL;
+ if (new_map && new_map != map)
+ kfree(new_map);
+ }
+
+ mutex_unlock(&xps_map_mutex);
+
+ kfree(new_dev_maps);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(netif_set_xps_queue);
+
+#endif
/*
* Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
* greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -1880,8 +2009,12 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->num_tc)
netif_setup_tc(dev, txq);
- if (txq < dev->real_num_tx_queues)
+ if (txq < dev->real_num_tx_queues) {
qdisc_reset_all_tx_gt(dev, txq);
+#ifdef CONFIG_XPS
+ netif_reset_xps_queues_gt(dev, txq);
+#endif
+ }
}
dev->real_num_tx_queues = txq;
@@ -2046,6 +2179,15 @@ int skb_checksum_help(struct sk_buff *skb)
return -EINVAL;
}
+ /* Before computing a checksum, we should make sure no frag could
+ * be modified by an external entity : checksum could be wrong.
+ */
+ if (skb_has_shared_frag(skb)) {
+ ret = __skb_linearize(skb);
+ if (ret)
+ goto out;
+ }
+
offset = skb_checksum_start_offset(skb);
BUG_ON(offset >= skb_headlen(skb));
csum = skb_checksum(skb, offset, skb->len - offset, 0);
@@ -2069,25 +2211,19 @@ out:
EXPORT_SYMBOL(skb_checksum_help);
/**
- * skb_gso_segment - Perform segmentation on skb.
+ * skb_mac_gso_segment - mac layer segmentation handler.
* @skb: buffer to segment
* @features: features for the output path (see dev->features)
- *
- * This function segments the given skb and returns a list of segments.
- *
- * It may return NULL if the skb requires no segmentation. This is
- * only possible when GSO is used for verifying header integrity.
*/
-struct sk_buff *skb_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_offload *ptype;
__be16 type = skb->protocol;
- int vlan_depth = ETH_HLEN;
- int err;
while (type == htons(ETH_P_8021Q)) {
+ int vlan_depth = ETH_HLEN;
struct vlan_hdr *vh;
if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
@@ -2098,22 +2234,14 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
vlan_depth += VLAN_HLEN;
}
- skb_reset_mac_header(skb);
- skb->mac_len = skb->network_header - skb->mac_header;
__skb_pull(skb, skb->mac_len);
- if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
- skb_warn_bad_offload(skb);
-
- if (skb_header_cloned(skb) &&
- (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
- return ERR_PTR(err);
- }
-
rcu_read_lock();
list_for_each_entry_rcu(ptype, &offload_base, list) {
if (ptype->type == type && ptype->callbacks.gso_segment) {
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
+ int err;
+
err = ptype->callbacks.gso_send_check(skb);
segs = ERR_PTR(err);
if (err || skb_gso_ok(skb, features))
@@ -2131,7 +2259,50 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb,
return segs;
}
-EXPORT_SYMBOL(skb_gso_segment);
+EXPORT_SYMBOL(skb_mac_gso_segment);
+
+
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
+{
+ if (tx_path)
+ return skb->ip_summed != CHECKSUM_PARTIAL;
+ else
+ return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, bool tx_path)
+{
+ if (unlikely(skb_needs_check(skb, tx_path))) {
+ int err;
+
+ skb_warn_bad_offload(skb);
+
+ if (skb_header_cloned(skb) &&
+ (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ return ERR_PTR(err);
+ }
+
+ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb_mac_gso_segment(skb, features);
+}
+EXPORT_SYMBOL(__skb_gso_segment);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
@@ -2410,126 +2581,28 @@ out:
return rc;
}
-static u32 hashrnd __read_mostly;
-
-/*
- * Returns a Tx hash based on the given packet descriptor a Tx queues' number
- * to be used as a distribution range.
- */
-u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
- unsigned int num_tx_queues)
+static void qdisc_pkt_len_init(struct sk_buff *skb)
{
- u32 hash;
- u16 qoffset = 0;
- u16 qcount = num_tx_queues;
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
- if (skb_rx_queue_recorded(skb)) {
- hash = skb_get_rx_queue(skb);
- while (unlikely(hash >= num_tx_queues))
- hash -= num_tx_queues;
- return hash;
- }
-
- if (dev->num_tc) {
- u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
- qoffset = dev->tc_to_txq[tc].offset;
- qcount = dev->tc_to_txq[tc].count;
- }
-
- if (skb->sk && skb->sk->sk_hash)
- hash = skb->sk->sk_hash;
- else
- hash = (__force u16) skb->protocol;
- hash = jhash_1word(hash, hashrnd);
-
- return (u16) (((u64) hash * qcount) >> 32) + qoffset;
-}
-EXPORT_SYMBOL(__skb_tx_hash);
-
-static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
-{
- if (unlikely(queue_index >= dev->real_num_tx_queues)) {
- net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
- dev->name, queue_index,
- dev->real_num_tx_queues);
- return 0;
- }
- return queue_index;
-}
-
-static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
-{
-#ifdef CONFIG_XPS
- struct xps_dev_maps *dev_maps;
- struct xps_map *map;
- int queue_index = -1;
-
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_maps);
- if (dev_maps) {
- map = rcu_dereference(
- dev_maps->cpu_map[raw_smp_processor_id()]);
- if (map) {
- if (map->len == 1)
- queue_index = map->queues[0];
- else {
- u32 hash;
- if (skb->sk && skb->sk->sk_hash)
- hash = skb->sk->sk_hash;
- else
- hash = (__force u16) skb->protocol ^
- skb->rxhash;
- hash = jhash_1word(hash, hashrnd);
- queue_index = map->queues[
- ((u64)hash * map->len) >> 32];
- }
- if (unlikely(queue_index >= dev->real_num_tx_queues))
- queue_index = -1;
- }
- }
- rcu_read_unlock();
-
- return queue_index;
-#else
- return -1;
-#endif
-}
-
-struct netdev_queue *netdev_pick_tx(struct net_device *dev,
- struct sk_buff *skb)
-{
- int queue_index;
- const struct net_device_ops *ops = dev->netdev_ops;
-
- if (dev->real_num_tx_queues == 1)
- queue_index = 0;
- else if (ops->ndo_select_queue) {
- queue_index = ops->ndo_select_queue(dev, skb);
- queue_index = dev_cap_txqueue(dev, queue_index);
- } else {
- struct sock *sk = skb->sk;
- queue_index = sk_tx_queue_get(sk);
-
- if (queue_index < 0 || skb->ooo_okay ||
- queue_index >= dev->real_num_tx_queues) {
- int old_index = queue_index;
+ qdisc_skb_cb(skb)->pkt_len = skb->len;
- queue_index = get_xps_queue(dev, skb);
- if (queue_index < 0)
- queue_index = skb_tx_hash(dev, skb);
+ /* To get more precise estimation of bytes sent on wire,
+ * we add to pkt_len the headers size of all segments
+ */
+ if (shinfo->gso_size) {
+ unsigned int hdr_len;
- if (queue_index != old_index && sk) {
- struct dst_entry *dst =
- rcu_dereference_check(sk->sk_dst_cache, 1);
+ /* mac layer + network layer */
+ hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
- if (dst && skb_dst(skb) == dst)
- sk_tx_queue_set(sk, queue_index);
- }
- }
+ /* + transport layer */
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ hdr_len += tcp_hdrlen(skb);
+ else
+ hdr_len += sizeof(struct udphdr);
+ qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
}
-
- skb_set_queue_mapping(skb, queue_index);
- return netdev_get_tx_queue(dev, queue_index);
}
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
@@ -2540,7 +2613,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
bool contended;
int rc;
- qdisc_skb_cb(skb)->pkt_len = skb->len;
+ qdisc_pkt_len_init(skb);
qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
@@ -2663,6 +2736,8 @@ int dev_queue_xmit(struct sk_buff *skb)
struct Qdisc *q;
int rc = -ENOMEM;
+ skb_reset_mac_header(skb);
+
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
@@ -2757,41 +2832,6 @@ static inline void ____napi_schedule(struct softnet_data *sd,
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
-/*
- * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
- * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
- * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
- * if hash is a canonical 4-tuple hash over transport ports.
- */
-void __skb_get_rxhash(struct sk_buff *skb)
-{
- struct flow_keys keys;
- u32 hash;
-
- if (!skb_flow_dissect(skb, &keys))
- return;
-
- if (keys.ports)
- skb->l4_rxhash = 1;
-
- /* get a consistent hash (same value on both flow directions) */
- if (((__force u32)keys.dst < (__force u32)keys.src) ||
- (((__force u32)keys.dst == (__force u32)keys.src) &&
- ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
- swap(keys.dst, keys.src);
- swap(keys.port16[0], keys.port16[1]);
- }
-
- hash = jhash_3words((__force u32)keys.dst,
- (__force u32)keys.src,
- (__force u32)keys.ports, hashrnd);
- if (!hash)
- hash = 1;
-
- skb->rxhash = hash;
-}
-EXPORT_SYMBOL(__skb_get_rxhash);
-
#ifdef CONFIG_RPS
/* One global table that all flow-based protocols share. */
@@ -3318,7 +3358,7 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
}
}
-static int __netif_receive_skb(struct sk_buff *skb)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
@@ -3327,24 +3367,11 @@ static int __netif_receive_skb(struct sk_buff *skb)
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
- unsigned long pflags = current->flags;
net_timestamp_check(!netdev_tstamp_prequeue, skb);
trace_netif_receive_skb(skb);
- /*
- * PFMEMALLOC skbs are special, they should
- * - be delivered to SOCK_MEMALLOC sockets only
- * - stay away from userspace
- * - have bounded memory usage
- *
- * Use PF_MEMALLOC as this saves us from propagating the allocation
- * context down to all allocation sites.
- */
- if (sk_memalloc_socks() && skb_pfmemalloc(skb))
- current->flags |= PF_MEMALLOC;
-
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb(skb))
goto out;
@@ -3352,7 +3379,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
orig_dev = skb->dev;
skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
pt_prev = NULL;
@@ -3377,7 +3405,7 @@ another_round:
}
#endif
- if (sk_memalloc_socks() && skb_pfmemalloc(skb))
+ if (pfmemalloc)
goto skip_taps;
list_for_each_entry_rcu(ptype, &ptype_all, list) {
@@ -3396,8 +3424,7 @@ skip_taps:
ncls:
#endif
- if (sk_memalloc_socks() && skb_pfmemalloc(skb)
- && !skb_pfmemalloc_protocol(skb))
+ if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
if (vlan_tx_tag_present(skb)) {
@@ -3467,7 +3494,31 @@ drop:
unlock:
rcu_read_unlock();
out:
- tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ return ret;
+}
+
+static int __netif_receive_skb(struct sk_buff *skb)
+{
+ int ret;
+
+ if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
+ unsigned long pflags = current->flags;
+
+ /*
+ * PFMEMALLOC skbs are special, they should
+ * - be delivered to SOCK_MEMALLOC sockets only
+ * - stay away from userspace
+ * - have bounded memory usage
+ *
+ * Use PF_MEMALLOC as this saves us from propagating the allocation
+ * context down to all allocation sites.
+ */
+ current->flags |= PF_MEMALLOC;
+ ret = __netif_receive_skb_core(skb, true);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+ } else
+ ret = __netif_receive_skb_core(skb, false);
+
return ret;
}
@@ -3634,7 +3685,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
__be16 type = skb->protocol;
struct list_head *head = &offload_base;
int same_flow;
- int mac_len;
enum gro_result ret;
if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
@@ -3651,8 +3701,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
continue;
skb_set_network_header(skb, skb_gro_offset(skb));
- mac_len = skb->network_header - skb->mac_header;
- skb->mac_len = mac_len;
+ skb_reset_mac_len(skb);
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
@@ -4134,530 +4183,231 @@ softnet_break:
goto out;
}
-static gifconf_func_t *gifconf_list[NPROTO];
-
-/**
- * register_gifconf - register a SIOCGIF handler
- * @family: Address family
- * @gifconf: Function handler
- *
- * Register protocol dependent address dumping routines. The handler
- * that is passed must not be freed or reused until it has been replaced
- * by another handler.
- */
-int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
-{
- if (family >= NPROTO)
- return -EINVAL;
- gifconf_list[family] = gifconf;
- return 0;
-}
-EXPORT_SYMBOL(register_gifconf);
-
-
-/*
- * Map an interface index to its name (SIOCGIFNAME)
- */
-
-/*
- * We need this ioctl for efficient implementation of the
- * if_indextoname() function required by the IPv6 API. Without
- * it, we would have to search all the interfaces to find a
- * match. --pb
- */
-
-static int dev_ifname(struct net *net, struct ifreq __user *arg)
-{
+struct netdev_upper {
struct net_device *dev;
- struct ifreq ifr;
- unsigned seq;
-
- /*
- * Fetch the caller's info block.
- */
+ bool master;
+ struct list_head list;
+ struct rcu_head rcu;
+ struct list_head search_list;
+};
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- return -EFAULT;
+static void __append_search_uppers(struct list_head *search_list,
+ struct net_device *dev)
+{
+ struct netdev_upper *upper;
-retry:
- seq = read_seqcount_begin(&devnet_rename_seq);
- rcu_read_lock();
- dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
- if (!dev) {
- rcu_read_unlock();
- return -ENODEV;
+ list_for_each_entry(upper, &dev->upper_dev_list, list) {
+ /* check if this upper is not already in search list */
+ if (list_empty(&upper->search_list))
+ list_add_tail(&upper->search_list, search_list);
}
-
- strcpy(ifr.ifr_name, dev->name);
- rcu_read_unlock();
- if (read_seqcount_retry(&devnet_rename_seq, seq))
- goto retry;
-
- if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
- return -EFAULT;
- return 0;
}
-/*
- * Perform a SIOCGIFCONF call. This structure will change
- * size eventually, and there is nothing I can do about it.
- * Thus we will need a 'compatibility mode'.
- */
-
-static int dev_ifconf(struct net *net, char __user *arg)
+static bool __netdev_search_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
{
- struct ifconf ifc;
- struct net_device *dev;
- char __user *pos;
- int len;
- int total;
- int i;
-
- /*
- * Fetch the caller's info block.
- */
-
- if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
- return -EFAULT;
-
- pos = ifc.ifc_buf;
- len = ifc.ifc_len;
+ LIST_HEAD(search_list);
+ struct netdev_upper *upper;
+ struct netdev_upper *tmp;
+ bool ret = false;
- /*
- * Loop over the interfaces, and write an info block for each.
- */
-
- total = 0;
- for_each_netdev(net, dev) {
- for (i = 0; i < NPROTO; i++) {
- if (gifconf_list[i]) {
- int done;
- if (!pos)
- done = gifconf_list[i](dev, NULL, 0);
- else
- done = gifconf_list[i](dev, pos + total,
- len - total);
- if (done < 0)
- return -EFAULT;
- total += done;
- }
+ __append_search_uppers(&search_list, dev);
+ list_for_each_entry(upper, &search_list, search_list) {
+ if (upper->dev == upper_dev) {
+ ret = true;
+ break;
}
+ __append_search_uppers(&search_list, upper->dev);
}
-
- /*
- * All done. Write the updated control block back to the caller.
- */
- ifc.ifc_len = total;
-
- /*
- * Both BSD and Solaris return 0 here, so we do too.
- */
- return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
+ list_for_each_entry_safe(upper, tmp, &search_list, search_list)
+ INIT_LIST_HEAD(&upper->search_list);
+ return ret;
}
-#ifdef CONFIG_PROC_FS
-
-#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
-
-#define get_bucket(x) ((x) >> BUCKET_SPACE)
-#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
-#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
-
-static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
+static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
+ struct net_device *upper_dev)
{
- struct net *net = seq_file_net(seq);
- struct net_device *dev;
- struct hlist_node *p;
- struct hlist_head *h;
- unsigned int count = 0, offset = get_offset(*pos);
+ struct netdev_upper *upper;
- h = &net->dev_name_head[get_bucket(*pos)];
- hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
- if (++count == offset)
- return dev;
+ list_for_each_entry(upper, &dev->upper_dev_list, list) {
+ if (upper->dev == upper_dev)
+ return upper;
}
-
return NULL;
}
-static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
-{
- struct net_device *dev;
- unsigned int bucket;
-
- do {
- dev = dev_from_same_bucket(seq, pos);
- if (dev)
- return dev;
-
- bucket = get_bucket(*pos) + 1;
- *pos = set_bucket_offset(bucket, 1);
- } while (bucket < NETDEV_HASHENTRIES);
-
- return NULL;
-}
-
-/*
- * This is invoked by the /proc filesystem handler to display a device
- * in detail.
+/**
+ * netdev_has_upper_dev - Check if device is linked to an upper device
+ * @dev: device
+ * @upper_dev: upper device to check
+ *
+ * Find out if a device is linked to specified upper device and return true
+ * in case it is. Note that this checks only immediate upper device,
+ * not through a complete stack of devices. The caller must hold the RTNL lock.
*/
-void *dev_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
+bool netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
{
- rcu_read_lock();
- if (!*pos)
- return SEQ_START_TOKEN;
-
- if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
- return NULL;
-
- return dev_from_bucket(seq, pos);
-}
-
-void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- ++*pos;
- return dev_from_bucket(seq, pos);
-}
-
-void dev_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
-{
- struct rtnl_link_stats64 temp;
- const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
+ ASSERT_RTNL();
- seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
- "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
- dev->name, stats->rx_bytes, stats->rx_packets,
- stats->rx_errors,
- stats->rx_dropped + stats->rx_missed_errors,
- stats->rx_fifo_errors,
- stats->rx_length_errors + stats->rx_over_errors +
- stats->rx_crc_errors + stats->rx_frame_errors,
- stats->rx_compressed, stats->multicast,
- stats->tx_bytes, stats->tx_packets,
- stats->tx_errors, stats->tx_dropped,
- stats->tx_fifo_errors, stats->collisions,
- stats->tx_carrier_errors +
- stats->tx_aborted_errors +
- stats->tx_window_errors +
- stats->tx_heartbeat_errors,
- stats->tx_compressed);
+ return __netdev_find_upper(dev, upper_dev);
}
+EXPORT_SYMBOL(netdev_has_upper_dev);
-/*
- * Called from the PROCfs module. This now uses the new arbitrary sized
- * /proc/net interface to create /proc/net/dev
+/**
+ * netdev_has_any_upper_dev - Check if device is linked to some device
+ * @dev: device
+ *
+ * Find out if a device is linked to an upper device and return true in case
+ * it is. The caller must hold the RTNL lock.
*/
-static int dev_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Inter-| Receive "
- " | Transmit\n"
- " face |bytes packets errs drop fifo frame "
- "compressed multicast|bytes packets errs "
- "drop fifo colls carrier compressed\n");
- else
- dev_seq_printf_stats(seq, v);
- return 0;
-}
-
-static struct softnet_data *softnet_get_online(loff_t *pos)
-{
- struct softnet_data *sd = NULL;
-
- while (*pos < nr_cpu_ids)
- if (cpu_online(*pos)) {
- sd = &per_cpu(softnet_data, *pos);
- break;
- } else
- ++*pos;
- return sd;
-}
-
-static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
-{
- return softnet_get_online(pos);
-}
-
-static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+bool netdev_has_any_upper_dev(struct net_device *dev)
{
- ++*pos;
- return softnet_get_online(pos);
-}
-
-static void softnet_seq_stop(struct seq_file *seq, void *v)
-{
-}
-
-static int softnet_seq_show(struct seq_file *seq, void *v)
-{
- struct softnet_data *sd = v;
+ ASSERT_RTNL();
- seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- sd->processed, sd->dropped, sd->time_squeeze, 0,
- 0, 0, 0, 0, /* was fastroute */
- sd->cpu_collision, sd->received_rps);
- return 0;
+ return !list_empty(&dev->upper_dev_list);
}
+EXPORT_SYMBOL(netdev_has_any_upper_dev);
-static const struct seq_operations dev_seq_ops = {
- .start = dev_seq_start,
- .next = dev_seq_next,
- .stop = dev_seq_stop,
- .show = dev_seq_show,
-};
-
-static int dev_seq_open(struct inode *inode, struct file *file)
+/**
+ * netdev_master_upper_dev_get - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RTNL lock.
+ */
+struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
- return seq_open_net(inode, file, &dev_seq_ops,
- sizeof(struct seq_net_private));
-}
+ struct netdev_upper *upper;
-static const struct file_operations dev_seq_fops = {
- .owner = THIS_MODULE,
- .open = dev_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
+ ASSERT_RTNL();
-static const struct seq_operations softnet_seq_ops = {
- .start = softnet_seq_start,
- .next = softnet_seq_next,
- .stop = softnet_seq_stop,
- .show = softnet_seq_show,
-};
+ if (list_empty(&dev->upper_dev_list))
+ return NULL;
-static int softnet_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &softnet_seq_ops);
+ upper = list_first_entry(&dev->upper_dev_list,
+ struct netdev_upper, list);
+ if (likely(upper->master))
+ return upper->dev;
+ return NULL;
}
+EXPORT_SYMBOL(netdev_master_upper_dev_get);
-static const struct file_operations softnet_seq_fops = {
- .owner = THIS_MODULE,
- .open = softnet_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-static void *ptype_get_idx(loff_t pos)
+/**
+ * netdev_master_upper_dev_get_rcu - Get master upper device
+ * @dev: device
+ *
+ * Find a master upper device and return pointer to it or NULL in case
+ * it's not there. The caller must hold the RCU read lock.
+ */
+struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
- struct packet_type *pt = NULL;
- loff_t i = 0;
- int t;
+ struct netdev_upper *upper;
- list_for_each_entry_rcu(pt, &ptype_all, list) {
- if (i == pos)
- return pt;
- ++i;
- }
-
- for (t = 0; t < PTYPE_HASH_SIZE; t++) {
- list_for_each_entry_rcu(pt, &ptype_base[t], list) {
- if (i == pos)
- return pt;
- ++i;
- }
- }
+ upper = list_first_or_null_rcu(&dev->upper_dev_list,
+ struct netdev_upper, list);
+ if (upper && likely(upper->master))
+ return upper->dev;
return NULL;
}
+EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
-static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
+static int __netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev, bool master)
{
- rcu_read_lock();
- return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
-}
+ struct netdev_upper *upper;
-static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct packet_type *pt;
- struct list_head *nxt;
- int hash;
+ ASSERT_RTNL();
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return ptype_get_idx(0);
+ if (dev == upper_dev)
+ return -EBUSY;
- pt = v;
- nxt = pt->list.next;
- if (pt->type == htons(ETH_P_ALL)) {
- if (nxt != &ptype_all)
- goto found;
- hash = 0;
- nxt = ptype_base[0].next;
- } else
- hash = ntohs(pt->type) & PTYPE_HASH_MASK;
+ /* To prevent loops, check if dev is not upper device to upper_dev. */
+ if (__netdev_search_upper_dev(upper_dev, dev))
+ return -EBUSY;
- while (nxt == &ptype_base[hash]) {
- if (++hash >= PTYPE_HASH_SIZE)
- return NULL;
- nxt = ptype_base[hash].next;
- }
-found:
- return list_entry(nxt, struct packet_type, list);
-}
+ if (__netdev_find_upper(dev, upper_dev))
+ return -EEXIST;
-static void ptype_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
-{
- rcu_read_unlock();
-}
+ if (master && netdev_master_upper_dev_get(dev))
+ return -EBUSY;
-static int ptype_seq_show(struct seq_file *seq, void *v)
-{
- struct packet_type *pt = v;
+ upper = kmalloc(sizeof(*upper), GFP_KERNEL);
+ if (!upper)
+ return -ENOMEM;
- if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Type Device Function\n");
- else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
- if (pt->type == htons(ETH_P_ALL))
- seq_puts(seq, "ALL ");
- else
- seq_printf(seq, "%04x", ntohs(pt->type));
+ upper->dev = upper_dev;
+ upper->master = master;
+ INIT_LIST_HEAD(&upper->search_list);
- seq_printf(seq, " %-8s %pF\n",
- pt->dev ? pt->dev->name : "", pt->func);
- }
+ /* Ensure that master upper link is always the first item in list. */
+ if (master)
+ list_add_rcu(&upper->list, &dev->upper_dev_list);
+ else
+ list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
+ dev_hold(upper_dev);
return 0;
}
-static const struct seq_operations ptype_seq_ops = {
- .start = ptype_seq_start,
- .next = ptype_seq_next,
- .stop = ptype_seq_stop,
- .show = ptype_seq_show,
-};
-
-static int ptype_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open_net(inode, file, &ptype_seq_ops,
- sizeof(struct seq_net_private));
-}
-
-static const struct file_operations ptype_seq_fops = {
- .owner = THIS_MODULE,
- .open = ptype_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_net,
-};
-
-
-static int __net_init dev_proc_net_init(struct net *net)
-{
- int rc = -ENOMEM;
-
- if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
- goto out;
- if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
- goto out_dev;
- if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
- goto out_softnet;
-
- if (wext_proc_init(net))
- goto out_ptype;
- rc = 0;
-out:
- return rc;
-out_ptype:
- proc_net_remove(net, "ptype");
-out_softnet:
- proc_net_remove(net, "softnet_stat");
-out_dev:
- proc_net_remove(net, "dev");
- goto out;
-}
-
-static void __net_exit dev_proc_net_exit(struct net *net)
-{
- wext_proc_exit(net);
-
- proc_net_remove(net, "ptype");
- proc_net_remove(net, "softnet_stat");
- proc_net_remove(net, "dev");
-}
-
-static struct pernet_operations __net_initdata dev_proc_ops = {
- .init = dev_proc_net_init,
- .exit = dev_proc_net_exit,
-};
-
-static int __init dev_proc_init(void)
+/**
+ * netdev_upper_dev_link - Add a link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
+ *
+ * Adds a link to device which is upper to this one. The caller must hold
+ * the RTNL lock. On a failure a negative errno code is returned.
+ * On success the reference counts are adjusted and the function
+ * returns zero.
+ */
+int netdev_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
{
- return register_pernet_subsys(&dev_proc_ops);
+ return __netdev_upper_dev_link(dev, upper_dev, false);
}
-#else
-#define dev_proc_init() 0
-#endif /* CONFIG_PROC_FS */
-
+EXPORT_SYMBOL(netdev_upper_dev_link);
/**
- * netdev_set_master - set up master pointer
- * @slave: slave device
- * @master: new master device
+ * netdev_master_upper_dev_link - Add a master link to the upper device
+ * @dev: device
+ * @upper_dev: new upper device
*
- * Changes the master device of the slave. Pass %NULL to break the
- * bonding. The caller must hold the RTNL semaphore. On a failure
- * a negative errno code is returned. On success the reference counts
- * are adjusted and the function returns zero.
+ * Adds a link to device which is upper to this one. In this case, only
+ * one master upper device can be linked, although other non-master devices
+ * might be linked as well. The caller must hold the RTNL lock.
+ * On a failure a negative errno code is returned. On success the reference
+ * counts are adjusted and the function returns zero.
*/
-int netdev_set_master(struct net_device *slave, struct net_device *master)
+int netdev_master_upper_dev_link(struct net_device *dev,
+ struct net_device *upper_dev)
{
- struct net_device *old = slave->master;
-
- ASSERT_RTNL();
-
- if (master) {
- if (old)
- return -EBUSY;
- dev_hold(master);
- }
-
- slave->master = master;
-
- if (old)
- dev_put(old);
- return 0;
+ return __netdev_upper_dev_link(dev, upper_dev, true);
}
-EXPORT_SYMBOL(netdev_set_master);
+EXPORT_SYMBOL(netdev_master_upper_dev_link);
/**
- * netdev_set_bond_master - set up bonding master/slave pair
- * @slave: slave device
- * @master: new master device
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: new upper device
*
- * Changes the master device of the slave. Pass %NULL to break the
- * bonding. The caller must hold the RTNL semaphore. On a failure
- * a negative errno code is returned. On success %RTM_NEWLINK is sent
- * to the routing socket and the function returns zero.
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
*/
-int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
+void netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
{
- int err;
+ struct netdev_upper *upper;
ASSERT_RTNL();
- err = netdev_set_master(slave, master);
- if (err)
- return err;
- if (master)
- slave->flags |= IFF_SLAVE;
- else
- slave->flags &= ~IFF_SLAVE;
-
- rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
- return 0;
+ upper = __netdev_find_upper(dev, upper_dev);
+ if (!upper)
+ return;
+ list_del_rcu(&upper->list);
+ dev_put(upper_dev);
+ kfree_rcu(upper, rcu);
}
-EXPORT_SYMBOL(netdev_set_bond_master);
+EXPORT_SYMBOL(netdev_upper_dev_unlink);
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
@@ -5020,381 +4770,33 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
if (!netif_device_present(dev))
return -ENODEV;
err = ops->ndo_set_mac_address(dev, sa);
- if (!err)
- call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ if (err)
+ return err;
+ dev->addr_assign_type = NET_ADDR_SET;
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
- return err;
+ return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);
-/*
- * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
- */
-static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
-{
- int err;
- struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
-
- if (!dev)
- return -ENODEV;
-
- switch (cmd) {
- case SIOCGIFFLAGS: /* Get interface flags */
- ifr->ifr_flags = (short) dev_get_flags(dev);
- return 0;
-
- case SIOCGIFMETRIC: /* Get the metric on the interface
- (currently unused) */
- ifr->ifr_metric = 0;
- return 0;
-
- case SIOCGIFMTU: /* Get the MTU of a device */
- ifr->ifr_mtu = dev->mtu;
- return 0;
-
- case SIOCGIFHWADDR:
- if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
- else
- memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
- ifr->ifr_hwaddr.sa_family = dev->type;
- return 0;
-
- case SIOCGIFSLAVE:
- err = -EINVAL;
- break;
-
- case SIOCGIFMAP:
- ifr->ifr_map.mem_start = dev->mem_start;
- ifr->ifr_map.mem_end = dev->mem_end;
- ifr->ifr_map.base_addr = dev->base_addr;
- ifr->ifr_map.irq = dev->irq;
- ifr->ifr_map.dma = dev->dma;
- ifr->ifr_map.port = dev->if_port;
- return 0;
-
- case SIOCGIFINDEX:
- ifr->ifr_ifindex = dev->ifindex;
- return 0;
-
- case SIOCGIFTXQLEN:
- ifr->ifr_qlen = dev->tx_queue_len;
- return 0;
-
- default:
- /* dev_ioctl() should ensure this case
- * is never reached
- */
- WARN_ON(1);
- err = -ENOTTY;
- break;
-
- }
- return err;
-}
-
-/*
- * Perform the SIOCxIFxxx calls, inside rtnl_lock()
- */
-static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
-{
- int err;
- struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
- const struct net_device_ops *ops;
-
- if (!dev)
- return -ENODEV;
-
- ops = dev->netdev_ops;
-
- switch (cmd) {
- case SIOCSIFFLAGS: /* Set interface flags */
- return dev_change_flags(dev, ifr->ifr_flags);
-
- case SIOCSIFMETRIC: /* Set the metric on the interface
- (currently unused) */
- return -EOPNOTSUPP;
-
- case SIOCSIFMTU: /* Set the MTU of a device */
- return dev_set_mtu(dev, ifr->ifr_mtu);
-
- case SIOCSIFHWADDR:
- return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
-
- case SIOCSIFHWBROADCAST:
- if (ifr->ifr_hwaddr.sa_family != dev->type)
- return -EINVAL;
- memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
- min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
- call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
- return 0;
-
- case SIOCSIFMAP:
- if (ops->ndo_set_config) {
- if (!netif_device_present(dev))
- return -ENODEV;
- return ops->ndo_set_config(dev, &ifr->ifr_map);
- }
- return -EOPNOTSUPP;
-
- case SIOCADDMULTI:
- if (!ops->ndo_set_rx_mode ||
- ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
- return -EINVAL;
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
-
- case SIOCDELMULTI:
- if (!ops->ndo_set_rx_mode ||
- ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
- return -EINVAL;
- if (!netif_device_present(dev))
- return -ENODEV;
- return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
-
- case SIOCSIFTXQLEN:
- if (ifr->ifr_qlen < 0)
- return -EINVAL;
- dev->tx_queue_len = ifr->ifr_qlen;
- return 0;
-
- case SIOCSIFNAME:
- ifr->ifr_newname[IFNAMSIZ-1] = '\0';
- return dev_change_name(dev, ifr->ifr_newname);
-
- case SIOCSHWTSTAMP:
- err = net_hwtstamp_validate(ifr);
- if (err)
- return err;
- /* fall through */
-
- /*
- * Unknown or private ioctl
- */
- default:
- if ((cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15) ||
- cmd == SIOCBONDENSLAVE ||
- cmd == SIOCBONDRELEASE ||
- cmd == SIOCBONDSETHWADDR ||
- cmd == SIOCBONDSLAVEINFOQUERY ||
- cmd == SIOCBONDINFOQUERY ||
- cmd == SIOCBONDCHANGEACTIVE ||
- cmd == SIOCGMIIPHY ||
- cmd == SIOCGMIIREG ||
- cmd == SIOCSMIIREG ||
- cmd == SIOCBRADDIF ||
- cmd == SIOCBRDELIF ||
- cmd == SIOCSHWTSTAMP ||
- cmd == SIOCWANDEV) {
- err = -EOPNOTSUPP;
- if (ops->ndo_do_ioctl) {
- if (netif_device_present(dev))
- err = ops->ndo_do_ioctl(dev, ifr, cmd);
- else
- err = -ENODEV;
- }
- } else
- err = -EINVAL;
-
- }
- return err;
-}
-
-/*
- * This function handles all "interface"-type I/O control requests. The actual
- * 'doing' part of this is dev_ifsioc above.
- */
-
/**
- * dev_ioctl - network device ioctl
- * @net: the applicable net namespace
- * @cmd: command to issue
- * @arg: pointer to a struct ifreq in user space
+ * dev_change_carrier - Change device carrier
+ * @dev: device
+ * @new_carries: new value
*
- * Issue ioctl functions to devices. This is normally called by the
- * user space syscall interfaces but can sometimes be useful for
- * other purposes. The return value is the return from the syscall if
- * positive or a negative errno code on error.
+ * Change device carrier
*/
-
-int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
- struct ifreq ifr;
- int ret;
- char *colon;
-
- /* One special case: SIOCGIFCONF takes ifconf argument
- and requires shared lock, because it sleeps writing
- to user space.
- */
-
- if (cmd == SIOCGIFCONF) {
- rtnl_lock();
- ret = dev_ifconf(net, (char __user *) arg);
- rtnl_unlock();
- return ret;
- }
- if (cmd == SIOCGIFNAME)
- return dev_ifname(net, (struct ifreq __user *)arg);
-
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
- return -EFAULT;
-
- ifr.ifr_name[IFNAMSIZ-1] = 0;
-
- colon = strchr(ifr.ifr_name, ':');
- if (colon)
- *colon = 0;
-
- /*
- * See which interface the caller is talking about.
- */
-
- switch (cmd) {
- /*
- * These ioctl calls:
- * - can be done by all.
- * - atomic and do not require locking.
- * - return a value
- */
- case SIOCGIFFLAGS:
- case SIOCGIFMETRIC:
- case SIOCGIFMTU:
- case SIOCGIFHWADDR:
- case SIOCGIFSLAVE:
- case SIOCGIFMAP:
- case SIOCGIFINDEX:
- case SIOCGIFTXQLEN:
- dev_load(net, ifr.ifr_name);
- rcu_read_lock();
- ret = dev_ifsioc_locked(net, &ifr, cmd);
- rcu_read_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
-
- case SIOCETHTOOL:
- dev_load(net, ifr.ifr_name);
- rtnl_lock();
- ret = dev_ethtool(net, &ifr);
- rtnl_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
-
- /*
- * These ioctl calls:
- * - require superuser power.
- * - require strict serialization.
- * - return a value
- */
- case SIOCGMIIPHY:
- case SIOCGMIIREG:
- case SIOCSIFNAME:
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
- dev_load(net, ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(net, &ifr, cmd);
- rtnl_unlock();
- if (!ret) {
- if (colon)
- *colon = ':';
- if (copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- }
- return ret;
-
- /*
- * These ioctl calls:
- * - require superuser power.
- * - require strict serialization.
- * - do not return a value
- */
- case SIOCSIFMAP:
- case SIOCSIFTXQLEN:
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- /* fall through */
- /*
- * These ioctl calls:
- * - require local superuser power.
- * - require strict serialization.
- * - do not return a value
- */
- case SIOCSIFFLAGS:
- case SIOCSIFMETRIC:
- case SIOCSIFMTU:
- case SIOCSIFHWADDR:
- case SIOCSIFSLAVE:
- case SIOCADDMULTI:
- case SIOCDELMULTI:
- case SIOCSIFHWBROADCAST:
- case SIOCSMIIREG:
- case SIOCBONDENSLAVE:
- case SIOCBONDRELEASE:
- case SIOCBONDSETHWADDR:
- case SIOCBONDCHANGEACTIVE:
- case SIOCBRADDIF:
- case SIOCBRDELIF:
- case SIOCSHWTSTAMP:
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
- /* fall through */
- case SIOCBONDSLAVEINFOQUERY:
- case SIOCBONDINFOQUERY:
- dev_load(net, ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(net, &ifr, cmd);
- rtnl_unlock();
- return ret;
-
- case SIOCGIFMEM:
- /* Get the per device memory space. We can add this but
- * currently do not support it */
- case SIOCSIFMEM:
- /* Set the per device memory buffer space.
- * Not applicable in our case */
- case SIOCSIFLINK:
- return -ENOTTY;
+ const struct net_device_ops *ops = dev->netdev_ops;
- /*
- * Unknown or private ioctl.
- */
- default:
- if (cmd == SIOCWANDEV ||
- (cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15)) {
- dev_load(net, ifr.ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(net, &ifr, cmd);
- rtnl_unlock();
- if (!ret && copy_to_user(arg, &ifr,
- sizeof(struct ifreq)))
- ret = -EFAULT;
- return ret;
- }
- /* Take care of Wireless Extensions */
- if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
- return wext_handle_ioctl(net, &ifr, cmd, arg);
- return -ENOTTY;
- }
+ if (!ops->ndo_change_carrier)
+ return -EOPNOTSUPP;
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ return ops->ndo_change_carrier(dev, new_carrier);
}
-
+EXPORT_SYMBOL(dev_change_carrier);
/**
* dev_new_index - allocate an ifindex
@@ -5482,11 +4884,15 @@ static void rollback_registered_many(struct list_head *head)
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
- /* Notifier chain MUST detach us from master device. */
- WARN_ON(dev->master);
+ /* Notifier chain MUST detach us all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
/* Remove entries from kobject tree */
netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
}
synchronize_net();
@@ -5664,10 +5070,9 @@ static int netif_alloc_rx_queues(struct net_device *dev)
BUG_ON(count < 1);
rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
- if (!rx) {
- pr_err("netdev: Unable to allocate %u rx queues\n", count);
+ if (!rx)
return -ENOMEM;
- }
+
dev->_rx = rx;
for (i = 0; i < count; i++)
@@ -5698,10 +5103,9 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
BUG_ON(count < 1);
tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
- if (!tx) {
- pr_err("netdev: Unable to allocate %u tx queues\n", count);
+ if (!tx)
return -ENOMEM;
- }
+
dev->_tx = tx;
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
@@ -5760,6 +5164,14 @@ int register_netdevice(struct net_device *dev)
}
}
+ if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
+ (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
+ !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
+ netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
+ ret = -EINVAL;
+ goto err_uninit;
+ }
+
ret = -EBUSY;
if (!dev->ifindex)
dev->ifindex = dev_new_index(net);
@@ -5815,6 +5227,13 @@ int register_netdevice(struct net_device *dev)
list_netdevice(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
+ /* If the device has permanent device address, driver should
+ * set dev_addr and also addr_assign_type should be set to
+ * NET_ADDR_PERM (default value).
+ */
+ if (dev->addr_assign_type == NET_ADDR_PERM)
+ memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
+
/* Notify protocols, that a new device appeared. */
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
@@ -6173,10 +5592,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
alloc_size += NETDEV_ALIGN - 1;
p = kzalloc(alloc_size, GFP_KERNEL);
- if (!p) {
- pr_err("alloc_netdev: Unable to allocate device\n");
+ if (!p)
return NULL;
- }
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
@@ -6199,6 +5616,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->link_watch_list);
+ INIT_LIST_HEAD(&dev->upper_dev_list);
dev->priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev);
@@ -6842,19 +6260,9 @@ static int __init net_dev_init(void)
hotcpu_notifier(dev_cpu_callback, 0);
dst_init();
- dev_mcast_init();
rc = 0;
out:
return rc;
}
subsys_initcall(net_dev_init);
-
-static int __init initialize_hashrnd(void)
-{
- get_random_bytes(&hashrnd, sizeof(hashrnd));
- return 0;
-}
-
-late_initcall_sync(initialize_hashrnd);
-