From 5be5515a8ea198de6eb204a0ff25faf98b8ff719 Mon Sep 17 00:00:00 2001 From: Julio Faracco Date: Tue, 1 Oct 2019 11:39:04 -0300 Subject: net: core: dev: replace state xoff flag comparison by netif_xmit_stopped method Function netif_schedule_queue() has a hardcoded comparison between queue state and any xoff flag. This comparison does the same thing as method netif_xmit_stopped(). In terms of code clarity, it is better. See other methods like: generic_xdp_tx() and dev_direct_xmit(). Signed-off-by: Julio Faracco Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index bf3ed413abaf..21a9c2987cbb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2771,7 +2771,7 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) void netif_schedule_queue(struct netdev_queue *txq) { rcu_read_lock(); - if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { + if (!netif_xmit_stopped(txq)) { struct Qdisc *q = rcu_dereference(txq->qdisc); __netif_schedule(q); -- cgit v1.2.3 From ff92741270bf8b6e78aa885f166b68c7a67ab13a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:15 +0200 Subject: net: introduce name_node struct to be used in hashlist Introduce name_node structure to hold name of device and put it into hashlist instead of putting there struct net_device directly. Add a necessary infrastructure to manipulate the hashlist. This prepares the code to use the same hashlist for alternative names introduced later in this set. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/dev.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 79 insertions(+), 18 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 21a9c2987cbb..d2053d07c94a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -228,6 +228,67 @@ static inline void rps_unlock(struct softnet_data *sd) #endif } +static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, + const char *name) +{ + struct netdev_name_node *name_node; + + name_node = kmalloc(sizeof(*name_node), GFP_KERNEL); + if (!name_node) + return NULL; + INIT_HLIST_NODE(&name_node->hlist); + name_node->dev = dev; + name_node->name = name; + return name_node; +} + +static struct netdev_name_node * +netdev_name_node_head_alloc(struct net_device *dev) +{ + return netdev_name_node_alloc(dev, dev->name); +} + +static void netdev_name_node_free(struct netdev_name_node *name_node) +{ + kfree(name_node); +} + +static void netdev_name_node_add(struct net *net, + struct netdev_name_node *name_node) +{ + hlist_add_head_rcu(&name_node->hlist, + dev_name_hash(net, name_node->name)); +} + +static void netdev_name_node_del(struct netdev_name_node *name_node) +{ + hlist_del_rcu(&name_node->hlist); +} + +static struct netdev_name_node *netdev_name_node_lookup(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + +static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, + const char *name) +{ + struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *name_node; + + hlist_for_each_entry_rcu(name_node, head, hlist) + if (!strcmp(name_node->name, name)) + return name_node; + return NULL; +} + /* Device list insertion */ static void list_netdevice(struct net_device *dev) { @@ -237,7 +298,7 
@@ static void list_netdevice(struct net_device *dev) write_lock_bh(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); write_unlock_bh(&dev_base_lock); @@ -255,7 +316,7 @@ static void unlist_netdevice(struct net_device *dev) /* Unlink dev from the device chain */ write_lock_bh(&dev_base_lock); list_del_rcu(&dev->dev_list); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); write_unlock_bh(&dev_base_lock); @@ -733,14 +794,10 @@ EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); struct net_device *__dev_get_by_name(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); + struct netdev_name_node *node_name; - hlist_for_each_entry(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; - - return NULL; + node_name = netdev_name_node_lookup(net, name); + return node_name ? node_name->dev : NULL; } EXPORT_SYMBOL(__dev_get_by_name); @@ -758,14 +815,10 @@ EXPORT_SYMBOL(__dev_get_by_name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) { - struct net_device *dev; - struct hlist_head *head = dev_name_hash(net, name); - - hlist_for_each_entry_rcu(dev, head, name_hlist) - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; + struct netdev_name_node *node_name; - return NULL; + node_name = netdev_name_node_lookup_rcu(net, name); + return node_name ? 
node_name->dev : NULL; } EXPORT_SYMBOL(dev_get_by_name_rcu); @@ -1232,13 +1285,13 @@ rollback: netdev_adjacent_rename_links(dev, oldname); write_lock_bh(&dev_base_lock); - hlist_del_rcu(&dev->name_hlist); + netdev_name_node_del(dev->name_node); write_unlock_bh(&dev_base_lock); synchronize_rcu(); write_lock_bh(&dev_base_lock); - hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); + netdev_name_node_add(net, dev->name_node); write_unlock_bh(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); @@ -8264,6 +8317,8 @@ static void rollback_registered_many(struct list_head *head) dev_uc_flush(dev); dev_mc_flush(dev); + netdev_name_node_free(dev->name_node); + if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -8706,6 +8761,10 @@ int register_netdevice(struct net_device *dev) if (ret < 0) goto out; + dev->name_node = netdev_name_node_head_alloc(dev); + if (!dev->name_node) + goto out; + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -8827,6 +8886,8 @@ out: return ret; err_uninit: + if (dev->name_node) + netdev_name_node_free(dev->name_node); if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); if (dev->priv_destructor) -- cgit v1.2.3 From 36fbf1e52bd3ff8a5cb604955eedfc9350c2e6cc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:16 +0200 Subject: net: rtnetlink: add linkprop commands to add and delete alternative ifnames Add two commands to add and delete list of link properties. Implement the first property type along - alternative ifnames. Each net device can have multiple alternative names. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/dev.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d2053d07c94a..7a456c6a7ad8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -245,7 +245,13 @@ static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev, static struct netdev_name_node * netdev_name_node_head_alloc(struct net_device *dev) { - return netdev_name_node_alloc(dev, dev->name); + struct netdev_name_node *name_node; + + name_node = netdev_name_node_alloc(dev, dev->name); + if (!name_node) + return NULL; + INIT_LIST_HEAD(&name_node->list); + return name_node; } static void netdev_name_node_free(struct netdev_name_node *name_node) @@ -289,6 +295,55 @@ static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, return NULL; } +int netdev_name_node_alt_create(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (name_node) + return -EEXIST; + name_node = netdev_name_node_alloc(dev, name); + if (!name_node) + return -ENOMEM; + netdev_name_node_add(net, name_node); + /* The node that holds dev->name acts as a head of per-device list. 
*/ + list_add_tail(&name_node->list, &dev->name_node->list); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_create); + +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + list_del(&name_node->list); + netdev_name_node_del(name_node); + kfree(name_node->name); + netdev_name_node_free(name_node); +} + +int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) +{ + struct netdev_name_node *name_node; + struct net *net = dev_net(dev); + + name_node = netdev_name_node_lookup(net, name); + if (!name_node) + return -ENOENT; + __netdev_name_node_alt_destroy(name_node); + + return 0; +} +EXPORT_SYMBOL(netdev_name_node_alt_destroy); + +static void netdev_name_node_alt_flush(struct net_device *dev) +{ + struct netdev_name_node *name_node, *tmp; + + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) + __netdev_name_node_alt_destroy(name_node); +} + /* Device list insertion */ static void list_netdevice(struct net_device *dev) { @@ -8317,6 +8372,7 @@ static void rollback_registered_many(struct list_head *head) dev_uc_flush(dev); dev_mc_flush(dev); + netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); if (dev->netdev_ops->ndo_uninit) -- cgit v1.2.3 From afa0df5998131153ec3036f41e76ece33bf1334f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 10:15:09 +0200 Subject: net: push loops and nb calls into helper functions Push iterations over net namespaces and netdevices from register_netdevice_notifier() and unregister_netdevice_notifier() into helper functions. Along with that introduce continue_reverse macros to make the code a bit nicer allowing to get rid of "last" marks. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/dev.c | 89 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 61 insertions(+), 28 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7a456c6a7ad8..a8b70cb6c732 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1725,6 +1725,62 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, return nb->notifier_call(nb, val, &info); } +static int call_netdevice_register_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + int err; + + err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); + err = notifier_to_errno(err); + if (err) + return err; + + if (!(dev->flags & IFF_UP)) + return 0; + + call_netdevice_notifier(nb, NETDEV_UP, dev); + return 0; +} + +static void call_netdevice_unregister_notifiers(struct notifier_block *nb, + struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + call_netdevice_notifier(nb, NETDEV_GOING_DOWN, + dev); + call_netdevice_notifier(nb, NETDEV_DOWN, dev); + } + call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); +} + +static int call_netdevice_register_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + err = call_netdevice_register_notifiers(nb, dev); + if (err) + goto rollback; + } + return 0; + +rollback: + for_each_netdev_continue_reverse(net, dev) + call_netdevice_unregister_notifiers(nb, dev); + return err; +} + +static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb, + struct net *net) +{ + struct net_device *dev; + + for_each_netdev(net, dev) + call_netdevice_unregister_notifiers(nb, dev); +} + static int dev_boot_phase = 1; /** @@ -1743,8 +1799,6 @@ static int dev_boot_phase = 1; int register_netdevice_notifier(struct notifier_block *nb) { - struct net_device *dev; - struct net_device *last; struct net *net; int err; @@ -1757,17 +1811,9 @@ int register_netdevice_notifier(struct notifier_block *nb) 
if (dev_boot_phase) goto unlock; for_each_net(net) { - for_each_netdev(net, dev) { - err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); - err = notifier_to_errno(err); - if (err) - goto rollback; - - if (!(dev->flags & IFF_UP)) - continue; - - call_netdevice_notifier(nb, NETDEV_UP, dev); - } + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto rollback; } unlock: @@ -1776,22 +1822,9 @@ unlock: return err; rollback: - last = dev; - for_each_net(net) { - for_each_netdev(net, dev) { - if (dev == last) - goto outroll; - - if (dev->flags & IFF_UP) { - call_netdevice_notifier(nb, NETDEV_GOING_DOWN, - dev); - call_netdevice_notifier(nb, NETDEV_DOWN, dev); - } - call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); - } - } + for_each_net_continue_reverse(net) + call_netdevice_unregister_net_notifiers(nb, net); -outroll: raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; } -- cgit v1.2.3 From a30c7b429f2dd980202c912fcb76442364937b4d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 10:15:10 +0200 Subject: net: introduce per-netns netdevice notifiers Often the code for example in drivers is interested in getting notifier call only from certain network namespace. In addition to the existing global netdevice notifier chain introduce per-netns chains and allow users to register to that. Eventually this would eliminate unnecessary overhead in case there are many netdevices in many network namespaces. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/dev.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index a8b70cb6c732..c680225e0da8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1874,6 +1874,80 @@ unlock: } EXPORT_SYMBOL(unregister_netdevice_notifier); +/** + * register_netdevice_notifier_net - register a per-netns network notifier block + * @net: network namespace + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(&net->netdev_chain, nb); + if (err) + goto unlock; + if (dev_boot_phase) + goto unlock; + + err = call_netdevice_register_net_notifiers(nb, net); + if (err) + goto chain_unregister; + +unlock: + rtnl_unlock(); + return err; + +chain_unregister: + raw_notifier_chain_unregister(&net->netdev_chain, nb); + goto unlock; +} +EXPORT_SYMBOL(register_netdevice_notifier_net); + +/** + * unregister_netdevice_notifier_net - unregister a per-netns + * network notifier block + * @net: network namespace + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier_net(). The notifier is unlinked from the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + * + * After unregistering unregister and down device events are synthesized + * for all devices on the device list to the removed notifier to remove + * the need for special case cleanup code. 
+ */ + +int unregister_netdevice_notifier_net(struct net *net, + struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = raw_notifier_chain_unregister(&net->netdev_chain, nb); + if (err) + goto unlock; + + call_netdevice_unregister_net_notifiers(nb, net); + +unlock: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(unregister_netdevice_notifier_net); + /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function @@ -1886,7 +1960,18 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); static int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info) { + struct net *net = dev_net(info->dev); + int ret; + ASSERT_RTNL(); + + /* Run per-netns notifier block chain first, then run the global one. + * Hopefully, one day, the global one is going to be removed after + * all notifier block registrators get converted to be per-netns. + */ + ret = raw_notifier_call_chain(&net->netdev_chain, val, info); + if (ret & NOTIFY_STOP_MASK) + return ret; return raw_notifier_call_chain(&netdev_chain, val, info); } @@ -9785,6 +9870,8 @@ static int __net_init netdev_init(struct net *net) if (net->dev_index_head == NULL) goto err_idx; + RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain); + return 0; err_idx: -- cgit v1.2.3 From 9077f052abd5391a866dd99e27212213648becef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Oct 2019 08:59:24 -0700 Subject: net: propagate errors correctly in register_netdevice() If netdev_name_node_head_alloc() fails to allocate memory, we absolutely want register_netdevice() to return -ENOMEM instead of zero :/ One of the syzbot report looked like : general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 8760 Comm: syz-executor839 Not tainted 5.3.0+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:ovs_vport_add+0x185/0x500 net/openvswitch/vport.c:205 Code: 89 c6 e8 3e b6 3a fa 49 81 fc 00 f0 ff ff 0f 87 6d 
02 00 00 e8 8c b4 3a fa 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 d3 02 00 00 49 8d 7c 24 08 49 8b 34 24 48 b8 00 RSP: 0018:ffff88808fe5f4e0 EFLAGS: 00010247 RAX: dffffc0000000000 RBX: ffffffff89be8820 RCX: ffffffff87385162 RDX: 0000000000000000 RSI: ffffffff87385174 RDI: 0000000000000007 RBP: ffff88808fe5f510 R08: ffff8880933c6600 R09: fffffbfff14ee13c R10: fffffbfff14ee13b R11: ffffffff8a7709df R12: 0000000000000004 R13: ffffffff89be8850 R14: ffff88808fe5f5e0 R15: 0000000000000002 FS: 0000000001d71880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000280 CR3: 0000000096e4c000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: new_vport+0x1b/0x1d0 net/openvswitch/datapath.c:194 ovs_dp_cmd_new+0x5e5/0xe30 net/openvswitch/datapath.c:1644 genl_family_rcv_msg+0x74b/0xf90 net/netlink/genetlink.c:629 genl_rcv_msg+0xca/0x170 net/netlink/genetlink.c:654 netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477 genl_rcv+0x29/0x40 net/netlink/genetlink.c:665 netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline] netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328 netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x803/0x920 net/socket.c:2311 __sys_sendmsg+0x105/0x1d0 net/socket.c:2356 __do_sys_sendmsg net/socket.c:2365 [inline] __se_sys_sendmsg net/socket.c:2363 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2363 Fixes: ff92741270bf ("net: introduce name_node struct to be used in hashlist") Signed-off-by: Eric Dumazet Cc: Jiri Pirko Reported-by: syzbot Tested-by: Willem de Bruijn Reviewed-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index c680225e0da8..944de67ee95d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8935,6 +8935,7 @@ int register_netdevice(struct net_device *dev) if (ret < 0) goto out; + ret = -ENOMEM; dev->name_node = netdev_name_node_head_alloc(dev); if (!dev->name_node) goto out; -- cgit v1.2.3 From 8211fbfaf2fe66ac4ca28bb52b4e7f61dcac0378 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 6 Oct 2019 18:52:43 +0200 Subject: net: core: use helper skb_ensure_writable in more places Use helper skb_ensure_writable in two more places to simplify the code. Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- net/core/dev.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 944de67ee95d..7d05e042c6ba 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3165,12 +3165,9 @@ int skb_checksum_help(struct sk_buff *skb) offset += skb->csum_offset; BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__sum16))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + ret = skb_ensure_writable(skb, offset + sizeof(__sum16)); + if (ret) + goto out; *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: @@ -3205,12 +3202,11 @@ int skb_crc32c_csum_help(struct sk_buff *skb) ret = -EINVAL; goto out; } - if (skb_cloned(skb) && - !skb_clone_writable(skb, offset + sizeof(__le32))) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } + + ret = skb_ensure_writable(skb, offset + sizeof(__le32)); + if (ret) + goto out; + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, skb->len - start, ~(__u32)0, crc32c_csum_stub)); -- cgit v1.2.3 From bacb7e1855969bba78b32302453d2cc8ba0bc403 Mon Sep 17 
00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Oct 2019 14:20:34 -0700 Subject: Revert "tun: call dev_get_valid_name() before register_netdevice()" This reverts commit 0ad646c81b2182f7fa67ec0c8c825e0ee165696d. As noticed by Jakub, this is no longer needed after commit 11fc7d5a0a2d ("tun: fix memory leak in error path") This no longer exports dev_get_valid_name() for the exclusive use of tun driver. Suggested-by: Jakub Kicinski Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/core/dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 7d05e042c6ba..8bc3dce71fc0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1249,8 +1249,8 @@ int dev_alloc_name(struct net_device *dev, const char *name) } EXPORT_SYMBOL(dev_alloc_name); -int dev_get_valid_name(struct net *net, struct net_device *dev, - const char *name) +static int dev_get_valid_name(struct net *net, struct net_device *dev, + const char *name) { BUG_ON(!net); @@ -1266,7 +1266,6 @@ int dev_get_valid_name(struct net *net, struct net_device *dev, return 0; } -EXPORT_SYMBOL(dev_get_valid_name); /** * dev_change_name - change name of a device -- cgit v1.2.3 From 6570bc79c0dfff0f228b7afd2de720fb4e84d61d Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 14 Oct 2019 11:00:33 +0300 Subject: net: core: use listified Rx for GRO_NORMAL in napi_gro_receive() Commit 323ebb61e32b4 ("net: use listified RX for handling GRO_NORMAL skbs") made use of listified skb processing for the users of napi_gro_frags(). The same technique can be used in a way more common napi_gro_receive() to speed up non-merged (GRO_NORMAL) skbs for a wide range of drivers including gro_cells and mac80211 users. This slightly changes the return value in cases where skb is being dropped by the core stack, but it seems to have no impact on related drivers' functionality. 
gro_normal_batch is left untouched as it's very individual for every single system configuration and might be tuned in manual order to achieve an optimal performance. Signed-off-by: Alexander Lobakin Acked-by: Edward Cree Signed-off-by: David S. Miller --- net/core/dev.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 8bc3dce71fc0..74f593986524 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5884,6 +5884,26 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); +/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ +static void gro_normal_list(struct napi_struct *napi) +{ + if (!napi->rx_count) + return; + netif_receive_skb_list_internal(&napi->rx_list); + INIT_LIST_HEAD(&napi->rx_list); + napi->rx_count = 0; +} + +/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, + * pass the whole batch up to the stack. 
+ */ +static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) +{ + list_add_tail(&skb->list, &napi->rx_list); + if (++napi->rx_count >= gro_normal_batch) + gro_normal_list(napi); +} + static void napi_skb_free_stolen_head(struct sk_buff *skb) { skb_dst_drop(skb); @@ -5891,12 +5911,13 @@ static void napi_skb_free_stolen_head(struct sk_buff *skb) kmem_cache_free(skbuff_head_cache, skb); } -static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) +static gro_result_t napi_skb_finish(struct napi_struct *napi, + struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - if (netif_receive_skb_internal(skb)) - ret = GRO_DROP; + gro_normal_one(napi, skb); break; case GRO_DROP: @@ -5928,7 +5949,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_gro_reset_offset(skb); - ret = napi_skb_finish(dev_gro_receive(napi, skb), skb); + ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); return ret; @@ -5974,26 +5995,6 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi) } EXPORT_SYMBOL(napi_get_frags); -/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static void gro_normal_list(struct napi_struct *napi) -{ - if (!napi->rx_count) - return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; -} - -/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, - * pass the whole batch up to the stack. 
- */ -static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb) -{ - list_add_tail(&skb->list, &napi->rx_list); - if (++napi->rx_count >= gro_normal_batch) - gro_normal_list(napi); -} - static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) -- cgit v1.2.3 From 90b2be27bb0e56483f335cc10fb59ec66882b949 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Nov 2019 08:45:23 -0800 Subject: net/sched: annotate lockless accesses to qdisc->empty KCSAN reported the following race [1] BUG: KCSAN: data-race in __dev_queue_xmit / net_tx_action read to 0xffff8880ba403508 of 1 bytes by task 21814 on cpu 1: __dev_xmit_skb net/core/dev.c:3389 [inline] __dev_queue_xmit+0x9db/0x1b40 net/core/dev.c:3761 dev_queue_xmit+0x21/0x30 net/core/dev.c:3825 neigh_hh_output include/net/neighbour.h:500 [inline] neigh_output include/net/neighbour.h:509 [inline] ip6_finish_output2+0x873/0xec0 net/ipv6/ip6_output.c:116 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311 __sys_sendmmsg+0x123/0x350 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 write to 0xffff8880ba403508 of 1 bytes by interrupt on cpu 0: 
qdisc_run_begin include/net/sch_generic.h:160 [inline] qdisc_run include/net/pkt_sched.h:120 [inline] net_tx_action+0x2b1/0x6c0 net/core/dev.c:4551 __do_softirq+0x115/0x33f kernel/softirq.c:292 do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1082 do_softirq.part.0+0x6b/0x80 kernel/softirq.c:337 do_softirq kernel/softirq.c:329 [inline] __local_bh_enable_ip+0x76/0x80 kernel/softirq.c:189 local_bh_enable include/linux/bottom_half.h:32 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:688 [inline] ip6_finish_output2+0x7bb/0xec0 net/ipv6/ip6_output.c:117 __ip6_finish_output net/ipv6/ip6_output.c:142 [inline] __ip6_finish_output+0x2d7/0x330 net/ipv6/ip6_output.c:127 ip6_finish_output+0x41/0x160 net/ipv6/ip6_output.c:152 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip6_output+0xf2/0x280 net/ipv6/ip6_output.c:175 dst_output include/net/dst.h:436 [inline] ip6_local_out+0x74/0x90 net/ipv6/output_core.c:179 ip6_send_skb+0x53/0x110 net/ipv6/ip6_output.c:1795 udp_v6_send_skb.isra.0+0x3ec/0xa70 net/ipv6/udp.c:1173 udpv6_sendmsg+0x1906/0x1c20 net/ipv6/udp.c:1471 inet6_sendmsg+0x6d/0x90 net/ipv6/af_inet6.c:576 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 ___sys_sendmsg+0x2b7/0x5d0 net/socket.c:2311 __sys_sendmmsg+0x123/0x350 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x64/0x80 net/socket.c:2439 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 21817 Comm: syz-executor.2 Not tainted 5.4.0-rc6+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: d518d2ed8640 ("net/sched: fix race between deactivation and dequeue for NOLOCK qdisc") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Paolo Abeni Cc: Davide Caratti Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index bb15800c8cb5..1c799d486623 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3607,7 +3607,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, qdisc_calculate_pkt_len(skb, q); if (q->flags & TCQ_F_NOLOCK) { - if ((q->flags & TCQ_F_CAN_BYPASS) && q->empty && + if ((q->flags & TCQ_F_CAN_BYPASS) && READ_ONCE(q->empty) && qdisc_run_begin(q)) { if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { -- cgit v1.2.3 From 8aef998df3979faa19626acf889abecb733342db Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 15 Nov 2019 12:11:35 +0300 Subject: net: core: allow fast GRO for skbs with Ethernet header in head Commit 78d3fd0b7de8 ("gro: Only use skb_gro_header for completely non-linear packets") back in May'09 (v2.6.31-rc1) has changed the original condition '!skb_headlen(skb)' to 'skb->mac_header == skb->tail' in gro_reset_offset() saying: "Since the drivers that need this optimisation all provide completely non-linear packets" (note that this condition has become the current 'skb_mac_header(skb) == skb_tail_pointer(skb)' later with commit ced14f6804a9 ("net: Correct comparisons and calculations using skb->tail and skb-transport_header") without any functional changes). For now, we have the following rough statistics for v5.4-rc7: 1) napi_gro_frags: 14 2) napi_gro_receive with skb->head containing (most of) payload: 83 3) napi_gro_receive with skb->head containing all the headers: 20 4) napi_gro_receive with skb->head containing only Ethernet header: 2 With the current condition, fast GRO with the usage of NAPI_GRO_CB(skb)->frag0 is available only in the [1] case. Packets pushed by [2] and [3] go through the 'slow' path, but it's not a problem for them as they already contain all the needed headers in skb->head, so pskb_may_pull() only moves skb->data. 
The layout of skbs in the fourth [4] case at the moment of dev_gro_receive() is identical to skbs that have come through [1], as napi_frags_skb() pulls Ethernet header to skb->head. The only difference is that the mentioned condition is always false for them, because skb_put() and friends irreversibly alter the tail pointer. They also go through the 'slow' path, but now every single pskb_may_pull() in every single .gro_receive() will call the *really* slow __pskb_pull_tail() to pull headers to head. This significantly decreases the overall performance for no visible reason. The only two users of method [4] are: * drivers/staging/qlge * drivers/net/wireless/iwlwifi (all three variants: dvm, mvm, mvm-mq) Note that in the case of wireless drivers we can't use [1] (napi_gro_frags()) at least for now and the mac80211 stack always performs pushes and pulls anyways, so the performance hit is unavoidable. At the moment of v2.6.31 the mentioned change was necessary (that's why I don't add the "Fixes:" tag), but it became obsolete since skb_gro_mac_header() has gone in commit a50e233c50db ("net-gro: restore frag0 optimization"), so we can simply revert the condition in gro_reset_offset() to allow skbs from [4] to go through the 'fast' path just like in case [1]. This was tested on a 600 MHz MIPS CPU and a custom driver and this patch gave boosts up to 40 Mbps to method [4] in both directions compared to net-next, which made overall performance relatively close to [1] (without it, [4] is the slowest). v2: - Add more references and explanations to commit message - Fix some typos ibid - No functional changes Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 1c799d486623..da78a433c10c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5611,8 +5611,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; - if (skb_mac_header(skb) == skb_tail_pointer(skb) && - pinfo->nr_frags && + if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, -- cgit v1.2.3 From fc5141cb6a60afd81cf53cf4f9bd986f1b846010 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 22 Nov 2019 20:38:01 +0800 Subject: net: gro: use vlan API instead of accessing directly Use vlan common api to access the vlan_tag info. Signed-off-by: Tonghao Zhang Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index da78a433c10c..c7fc902ccbdc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5586,7 +5586,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) - diffs |= p->vlan_tci ^ skb->vlan_tci; + diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) -- cgit v1.2.3