From 3891845e1ef6e6807075d4241966b26f6ecb0a5c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 27 Oct 2008 17:51:47 -0700 Subject: netns: Coexist with the sysfs limitations v2 To make testing of the network namespace simpler allow the network namespace code and the sysfs code to be compiled and run at the same time. To do this only virtual devices are allowed in the additional network namespaces and those virtual devices are not placed in the kobject tree. Since virtual devices don't actually do anything interesting hardware wise that needs device management there should be no loss in keeping them out of the kobject tree and by implication sysfs. The gain in ease of testing and code coverage should be significant. Changelog: v2: As pointed out by Benjamin Thery it only makes sense to call device_rename in the initial network namespace for now. Signed-off-by: Eric W. Biederman Acked-by: Benjamin Thery Tested-by: Serge Hallyn Acked-by: Serge Hallyn Acked-by: Daniel Lezcano Signed-off-by: David S. Miller --- net/core/dev.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d9038e328cc1..3a2b8be9e67b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -924,10 +924,15 @@ int dev_change_name(struct net_device *dev, const char *newname) strlcpy(dev->name, newname, IFNAMSIZ); rollback: - ret = device_rename(&dev->dev, dev->name); - if (ret) { - memcpy(dev->name, oldname, IFNAMSIZ); - return ret; + /* For now only devices in the initial network namespace + * are in sysfs. + */ + if (net == &init_net) { + ret = device_rename(&dev->dev, dev->name); + if (ret) { + memcpy(dev->name, oldname, IFNAMSIZ); + return ret; + } } write_lock_bh(&dev_base_lock); @@ -4460,6 +4465,15 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char if (dev->features & NETIF_F_NETNS_LOCAL) goto out; +#ifdef CONFIG_SYSFS + /* Don't allow real devices to be moved when sysfs + * is enabled. + */ + err = -EINVAL; + if (dev->dev.parent) + goto out; +#endif + /* Ensure the device has been registrered */ err = -EINVAL; if (dev->reg_state != NETREG_REGISTERED) @@ -4517,6 +4531,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char */ dev_addr_discard(dev); + netdev_unregister_kobject(dev); + /* Actually switch the network namespace */ dev_net_set(dev, net); @@ -4533,7 +4549,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char } /* Fixup kobjects */ - netdev_unregister_kobject(dev); err = netdev_register_kobject(dev); WARN_ON(err); -- cgit v1.2.3 From 24f8b2385e03a4f4c8dac513d03b5eaa475822b9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 3 Nov 2008 17:14:38 -0800 Subject: net: increase receive packet quantum This patch gets about 1.25% back on tbench regression. My change to NAPI for multiqueue support changed the time limit on network receive processing. Under sustained loads like tbench, this can cause the receiver to reschedule prematurely. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 3a2b8be9e67b..8f9d3b38a44b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2373,7 +2373,7 @@ EXPORT_SYMBOL(__napi_schedule); static void net_rx_action(struct softirq_action *h) { struct list_head *list = &__get_cpu_var(softnet_data).poll_list; - unsigned long start_time = jiffies; + unsigned long time_limit = jiffies + 2; int budget = netdev_budget; void *have; @@ -2384,13 +2384,10 @@ static void net_rx_action(struct softirq_action *h) int work, weight; /* If softirq window is exhuasted then punt. - * - * Note that this is a slight policy change from the - * previous NAPI code, which would allow up to 2 - * jiffies to pass before breaking out. The test - * used to be "jiffies - start_time > 1". + * Allow this to run for 2 jiffies since which will allow + * an average latency of 1.5/HZ. */ - if (unlikely(budget <= 0 || jiffies != start_time)) + if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) goto softnet_break; local_irq_enable(); -- cgit v1.2.3 From d0c082cea6dfb9b674b4f6e1e84025662dbd24e8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 5 Nov 2008 15:59:38 -0800 Subject: netns: Delete virtual interfaces during namespace cleanup When physical devices are inside of network namespace and that network namespace terminates we can not make them go away. We have to keep them and moving them to the initial network namespace is the best we can do. For virtual devices left in a network namespace that is exiting we have no need to preserve them and we now have the infrastructure that allows us to delete them. So delete virtual devices when we exit a network namespace. Keeping the necessary user space clean up after a network namespace exits much more tractable. Acked-by: Daniel Lezcano Acked-by: Pavel Emelyanov Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 8f9d3b38a44b..9475f3e624a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4852,6 +4852,12 @@ static void __net_exit default_device_exit(struct net *net) if (dev->features & NETIF_F_NETNS_LOCAL) continue; + /* Delete virtual devices */ + if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { + dev->rtnl_link_ops->dellink(dev); + continue; + } + /* Push remaing network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); err = dev_change_net_namespace(dev, &init_net, fb_name); -- cgit v1.2.3 From ae33bc40c0d96d02f51a996482ea7e41c5152695 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 5 Nov 2008 16:00:02 -0800 Subject: net: Guaranetee the proper ordering of the loopback device. I was recently hunting a bug that occurred in network namespace cleanup. In looking at the code it became apparrent that we have and will continue to have cases where if we have anything going on in a network namespace there will be assumptions that the loopback device is present. Things like sending igmp unsubscribe messages when we bring down network devices invokes the routing code which assumes that at least the loopback driver is present. Therefore to avoid magic initcall ordering hackery that is hard to follow and hard to get right insert a call to register the loopback device directly from net_dev_init(). This guarantes that the loopback device is the first device registered and the last network device to go away. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 9475f3e624a8..811507c39805 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4904,6 +4904,18 @@ static int __init net_dev_init(void) if (register_pernet_subsys(&netdev_net_ops)) goto out; + /* The loopback device is special if any other network devices + * is present in a network namespace the loopback device must + * be present. Since we now dynamically allocate and free the + * loopback device ensure this invariant is maintained by + * keeping the loopback device as the first device on the + * list of network devices. Ensuring the loopback devices + * is the first device that appears and the last network device + * that disappears. + */ + if (register_pernet_device(&loopback_net_ops)) + goto out; + if (register_pernet_device(&default_device_ops)) goto out; -- cgit v1.2.3 From 0a36b345ab99d6b3c96999e7e3b79bd243cf9bf7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 5 Nov 2008 16:00:24 -0800 Subject: net: Don't leak packets when a netns is going down I have been tracking for a while a case where when the network namespace exits the cleanup gets stck in an endless precessess of: unregister_netdevice: waiting for lo to become free. Usage count = 3 unregister_netdevice: waiting for lo to become free. Usage count = 3 unregister_netdevice: waiting for lo to become free. Usage count = 3 unregister_netdevice: waiting for lo to become free. Usage count = 3 unregister_netdevice: waiting for lo to become free. Usage count = 3 unregister_netdevice: waiting for lo to become free. Usage count = 3 unregister_netdevice: waiting for lo to become free. Usage count = 3 It turns out that if you listen on a multicast address an unsubscribe packet is sent when the network device goes down. If you shutdown the network namespace without carefully cleaning up this can trigger the unsubscribe packet to be sent over the loopback interface while the network namespace is going down. All of which is fine except when we drop the packet and forget to free it leaking the skb and the dst entry attached to. As it turns out the dst entry hold a reference to the idev which holds the dev and keeps everything from being cleaned up. Yuck! By fixing my earlier thinko and add the needed kfree_skb and everything cleans up beautifully. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 811507c39805..a0c60607f1a7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2253,8 +2253,10 @@ int netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); /* Don't receive packets in an exiting network namespace */ - if (!net_alive(dev_net(skb->dev))) + if (!net_alive(dev_net(skb->dev))) { + kfree_skb(skb); goto out; + } #ifdef CONFIG_NET_CLS_ACT if (skb->tc_verd & TC_NCLS) { -- cgit v1.2.3 From 3d8160b1493bcadca74fbb635d79b3928b8999cf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 7 Nov 2008 22:52:14 -0800 Subject: Revert "net: Guaranetee the proper ordering of the loopback device." This reverts commit ae33bc40c0d96d02f51a996482ea7e41c5152695. --- net/core/dev.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e0dc67a789b7..2306d56fbb5e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4909,18 +4909,6 @@ static int __init net_dev_init(void) if (register_pernet_subsys(&netdev_net_ops)) goto out; - /* The loopback device is special if any other network devices - * is present in a network namespace the loopback device must - * be present. Since we now dynamically allocate and free the - * loopback device ensure this invariant is maintained by - * keeping the loopback device as the first device on the - * list of network devices. Ensuring the loopback devices - * is the first device that appears and the last network device - * that disappears. - */ - if (register_pernet_device(&loopback_net_ops)) - goto out; - if (register_pernet_device(&default_device_ops)) goto out; -- cgit v1.2.3 From 505d4f73dda9e20d59da05008f1f5eb432613e71 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 7 Nov 2008 22:54:20 -0800 Subject: net: Guaranetee the proper ordering of the loopback device. v2 I was recently hunting a bug that occurred in network namespace cleanup. In looking at the code it became apparrent that we have and will continue to have cases where if we have anything going on in a network namespace there will be assumptions that the loopback device is present. Things like sending igmp unsubscribe messages when we bring down network devices invokes the routing code which assumes that at least the loopback driver is present. Therefore to avoid magic initcall ordering hackery that is hard to follow and hard to get right insert a call to register the loopback device directly from net_dev_init(). This guarantes that the loopback device is the first device registered and the last network device to go away. But do it carefully so we register the loopback device after we clear dev_boot_phase. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- net/core/dev.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 2306d56fbb5e..31568b2068ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4909,9 +4909,6 @@ static int __init net_dev_init(void) if (register_pernet_subsys(&netdev_net_ops)) goto out; - if (register_pernet_device(&default_device_ops)) - goto out; - /* * Initialise the packet receive queues. */ @@ -4928,10 +4925,25 @@ static int __init net_dev_init(void) queue->backlog.weight = weight_p; } - netdev_dma_register(); - dev_boot_phase = 0; + /* The loopback device is special if any other network devices + * is present in a network namespace the loopback device must + * be present. Since we now dynamically allocate and free the + * loopback device ensure this invariant is maintained by + * keeping the loopback device as the first device on the + * list of network devices. Ensuring the loopback devices + * is the first device that appears and the last network device + * that disappears. + */ + if (register_pernet_device(&loopback_net_ops)) + goto out; + + if (register_pernet_device(&default_device_ops)) + goto out; + + netdev_dma_register(); + open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); -- cgit v1.2.3 From 8192b0c482d7078fcdcb4854341b977426f6f09b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Nov 2008 10:39:10 +1100 Subject: CRED: Wrap task credential accesses in the networking subsystem Wrap access to task credentials so that they can be separated more easily from the task_struct during the introduction of COW creds. Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id(). Change some task->e?[ug]id to task_e?[ug]id(). In some places it makes more sense to use RCU directly rather than a convenient wrapper; these will be addressed by later patches. Signed-off-by: David Howells Reviewed-by: James Morris Acked-by: Serge Hallyn Cc: netdev@vger.kernel.org Signed-off-by: James Morris --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d9038e328cc1..262df226b3c9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2958,6 +2958,8 @@ static void dev_change_rx_flags(struct net_device *dev, int flags) static int __dev_set_promiscuity(struct net_device *dev, int inc) { unsigned short old_flags = dev->flags; + uid_t uid; + gid_t gid; ASSERT_RTNL(); @@ -2982,15 +2984,17 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc) printk(KERN_INFO "device %s %s promiscuous mode\n", dev->name, (dev->flags & IFF_PROMISC) ? "entered" : "left"); - if (audit_enabled) + if (audit_enabled) { + current_uid_gid(&uid, &gid); audit_log(current->audit_context, GFP_ATOMIC, AUDIT_ANOM_PROMISCUOUS, "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", dev->name, (dev->flags & IFF_PROMISC), (old_flags & IFF_PROMISC), audit_get_loginuid(current), - current->uid, current->gid, + uid, gid, audit_get_sessionid(current)); + } dev_change_rx_flags(dev, IFF_PROMISC); } -- cgit v1.2.3 From 908cd2dabbfbbefb02f6b908a1188a62e685136a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 16 Nov 2008 19:50:35 -0800 Subject: net: use %pF for /proc/net/ptype Technically, patch changes format for modules, but I think nobody cares. -86dd :ipv6:ipv6_rcv+0x0 +86dd ipv6_rcv+0x0/0x400 [ipv6] Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/core/dev.c | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 31568b2068ac..e08c0fcd603b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -108,7 +108,6 @@ #include #include #include -#include #include #include #include @@ -2801,31 +2800,6 @@ static void ptype_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } -static void ptype_seq_decode(struct seq_file *seq, void *sym) -{ -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0, symsize; - const char *symname; - char *modname; - char namebuf[128]; - - symname = kallsyms_lookup((unsigned long)sym, &symsize, &offset, - &modname, namebuf); - - if (symname) { - char *delim = ":"; - - if (!modname) - modname = delim = ""; - seq_printf(seq, "%s%s%s%s+0x%lx", delim, modname, delim, - symname, offset); - return; - } -#endif - - seq_printf(seq, "[%p]", sym); -} - static int ptype_seq_show(struct seq_file *seq, void *v) { struct packet_type *pt = v; @@ -2838,10 +2812,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v) else seq_printf(seq, "%04x", ntohs(pt->type)); - seq_printf(seq, " %-8s ", - pt->dev ? pt->dev->name : ""); - ptype_seq_decode(seq, pt->func); - seq_putc(seq, '\n'); + seq_printf(seq, " %-8s %pF\n", + pt->dev ? pt->dev->name : "", pt->func); } return 0; -- cgit v1.2.3 From d314774cf2cd5dfeb39a00d37deee65d4c627927 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 19 Nov 2008 21:32:24 -0800 Subject: netdev: network device operations infrastructure This patch changes the network device internal API to move adminstrative operations out of the network device structure and into a separate structure. This patch involves some hackery to maintain compatablity between the new and old model, so all 300+ drivers don't have to be changed at once. For drivers that aren't converted yet, the netdevice_ops virt function list still resides in the net_device structure. For old protocols, the new net_device_ops are copied out to the old net_device pointers. After the transistion is completed the nag message can be changed to an WARN_ON, and the compatiablity code can be made configurable. Some function pointers aren't moved: * destructor can't be in net_device_ops because it may need to be referenced after the module is unloaded. * neighbor setup is manipulated in a couple of places that need special consideration * hard_start_xmit is in the fast path for transmit. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 109 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 77 insertions(+), 32 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e08c0fcd603b..ca14ab407b33 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1059,6 +1059,7 @@ void dev_load(struct net *net, const char *name) */ int dev_open(struct net_device *dev) { + const struct net_device_ops *ops = dev->netdev_ops; int ret = 0; ASSERT_RTNL(); @@ -1081,11 +1082,11 @@ int dev_open(struct net_device *dev) */ set_bit(__LINK_STATE_START, &dev->state); - if (dev->validate_addr) - ret = dev->validate_addr(dev); + if (ops->ndo_validate_addr) + ret = ops->ndo_validate_addr(dev); - if (!ret && dev->open) - ret = dev->open(dev); + if (!ret && ops->ndo_open) + ret = ops->ndo_open(dev); /* * If it went open OK then: @@ -1129,6 +1130,7 @@ int dev_open(struct net_device *dev) */ int dev_close(struct net_device *dev) { + const struct net_device_ops *ops = dev->netdev_ops; ASSERT_RTNL(); might_sleep(); @@ -1161,8 +1163,8 @@ int dev_close(struct net_device *dev) * We allow it to be called even after a DETACH hot-plug * event. */ - if (dev->stop) - dev->stop(dev); + if (ops->ndo_stop) + ops->ndo_stop(dev); /* * Device is now down. @@ -2930,8 +2932,10 @@ int netdev_set_master(struct net_device *slave, struct net_device *master) static void dev_change_rx_flags(struct net_device *dev, int flags) { - if (dev->flags & IFF_UP && dev->change_rx_flags) - dev->change_rx_flags(dev, flags); + const struct net_device_ops *ops = dev->netdev_ops; + + if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) + ops->ndo_change_rx_flags(dev, flags); } static int __dev_set_promiscuity(struct net_device *dev, int inc) @@ -3051,6 +3055,8 @@ int dev_set_allmulti(struct net_device *dev, int inc) */ void __dev_set_rx_mode(struct net_device *dev) { + const struct net_device_ops *ops = dev->netdev_ops; + /* dev_open will call this function so the list will stay sane. */ if (!(dev->flags&IFF_UP)) return; @@ -3058,8 +3064,8 @@ void __dev_set_rx_mode(struct net_device *dev) if (!netif_device_present(dev)) return; - if (dev->set_rx_mode) - dev->set_rx_mode(dev); + if (ops->ndo_set_rx_mode) + ops->ndo_set_rx_mode(dev); else { /* Unicast addresses changes may only happen under the rtnl, * therefore calling __dev_set_promiscuity here is safe. @@ -3072,8 +3078,8 @@ void __dev_set_rx_mode(struct net_device *dev) dev->uc_promisc = 0; } - if (dev->set_multicast_list) - dev->set_multicast_list(dev); + if (ops->ndo_set_multicast_list) + ops->ndo_set_multicast_list(dev); } } @@ -3432,6 +3438,7 @@ int dev_change_flags(struct net_device *dev, unsigned flags) */ int dev_set_mtu(struct net_device *dev, int new_mtu) { + const struct net_device_ops *ops = dev->netdev_ops; int err; if (new_mtu == dev->mtu) @@ -3445,10 +3452,11 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) return -ENODEV; err = 0; - if (dev->change_mtu) - err = dev->change_mtu(dev, new_mtu); + if (ops->ndo_change_mtu) + err = ops->ndo_change_mtu(dev, new_mtu); else dev->mtu = new_mtu; + if (!err && dev->flags & IFF_UP) call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); return err; @@ -3463,15 +3471,16 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) */ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) { + const struct net_device_ops *ops = dev->netdev_ops; int err; - if (!dev->set_mac_address) + if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; if (sa->sa_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = dev->set_mac_address(dev, sa); + err = ops->ndo_set_mac_address(dev, sa); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; @@ -3551,6 +3560,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); + const struct net_device_ops *ops = dev->netdev_ops; if (!dev) return -ENODEV; @@ -3578,15 +3588,15 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return 0; case SIOCSIFMAP: - if (dev->set_config) { + if (ops->ndo_set_config) { if (!netif_device_present(dev)) return -ENODEV; - return dev->set_config(dev, &ifr->ifr_map); + return ops->ndo_set_config(dev, &ifr->ifr_map); } return -EOPNOTSUPP; case SIOCADDMULTI: - if ((!dev->set_multicast_list && !dev->set_rx_mode) || + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) @@ -3595,7 +3605,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) dev->addr_len, 1); case SIOCDELMULTI: - if ((!dev->set_multicast_list && !dev->set_rx_mode) || + if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || ifr->ifr_hwaddr.sa_family != AF_UNSPEC) return -EINVAL; if (!netif_device_present(dev)) @@ -3633,10 +3643,9 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) cmd == SIOCBRDELIF || cmd == SIOCWANDEV) { err = -EOPNOTSUPP; - if (dev->do_ioctl) { + if (ops->ndo_do_ioctl) { if (netif_device_present(dev)) - err = dev->do_ioctl(dev, ifr, - cmd); + err = ops->ndo_do_ioctl(dev, ifr, cmd); else err = -ENODEV; } @@ -3897,8 +3906,8 @@ static void rollback_registered(struct net_device *dev) */ dev_addr_discard(dev); - if (dev->uninit) - dev->uninit(dev); + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); /* Notifier chain MUST detach us from master device. */ WARN_ON(dev->master); @@ -3988,7 +3997,7 @@ int register_netdevice(struct net_device *dev) struct hlist_head *head; struct hlist_node *p; int ret; - struct net *net; + struct net *net = dev_net(dev); BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -3997,8 +4006,7 @@ int register_netdevice(struct net_device *dev) /* When net_device's are persistent, this will be fatal. */ BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); - BUG_ON(!dev_net(dev)); - net = dev_net(dev); + BUG_ON(!net); spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); @@ -4006,9 +4014,46 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; +#ifdef CONFIG_COMPAT_NET_DEV_OPS + /* Netdevice_ops API compatiability support. + * This is temporary until all network devices are converted. + */ + if (dev->netdev_ops) { + const struct net_device_ops *ops = dev->netdev_ops; + + dev->init = ops->ndo_init; + dev->uninit = ops->ndo_uninit; + dev->open = ops->ndo_open; + dev->change_rx_flags = ops->ndo_change_rx_flags; + dev->set_rx_mode = ops->ndo_set_rx_mode; + dev->set_multicast_list = ops->ndo_set_multicast_list; + dev->set_mac_address = ops->ndo_set_mac_address; + dev->validate_addr = ops->ndo_validate_addr; + dev->do_ioctl = ops->ndo_do_ioctl; + dev->set_config = ops->ndo_set_config; + dev->change_mtu = ops->ndo_change_mtu; + dev->tx_timeout = ops->ndo_tx_timeout; + dev->get_stats = ops->ndo_get_stats; + dev->vlan_rx_register = ops->ndo_vlan_rx_register; + dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; +#ifdef CONFIG_NET_POLL_CONTROLLER + dev->poll_controller = ops->ndo_poll_controller; +#endif + } else { + char drivername[64]; + pr_info("%s (%s): not using net_device_ops yet\n", + dev->name, netdev_drivername(dev, drivername, 64)); + + /* This works only because net_device_ops and the + compatiablity structure are the same. */ + dev->netdev_ops = (void *) &(dev->init); + } +#endif + /* Init, if this function is available */ - if (dev->init) { - ret = dev->init(dev); + if (dev->netdev_ops->ndo_init) { + ret = dev->netdev_ops->ndo_init(dev); if (ret) { if (ret > 0) ret = -EIO; @@ -4086,8 +4131,8 @@ out: return ret; err_uninit: - if (dev->uninit) - dev->uninit(dev); + if (dev->netdev_ops->ndo_uninit) + dev->netdev_ops->ndo_uninit(dev); goto out; } -- cgit v1.2.3 From eeda3fd64f75bcbfaa70ce946513abaf3f23b8e0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 19 Nov 2008 21:40:23 -0800 Subject: netdev: introduce dev_get_stats() In order for the network device ops get_stats call to be immutable, the handling of the default internal network device stats block has to be changed. Add a new helper function which replaces the old use of internal_get_stats. Note: change return code to make it clear that the caller should not go changing the returned statistics. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index ca14ab407b33..8843f4e3f5e1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2620,7 +2620,7 @@ void dev_seq_stop(struct seq_file *seq, void *v) static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { - struct net_device_stats *stats = dev->get_stats(dev); + const struct net_device_stats *stats = dev_get_stats(dev); seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", @@ -4288,10 +4288,24 @@ void netdev_run_todo(void) } } -static struct net_device_stats *internal_stats(struct net_device *dev) -{ - return &dev->stats; +/** + * dev_get_stats - get network device statistics + * @dev: device to get statistics from + * + * Get network statistics from device. The device driver may provide + * its own method by setting dev->netdev_ops->get_stats; otherwise + * the internal statistics structure is used. + */ +const struct net_device_stats *dev_get_stats(struct net_device *dev) + { + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_get_stats) + return ops->ndo_get_stats(dev); + else + return &dev->stats; } +EXPORT_SYMBOL(dev_get_stats); static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, @@ -4370,7 +4384,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); - dev->get_stats = internal_stats; netpoll_netdev_init(dev); setup(dev); strcpy(dev->name, name); -- cgit v1.2.3 From 008298231abbeb91bc7be9e8b078607b816d1a4a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 20 Nov 2008 20:14:53 -0800 Subject: netdev: add more functions to netdevice ops This patch moves neigh_setup and hard_start_xmit into the network device ops structure. For bisection, fix all the previously converted drivers as well. Bonding driver took the biggest hit on this. Added a prefetch of the hard_start_xmit in the fast path to try and reduce any impact this would have. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 8843f4e3f5e1..4615e9a443aa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1660,6 +1660,9 @@ static int dev_gso_segment(struct sk_buff *skb) int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq) { + const struct net_device_ops *ops = dev->netdev_ops; + + prefetch(&dev->netdev_ops->ndo_start_xmit); if (likely(!skb->next)) { if (!list_empty(&ptype_all)) dev_queue_xmit_nit(skb, dev); @@ -1671,7 +1674,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } - return dev->hard_start_xmit(skb, dev); + return ops->ndo_start_xmit(skb, dev); } gso: @@ -1681,7 +1684,7 @@ gso: skb->next = nskb->next; nskb->next = NULL; - rc = dev->hard_start_xmit(nskb, dev); + rc = ops->ndo_start_xmit(nskb, dev); if (unlikely(rc)) { nskb->next = skb->next; skb->next = nskb; @@ -1755,10 +1758,11 @@ static u16 simple_tx_hash(struct net_device *dev, struct sk_buff *skb) static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { + const struct net_device_ops *ops = dev->netdev_ops; u16 queue_index = 0; - if (dev->select_queue) - queue_index = dev->select_queue(dev, skb); + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb); else if (dev->real_num_tx_queues > 1) queue_index = simple_tx_hash(dev, skb); -- cgit v1.2.3 From b74ca3a896b9ab5f952bc440154758e708c48884 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Mon, 8 Dec 2008 01:14:16 -0800 Subject: netdevice: Kill netdev->priv This is the last shoot of this series. After I removing all directly reference of netdev->priv, I am killing "priv" of "struct net_device" and fixing relative comments/docs. Anyone will not be allowed to reference netdev->priv directly. If you want to reference the memory of private data, use netdev_priv() instead. If the private data is not allocted when alloc_netdev(), use netdev->ml_priv to point that memory after you creating that private data. Signed-off-by: Wang Chen Signed-off-by: David S. Miller --- net/core/dev.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 4615e9a443aa..f54cac76438a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4378,12 +4378,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->num_tx_queues = queue_count; dev->real_num_tx_queues = queue_count; - if (sizeof_priv) { - dev->priv = ((char *)dev + - ((sizeof(struct net_device) + NETDEV_ALIGN_CONST) - & ~NETDEV_ALIGN_CONST)); - } - dev->gso_max_size = GSO_MAX_SIZE; netdev_init_queues(dev); -- cgit v1.2.3 From 1a881f27c50b4fbd6858a8696a189263621136b0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:27:47 -0800 Subject: net: Add frag_list support to GSO This patch allows GSO to handle frag_list in a limited way for the purposes of allowing packets merged by GRO to be refragmented on output. Most hardware won't (and aren't expected to) support handling GRO frag_list packets directly. Therefore we will perform GSO in software for those cases. However, for drivers that can support it (such as virtual NICs) we may not have to segment the packets at all. Whether the added overhead of GRO/GSO is worthwhile for bridges and routers when weighed against the benefit of potentially increasing the MTU within the host is still an open question. However, for the case of host nodes this is undoubtedly a win. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index f54cac76438a..e415f0b0d0d0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1533,8 +1533,6 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) __be16 type = skb->protocol; int err; - BUG_ON(skb_shinfo(skb)->frag_list); - skb_reset_mac_header(skb); skb->mac_len = skb->network_header - skb->mac_header; __skb_pull(skb, skb->mac_len); -- cgit v1.2.3 From d565b0a1a9b6ee7dff46e1f68b26b526ac11ae50 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:38:52 -0800 Subject: net: Add Generic Receive Offload infrastructure This patch adds the top-level GRO (Generic Receive Offload) infrastructure. This is pretty similar to LRO except that this is protocol-independent. Instead of holding packets in an lro_mgr structure, they're now held in napi_struct. For drivers that intend to use this, they can set the NETIF_F_GRO bit and call napi_gro_receive instead of netif_receive_skb or just call netif_rx. The latter will call napi_receive_skb automatically. When napi_gro_receive is used, the driver must either call napi_complete/napi_rx_complete, or call napi_gro_flush in softirq context if the driver uses the primitives __napi_complete/__napi_rx_complete. Protocols will set the gro_receive and gro_complete function pointers in order to participate in this scheme. In addition to the packet, gro_receive will get a list of currently held packets. Each packet in the list has a same_flow field which is non-zero if it is a potential match for the new packet. For each packet that may match, they also have a flush field which is non-zero if the held packet must not be merged with the new packet. Once gro_receive has determined that the new skb matches a held packet, the held packet may be processed immediately if the new skb cannot be merged with it. In this case gro_receive should return the pointer to the existing skb in gro_list. Otherwise the new skb should be merged into the existing packet and NULL should be returned, unless the new skb makes it impossible for any further merges to be made (e.g., FIN packet) where the merged skb should be returned. Whenever the skb is merged into an existing entry, the gro_receive function should set NAPI_GRO_CB(skb)->same_flow. Note that if an skb merely matches an existing entry but can't be merged with it, then this shouldn't be set. If gro_receive finds it pointless to hold the new skb for future merging, it should set NAPI_GRO_CB(skb)->flush. Held packets will be flushed by napi_gro_flush which is called by napi_complete and napi_rx_complete. Currently held packets are stored in a singly liked list just like LRO. The list is limited to a maximum of 8 entries. In future, this may be expanded to use a hash table to allow more flows to be held for merging. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 191 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index e415f0b0d0d0..d8d7d1fccde4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,9 @@ #include "net-sysfs.h" +/* Instead of increasing this, you should create a hash table. */ +#define MAX_GRO_SKBS 8 + /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -2335,6 +2338,122 @@ static void flush_backlog(void *arg) } } +static int napi_gro_complete(struct sk_buff *skb) +{ + struct packet_type *ptype; + __be16 type = skb->protocol; + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + int err = -ENOENT; + + if (!skb_shinfo(skb)->frag_list) + goto out; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + if (ptype->type != type || ptype->dev || !ptype->gro_complete) + continue; + + err = ptype->gro_complete(skb); + break; + } + rcu_read_unlock(); + + if (err) { + WARN_ON(&ptype->list == head); + kfree_skb(skb); + return NET_RX_SUCCESS; + } + +out: + __skb_push(skb, -skb_network_offset(skb)); + return netif_receive_skb(skb); +} + +void napi_gro_flush(struct napi_struct *napi) +{ + struct sk_buff *skb, *next; + + for (skb = napi->gro_list; skb; skb = next) { + next = skb->next; + skb->next = NULL; + napi_gro_complete(skb); + } + + napi->gro_list = NULL; +} +EXPORT_SYMBOL(napi_gro_flush); + +int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct packet_type *ptype; + __be16 type = skb->protocol; + struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; + int count = 0; + int mac_len; + + if (!(skb->dev->features & NETIF_F_GRO)) + goto normal; + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, head, list) { + struct sk_buff *p; + + if (ptype->type != type || ptype->dev || !ptype->gro_receive) + continue; + + skb_reset_network_header(skb); + mac_len = skb->network_header - skb->mac_header; + skb->mac_len = mac_len; + NAPI_GRO_CB(skb)->same_flow = 0; + NAPI_GRO_CB(skb)->flush = 0; + + for (p = napi->gro_list; p; p = p->next) { + count++; + NAPI_GRO_CB(p)->same_flow = + p->mac_len == mac_len && + !memcmp(skb_mac_header(p), skb_mac_header(skb), + mac_len); + NAPI_GRO_CB(p)->flush = 0; + } + + pp = ptype->gro_receive(&napi->gro_list, skb); + break; + } + rcu_read_unlock(); + + if (&ptype->list == head) + goto normal; + + if (pp) { + struct sk_buff *nskb = *pp; + + *pp = nskb->next; + nskb->next = NULL; + napi_gro_complete(nskb); + count--; + } + + if (NAPI_GRO_CB(skb)->same_flow) + goto ok; + + if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) { + __skb_push(skb, -skb_network_offset(skb)); + goto normal; + } + + NAPI_GRO_CB(skb)->count = 1; + skb->next = napi->gro_list; + napi->gro_list = skb; + +ok: + return NET_RX_SUCCESS; + +normal: + return netif_receive_skb(skb); +} +EXPORT_SYMBOL(napi_gro_receive); + static int process_backlog(struct napi_struct *napi, int quota) { int work = 0; @@ -2354,9 +2473,11 @@ static int process_backlog(struct napi_struct *napi, int quota) } local_irq_enable(); - netif_receive_skb(skb); + napi_gro_receive(napi, skb); } while (++work < quota && jiffies == start_time); + napi_gro_flush(napi); + return work; } @@ -2377,6 +2498,68 @@ void __napi_schedule(struct napi_struct *n) } EXPORT_SYMBOL(__napi_schedule); +void __napi_complete(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + BUG_ON(n->gro_list); + + list_del(&n->poll_list); + smp_mb__before_clear_bit(); + clear_bit(NAPI_STATE_SCHED, &n->state); +} +EXPORT_SYMBOL(__napi_complete); + +void napi_complete(struct napi_struct *n) +{ + unsigned long flags; + + /* + * don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu + */ + if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) + return; + + napi_gro_flush(n); + local_irq_save(flags); + __napi_complete(n); + local_irq_restore(flags); +} +EXPORT_SYMBOL(napi_complete); + +void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight) +{ + INIT_LIST_HEAD(&napi->poll_list); + napi->gro_list = NULL; + napi->poll = poll; + napi->weight = weight; + list_add(&napi->dev_list, &dev->napi_list); +#ifdef CONFIG_NETPOLL + napi->dev = dev; + spin_lock_init(&napi->poll_lock); + napi->poll_owner = -1; +#endif + set_bit(NAPI_STATE_SCHED, &napi->state); +} +EXPORT_SYMBOL(netif_napi_add); + +void netif_napi_del(struct napi_struct *napi) +{ + struct sk_buff *skb, *next; + + list_del(&napi->dev_list); + + for (skb = napi->gro_list; skb; skb = next) { + next = skb->next; + skb->next = NULL; + kfree_skb(skb); + } + + napi->gro_list = NULL; +} +EXPORT_SYMBOL(netif_napi_del); + static void net_rx_action(struct softirq_action *h) { @@ -4380,7 +4563,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); - netpoll_netdev_init(dev); + INIT_LIST_HEAD(&dev->napi_list); setup(dev); strcpy(dev->name, name); return dev; @@ -4397,10 +4580,15 @@ EXPORT_SYMBOL(alloc_netdev_mq); */ void free_netdev(struct net_device *dev) { + struct napi_struct *p, *n; + release_net(dev_net(dev)); kfree(dev->_tx); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + netif_napi_del(p); + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); @@ -4949,6 +5137,7 @@ static int __init net_dev_init(void) queue->backlog.poll = process_backlog; queue->backlog.weight = weight_p; + queue->backlog.gro_list = NULL; } dev_boot_phase = 0; -- cgit v1.2.3 From 2d91d78b68606ff7ce52ea70e187dee7831aa2f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 17 Dec 2008 15:47:29 -0800 Subject: Phonet: allocate a non-Ethernet ARP type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also leave some room for more 802.11 types. Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index d8d7d1fccde4..15aab0c46d6d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -283,8 +283,8 @@ static const unsigned short netdev_lock_type[] = ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, - ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_VOID, - ARPHRD_NONE}; + ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, + ARPHRD_VOID, ARPHRD_NONE}; static const char *netdev_lock_name[] = {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", @@ -300,8 +300,8 @@ static const char *netdev_lock_name[] = "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", - "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_VOID", - "_xmit_NONE"}; + "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", + "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; -- cgit v1.2.3 From 57c81fffc863fb4c1804bc963bcbfb82d736c6df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 17 Dec 2008 15:47:48 -0800 Subject: Phonet: allocate separate ARP type for GPRS over a Phonet pipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A separate xmit lock class supports GPRS over a Phonet pipe over a TUN device (type ARPHRD_NONE). Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 15aab0c46d6d..048cf1197872 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -284,7 +284,7 @@ static const unsigned short netdev_lock_type[] = ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, - ARPHRD_VOID, ARPHRD_NONE}; + ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE}; static const char *netdev_lock_name[] = {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", @@ -301,7 +301,7 @@ static const char *netdev_lock_name[] = "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", - "_xmit_VOID", "_xmit_NONE"}; + "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; -- cgit v1.2.3 From 5f2f6da76c429c42d54f73807f00b8fd761a7d68 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Mon, 22 Dec 2008 19:35:28 -0800 Subject: net: Fix oops in dev_ifsioc() A command like this: "brctl addif br1 eth1" issued as a user gave me an oops when bridge module wasn't loaded. It's caused by using a dev pointer before checking for NULL. Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 048cf1197872..daca72e6b37b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3745,11 +3745,13 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - const struct net_device_ops *ops = dev->netdev_ops; + const struct net_device_ops *ops; if (!dev) return -ENODEV; + ops = dev->netdev_ops; + switch (cmd) { case SIOCSIFFLAGS: /* Set interface flags */ return dev_change_flags(dev, ifr->ifr_flags); -- cgit v1.2.3 From d7b06636be162d3f74c9ce5d6d0d9ea4e5d362c8 Mon Sep 17 00:00:00 2001 From: Peter P Waskiewicz Jr Date: Fri, 26 Dec 2008 01:35:35 -0800 Subject: net: Init NAPI dev_list on napi_del The recent GRO patches introduced the NAPI removal of devices in free_netdev. For drivers that can change the number of queues during driver operation, the NAPI infrastructure doesn't allow the freeing and re-addition of NAPI entities without reloading the driver. This change reinitializes the dev_list in each NAPI struct on delete, instead of just deleting it (and assigning the list pointers to POISON). Drivers that wish to remove/re-add NAPI will need to re-initialize the netdev napi_list after removing all NAPI instances, before re-adding NAPI devices again. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index daca72e6b37b..536a8ac189c8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2548,7 +2548,7 @@ void netif_napi_del(struct napi_struct *napi) { struct sk_buff *skb, *next; - list_del(&napi->dev_list); + list_del_init(&napi->dev_list); for (skb = napi->gro_list; skb; skb = next) { next = skb->next; -- cgit v1.2.3 From 0da2afd59653d2edf5c8e0f09b23f367ab5bc80f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 26 Dec 2008 14:57:42 -0800 Subject: gro: Fix potential use after free The initial skb may have been freed after napi_gro_complete in napi_gro_receive if it was merged into an existing packet. Thus we cannot check same_flow (which indicates whether it was merged) after calling napi_gro_complete. This patch fixes this by saving the same_flow status before the call to napi_gro_complete. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core/dev.c') diff --git a/net/core/dev.c b/net/core/dev.c index 536a8ac189c8..303e984ee6a6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2390,6 +2390,7 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) __be16 type = skb->protocol; struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; int count = 0; + int same_flow; int mac_len; if (!(skb->dev->features & NETIF_F_GRO)) @@ -2425,6 +2426,8 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (&ptype->list == head) goto normal; + same_flow = NAPI_GRO_CB(skb)->same_flow; + if (pp) { struct sk_buff *nskb = *pp; @@ -2434,7 +2437,7 @@ int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) count--; } - if (NAPI_GRO_CB(skb)->same_flow) + if (same_flow) goto ok; if (NAPI_GRO_CB(skb)->flush || count >= MAX_GRO_SKBS) { -- cgit v1.2.3