From 6cf3f41e6c08bca6641a695449791c38a25f35ff Mon Sep 17 00:00:00 2001 From: Jay Vosburgh Date: Mon, 3 Nov 2008 18:16:50 -0800 Subject: bonding, net: Move last_rx update into bonding recv logic The only user of the net_device->last_rx field is bonding. This patch adds a conditional update of last_rx to the bonding special logic in skb_bond_should_drop, causing last_rx to only be updated when the ARP monitor is running. This frees network device drivers from the necessity of updating last_rx, which can have cache line thrash issues. Signed-off-by: Jay Vosburgh Signed-off-by: David S. Miller --- include/linux/netdevice.h | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9d77b1d7dca8..f1b0dbe58464 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1742,22 +1742,26 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) struct net_device *dev = skb->dev; struct net_device *master = dev->master; - if (master && - (dev->priv_flags & IFF_SLAVE_INACTIVE)) { - if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && - skb->protocol == __constant_htons(ETH_P_ARP)) - return 0; - - if (master->priv_flags & IFF_MASTER_ALB) { - if (skb->pkt_type != PACKET_BROADCAST && - skb->pkt_type != PACKET_MULTICAST) + if (master) { + if (master->priv_flags & IFF_MASTER_ARPMON) + dev->last_rx = jiffies; + + if (dev->priv_flags & IFF_SLAVE_INACTIVE) { + if ((dev->priv_flags & IFF_SLAVE_NEEDARP) && + skb->protocol == __constant_htons(ETH_P_ARP)) return 0; - } - if (master->priv_flags & IFF_MASTER_8023AD && - skb->protocol == __constant_htons(ETH_P_SLOW)) - return 0; - return 1; + if (master->priv_flags & IFF_MASTER_ALB) { + if (skb->pkt_type != PACKET_BROADCAST && + skb->pkt_type != PACKET_MULTICAST) + return 0; + } + if (master->priv_flags & IFF_MASTER_8023AD && + skb->protocol == __constant_htons(ETH_P_SLOW)) + return 0; + + return 1; + } } return 0; } -- cgit v1.2.3 From ae33bc40c0d96d02f51a996482ea7e41c5152695 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 5 Nov 2008 16:00:02 -0800 Subject: net: Guaranetee the proper ordering of the loopback device. I was recently hunting a bug that occurred in network namespace cleanup. In looking at the code it became apparrent that we have and will continue to have cases where if we have anything going on in a network namespace there will be assumptions that the loopback device is present. Things like sending igmp unsubscribe messages when we bring down network devices invokes the routing code which assumes that at least the loopback driver is present. Therefore to avoid magic initcall ordering hackery that is hard to follow and hard to get right insert a call to register the loopback device directly from net_dev_init(). This guarantes that the loopback device is the first device registered and the last network device to go away. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f1b0dbe58464..12d7f4469dc9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1766,6 +1766,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) return 0; } +extern struct pernet_operations __net_initdata loopback_net_ops; #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ -- cgit v1.2.3 From 3d8160b1493bcadca74fbb635d79b3928b8999cf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 7 Nov 2008 22:52:14 -0800 Subject: Revert "net: Guaranetee the proper ordering of the loopback device." This reverts commit ae33bc40c0d96d02f51a996482ea7e41c5152695. --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 12d7f4469dc9..f1b0dbe58464 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1766,7 +1766,6 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) return 0; } -extern struct pernet_operations __net_initdata loopback_net_ops; #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ -- cgit v1.2.3 From 505d4f73dda9e20d59da05008f1f5eb432613e71 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 7 Nov 2008 22:54:20 -0800 Subject: net: Guaranetee the proper ordering of the loopback device. v2 I was recently hunting a bug that occurred in network namespace cleanup. In looking at the code it became apparrent that we have and will continue to have cases where if we have anything going on in a network namespace there will be assumptions that the loopback device is present. Things like sending igmp unsubscribe messages when we bring down network devices invokes the routing code which assumes that at least the loopback driver is present. Therefore to avoid magic initcall ordering hackery that is hard to follow and hard to get right insert a call to register the loopback device directly from net_dev_init(). This guarantes that the loopback device is the first device registered and the last network device to go away. But do it carefully so we register the loopback device after we clear dev_boot_phase. Signed-off-by: Eric W. Biederman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f1b0dbe58464..12d7f4469dc9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1766,6 +1766,7 @@ static inline int skb_bond_should_drop(struct sk_buff *skb) return 0; } +extern struct pernet_operations __net_initdata loopback_net_ops; #endif /* __KERNEL__ */ #endif /* _LINUX_DEV_H */ -- cgit v1.2.3 From d314774cf2cd5dfeb39a00d37deee65d4c627927 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 19 Nov 2008 21:32:24 -0800 Subject: netdev: network device operations infrastructure This patch changes the network device internal API to move adminstrative operations out of the network device structure and into a separate structure. This patch involves some hackery to maintain compatablity between the new and old model, so all 300+ drivers don't have to be changed at once. For drivers that aren't converted yet, the netdevice_ops virt function list still resides in the net_device structure. For old protocols, the new net_device_ops are copied out to the old net_device pointers. After the transistion is completed the nag message can be changed to an WARN_ON, and the compatiablity code can be made configurable. Some function pointers aren't moved: * destructor can't be in net_device_ops because it may need to be referenced after the module is unloaded. * neighbor setup is manipulated in a couple of places that need special consideration * hard_start_xmit is in the fast path for transmit. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 232 +++++++++++++++++++++++++++++++++------------- 1 file changed, 168 insertions(+), 64 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 12d7f4469dc9..9060f5f3517a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -451,6 +451,131 @@ struct netdev_queue { struct Qdisc *qdisc_sleeping; } ____cacheline_aligned_in_smp; + +/* + * This structure defines the management hooks for network devices. + * The following hooks can bed defined and are optonal (can be null) + * unless otherwise noted. + * + * int (*ndo_init)(struct net_device *dev); + * This function is called once when network device is registered. + * The network device can use this to any late stage initializaton + * or semantic validattion. It can fail with an error code which will + * be propogated back to register_netdev + * + * void (*ndo_uninit)(struct net_device *dev); + * This function is called when device is unregistered or when registration + * fails. It is not called if init fails. + * + * int (*ndo_open)(struct net_device *dev); + * This function is called when network device transistions to the up + * state. + * + * int (*ndo_stop)(struct net_device *dev); + * This function is called when network device transistions to the down + * state. + * + * void (*ndo_change_rx_flags)(struct net_device *dev, int flags); + * This function is called to allow device receiver to make + * changes to configuration when multicast or promiscious is enabled. + * + * void (*ndo_set_rx_mode)(struct net_device *dev); + * This function is called device changes address list filtering. + * + * void (*ndo_set_multicast_list)(struct net_device *dev); + * This function is called when the multicast address list changes. + * + * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); + * This function is called when the Media Access Control address + * needs to be changed. If not this interface is not defined, the + * mac address can not be changed. + * + * int (*ndo_validate_addr)(struct net_device *dev); + * Test if Media Access Control address is valid for the device. + * + * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); + * Called when a user request an ioctl which can't be handled by + * the generic interface code. If not defined ioctl's return + * not supported error code. + * + * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); + * Used to set network devices bus interface parameters. This interface + * is retained for legacy reason, new devices should use the bus + * interface (PCI) for low level management. + * + * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); + * Called when a user wants to change the Maximum Transfer Unit + * of a device. If not defined, any request to change MTU will + * will return an error. + * + * void (*ndo_tx_timeout) (struct net_device *dev); + * Callback uses when the transmitter has not made any progress + * for dev->watchdog ticks. + * + * struct net_device_stats* (*get_stats)(struct net_device *dev); + * Called when a user wants to get the network device usage + * statistics. If not defined, the counters in dev->stats will + * be used. + * + * void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp); + * If device support VLAN receive accleration + * (ie. dev->features & NETIF_F_HW_VLAN_RX), then this function is called + * when vlan groups for the device changes. Note: grp is NULL + * if no vlan's groups are being used. + * + * void (*ndo_vlan_rx_add_vid)(struct net_device *dev, unsigned short vid); + * If device support VLAN filtering (dev->features & NETIF_F_HW_VLAN_FILTER) + * this function is called when a VLAN id is registered. + * + * void (*ndo_vlan_rx_kill_vid)(struct net_device *dev, unsigned short vid); + * If device support VLAN filtering (dev->features & NETIF_F_HW_VLAN_FILTER) + * this function is called when a VLAN id is unregistered. + * + * void (*ndo_poll_controller)(struct net_device *dev); + */ +struct net_device_ops { + int (*ndo_init)(struct net_device *dev); + void (*ndo_uninit)(struct net_device *dev); + int (*ndo_open)(struct net_device *dev); + int (*ndo_stop)(struct net_device *dev); +#define HAVE_CHANGE_RX_FLAGS + void (*ndo_change_rx_flags)(struct net_device *dev, + int flags); +#define HAVE_SET_RX_MODE + void (*ndo_set_rx_mode)(struct net_device *dev); +#define HAVE_MULTICAST + void (*ndo_set_multicast_list)(struct net_device *dev); +#define HAVE_SET_MAC_ADDR + int (*ndo_set_mac_address)(struct net_device *dev, + void *addr); +#define HAVE_VALIDATE_ADDR + int (*ndo_validate_addr)(struct net_device *dev); +#define HAVE_PRIVATE_IOCTL + int (*ndo_do_ioctl)(struct net_device *dev, + struct ifreq *ifr, int cmd); +#define HAVE_SET_CONFIG + int (*ndo_set_config)(struct net_device *dev, + struct ifmap *map); +#define HAVE_CHANGE_MTU + int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); + +#define HAVE_TX_TIMEOUT + void (*ndo_tx_timeout) (struct net_device *dev); + + struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); + + void (*ndo_vlan_rx_register)(struct net_device *dev, + struct vlan_group *grp); + void (*ndo_vlan_rx_add_vid)(struct net_device *dev, + unsigned short vid); + void (*ndo_vlan_rx_kill_vid)(struct net_device *dev, + unsigned short vid); +#ifdef CONFIG_NET_POLL_CONTROLLER +#define HAVE_NETDEV_POLL + void (*ndo_poll_controller)(struct net_device *dev); +#endif +}; + /* * The DEVICE structure. * Actually, this whole structure is a big mistake. It mixes I/O @@ -498,11 +623,6 @@ struct net_device #ifdef CONFIG_NETPOLL struct list_head napi_list; #endif - - /* The device initialization function. Called only once. */ - int (*init)(struct net_device *dev); - - /* ------- Fields preinitialized in Space.c finish here ------- */ /* Net device features */ unsigned long features; @@ -546,15 +666,13 @@ struct net_device * for all in netdev_increment_features. */ #define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ - NETIF_F_SG | NETIF_F_HIGHDMA | \ + NETIF_F_SG | NETIF_F_HIGHDMA | \ NETIF_F_FRAGLIST) /* Interface index. Unique device identifier */ int ifindex; int iflink; - - struct net_device_stats* (*get_stats)(struct net_device *dev); struct net_device_stats stats; #ifdef CONFIG_WIRELESS_EXT @@ -564,18 +682,13 @@ struct net_device /* Instance data managed by the core of Wireless Extensions. */ struct iw_public_data * wireless_data; #endif + /* Management operations */ + const struct net_device_ops *netdev_ops; const struct ethtool_ops *ethtool_ops; /* Hardware header description */ const struct header_ops *header_ops; - /* - * This marks the end of the "visible" part of the structure. All - * fields hereafter are internal to the system, and may change at - * will (read: may be cleaned up at will). - */ - - unsigned int flags; /* interface flags (a la BSD) */ unsigned short gflags; unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */ @@ -634,7 +747,7 @@ struct net_device unsigned long last_rx; /* Time of last Rx */ /* Interface address info used in eth_type_trans() */ unsigned char dev_addr[MAX_ADDR_LEN]; /* hw address, (before bcast - because most packets are unicast) */ + because most packets are unicast) */ unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ @@ -648,6 +761,10 @@ struct net_device /* Number of TX queues currently active in device */ unsigned int real_num_tx_queues; + /* Map buffer to appropriate transmit queue */ + u16 (*select_queue)(struct net_device *dev, + struct sk_buff *skb); + unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; /* @@ -662,9 +779,6 @@ struct net_device int watchdog_timeo; /* used by dev_watchdog() */ struct timer_list watchdog_timer; -/* - * refcnt is a very hot point, so align it on SMP - */ /* Number of references to this device */ atomic_t refcnt ____cacheline_aligned_in_smp; @@ -683,56 +797,14 @@ struct net_device NETREG_RELEASED, /* called free_netdev */ } reg_state; - /* Called after device is detached from network. */ - void (*uninit)(struct net_device *dev); - /* Called after last user reference disappears. */ - void (*destructor)(struct net_device *dev); + /* Called from unregister, can be used to call free_netdev */ + void (*destructor)(struct net_device *dev); - /* Pointers to interface service routines. */ - int (*open)(struct net_device *dev); - int (*stop)(struct net_device *dev); -#define HAVE_NETDEV_POLL -#define HAVE_CHANGE_RX_FLAGS - void (*change_rx_flags)(struct net_device *dev, - int flags); -#define HAVE_SET_RX_MODE - void (*set_rx_mode)(struct net_device *dev); -#define HAVE_MULTICAST - void (*set_multicast_list)(struct net_device *dev); -#define HAVE_SET_MAC_ADDR - int (*set_mac_address)(struct net_device *dev, - void *addr); -#define HAVE_VALIDATE_ADDR - int (*validate_addr)(struct net_device *dev); -#define HAVE_PRIVATE_IOCTL - int (*do_ioctl)(struct net_device *dev, - struct ifreq *ifr, int cmd); -#define HAVE_SET_CONFIG - int (*set_config)(struct net_device *dev, - struct ifmap *map); -#define HAVE_CHANGE_MTU - int (*change_mtu)(struct net_device *dev, int new_mtu); + int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); -#define HAVE_TX_TIMEOUT - void (*tx_timeout) (struct net_device *dev); - - void (*vlan_rx_register)(struct net_device *dev, - struct vlan_group *grp); - void (*vlan_rx_add_vid)(struct net_device *dev, - unsigned short vid); - void (*vlan_rx_kill_vid)(struct net_device *dev, - unsigned short vid); - - int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); #ifdef CONFIG_NETPOLL struct netpoll_info *npinfo; #endif -#ifdef CONFIG_NET_POLL_CONTROLLER - void (*poll_controller)(struct net_device *dev); -#endif - - u16 (*select_queue)(struct net_device *dev, - struct sk_buff *skb); #ifdef CONFIG_NET_NS /* Network namespace this network device is inside */ @@ -763,6 +835,38 @@ struct net_device /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; + +#ifdef CONFIG_COMPAT_NET_DEV_OPS + struct { + int (*init)(struct net_device *dev); + void (*uninit)(struct net_device *dev); + int (*open)(struct net_device *dev); + int (*stop)(struct net_device *dev); + void (*change_rx_flags)(struct net_device *dev, + int flags); + void (*set_rx_mode)(struct net_device *dev); + void (*set_multicast_list)(struct net_device *dev); + int (*set_mac_address)(struct net_device *dev, + void *addr); + int (*validate_addr)(struct net_device *dev); + int (*do_ioctl)(struct net_device *dev, + struct ifreq *ifr, int cmd); + int (*set_config)(struct net_device *dev, + struct ifmap *map); + int (*change_mtu)(struct net_device *dev, int new_mtu); + void (*tx_timeout) (struct net_device *dev); + struct net_device_stats* (*get_stats)(struct net_device *dev); + void (*vlan_rx_register)(struct net_device *dev, + struct vlan_group *grp); + void (*vlan_rx_add_vid)(struct net_device *dev, + unsigned short vid); + void (*vlan_rx_kill_vid)(struct net_device *dev, + unsigned short vid); +#ifdef CONFIG_NET_POLL_CONTROLLER + void (*poll_controller)(struct net_device *dev); +#endif +#endif + }; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From eeda3fd64f75bcbfaa70ce946513abaf3f23b8e0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 19 Nov 2008 21:40:23 -0800 Subject: netdev: introduce dev_get_stats() In order for the network device ops get_stats call to be immutable, the handling of the default internal network device stats block has to be changed. Add a new helper function which replaces the old use of internal_get_stats. Note: change return code to make it clear that the caller should not go changing the returned statistics. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9060f5f3517a..981a089d5149 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -864,9 +864,9 @@ struct net_device unsigned short vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*poll_controller)(struct net_device *dev); -#endif #endif }; +#endif }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -1780,6 +1780,8 @@ extern void netdev_features_change(struct net_device *dev); /* Load a device via the kmod */ extern void dev_load(struct net *net, const char *name); extern void dev_mcast_init(void); +extern const struct net_device_stats *dev_get_stats(struct net_device *dev); + extern int netdev_max_backlog; extern int weight_p; extern int netdev_set_master(struct net_device *dev, struct net_device *master); -- cgit v1.2.3 From 008298231abbeb91bc7be9e8b078607b816d1a4a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 20 Nov 2008 20:14:53 -0800 Subject: netdev: add more functions to netdevice ops This patch moves neigh_setup and hard_start_xmit into the network device ops structure. For bisection, fix all the previously converted drivers as well. Bonding driver took the biggest hit on this. Added a prefetch of the hard_start_xmit in the fast path to try and reduce any impact this would have. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 981a089d5149..d8fb23679ee3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -454,8 +454,8 @@ struct netdev_queue { /* * This structure defines the management hooks for network devices. - * The following hooks can bed defined and are optonal (can be null) - * unless otherwise noted. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. * * int (*ndo_init)(struct net_device *dev); * This function is called once when network device is registered. @@ -475,6 +475,15 @@ struct netdev_queue { * This function is called when network device transistions to the down * state. * + * int (*ndo_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev); + * Called when a packet needs to be transmitted. + * Must return NETDEV_TX_OK , NETDEV_TX_BUSY, or NETDEV_TX_LOCKED, + * Required can not be NULL. + * + * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb); + * Called to decide which queue to when device supports multiple + * transmit queues. + * * void (*ndo_change_rx_flags)(struct net_device *dev, int flags); * This function is called to allow device receiver to make * changes to configuration when multicast or promiscious is enabled. @@ -508,7 +517,7 @@ struct netdev_queue { * of a device. If not defined, any request to change MTU will * will return an error. * - * void (*ndo_tx_timeout) (struct net_device *dev); + * void (*ndo_tx_timeout)(struct net_device *dev); * Callback uses when the transmitter has not made any progress * for dev->watchdog ticks. * @@ -538,6 +547,10 @@ struct net_device_ops { void (*ndo_uninit)(struct net_device *dev); int (*ndo_open)(struct net_device *dev); int (*ndo_stop)(struct net_device *dev); + int (*ndo_start_xmit) (struct sk_buff *skb, + struct net_device *dev); + u16 (*ndo_select_queue)(struct net_device *dev, + struct sk_buff *skb); #define HAVE_CHANGE_RX_FLAGS void (*ndo_change_rx_flags)(struct net_device *dev, int flags); @@ -557,8 +570,10 @@ struct net_device_ops { int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); #define HAVE_CHANGE_MTU - int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); - + int (*ndo_change_mtu)(struct net_device *dev, + int new_mtu); + int (*ndo_neigh_setup)(struct net_device *dev, + struct neigh_parms *); #define HAVE_TX_TIMEOUT void (*ndo_tx_timeout) (struct net_device *dev); @@ -761,18 +776,12 @@ struct net_device /* Number of TX queues currently active in device */ unsigned int real_num_tx_queues; - /* Map buffer to appropriate transmit queue */ - u16 (*select_queue)(struct net_device *dev, - struct sk_buff *skb); - unsigned long tx_queue_len; /* Max frames per queue allowed */ spinlock_t tx_global_lock; /* * One part is mostly used on xmit path (device) */ void *priv; /* pointer to private data */ - int (*hard_start_xmit) (struct sk_buff *skb, - struct net_device *dev); /* These may be needed for future network-power-down code. */ unsigned long trans_start; /* Time (in jiffies) of last Tx */ @@ -800,8 +809,6 @@ struct net_device /* Called from unregister, can be used to call free_netdev */ void (*destructor)(struct net_device *dev); - int (*neigh_setup)(struct net_device *dev, struct neigh_parms *); - #ifdef CONFIG_NETPOLL struct netpoll_info *npinfo; #endif @@ -842,6 +849,10 @@ struct net_device void (*uninit)(struct net_device *dev); int (*open)(struct net_device *dev); int (*stop)(struct net_device *dev); + int (*hard_start_xmit) (struct sk_buff *skb, + struct net_device *dev); + u16 (*select_queue)(struct net_device *dev, + struct sk_buff *skb); void (*change_rx_flags)(struct net_device *dev, int flags); void (*set_rx_mode)(struct net_device *dev); @@ -854,6 +865,8 @@ struct net_device int (*set_config)(struct net_device *dev, struct ifmap *map); int (*change_mtu)(struct net_device *dev, int new_mtu); + int (*neigh_setup)(struct net_device *dev, + struct neigh_parms *); void (*tx_timeout) (struct net_device *dev); struct net_device_stats* (*get_stats)(struct net_device *dev); void (*vlan_rx_register)(struct net_device *dev, -- cgit v1.2.3 From 2f90b8657ec942d1880f720e0177ee71df7c8e3c Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 20 Nov 2008 20:52:10 -0800 Subject: ixgbe: this patch adds support for DCB to the kernel and ixgbe driver This adds support for Data Center Bridging (DCB) features in the ixgbe driver and adds an rtnetlink interface for configuring DCB to the kernel. The DCB feature support included are Priority Grouping (PG) - which allows bandwidth guarantees to be allocated to groups to traffic based on the 802.1q priority, and Priority Based Flow Control (PFC) - which introduces a new MAC control PAUSE frame which works at granularity of the 802.1p priority instead of the link (IEEE 802.3x). Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8fb23679ee3..6095af572dfd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -43,6 +43,9 @@ #include #include +#ifdef CONFIG_DCBNL +#include +#endif struct vlan_group; struct ethtool_ops; @@ -843,6 +846,11 @@ struct net_device #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; +#ifdef CONFIG_DCBNL + /* Data Center Bridging netlink ops */ + struct dcbnl_rtnl_ops *dcbnl_ops; +#endif + #ifdef CONFIG_COMPAT_NET_DEV_OPS struct { int (*init)(struct net_device *dev); -- cgit v1.2.3 From 47fd5b8373ecc6bf5473e4139b62b06425448252 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 25 Nov 2008 00:20:43 -0800 Subject: netdev: add HAVE_NET_DEVICE_OPS As a concession to vendors who have to deal with one source for different kernel versions, add a HAVE_NET_DEVICE_OPS so they don't end up hard coding ifdef against kernel version. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6095af572dfd..76a89f8e6a19 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -545,6 +545,7 @@ struct netdev_queue { * * void (*ndo_poll_controller)(struct net_device *dev); */ +#define HAVE_NET_DEVICE_OPS struct net_device_ops { int (*ndo_init)(struct net_device *dev); void (*ndo_uninit)(struct net_device *dev); -- cgit v1.2.3 From 7a6b6f515f77d1c62a2f383b6dce18cb0af0cf4f Mon Sep 17 00:00:00 2001 From: Jeff Kirsher Date: Tue, 25 Nov 2008 01:02:08 -0800 Subject: DCB: fix kconfig option Since the netlink option for DCB is necessary to actually be useful, simplified the Kconfig option. In addition, added useful help text for the Kconfig option. Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 76a89f8e6a19..0df0db068ac3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -43,7 +43,7 @@ #include #include -#ifdef CONFIG_DCBNL +#ifdef CONFIG_DCB #include #endif @@ -847,7 +847,7 @@ struct net_device #define GSO_MAX_SIZE 65536 unsigned int gso_max_size; -#ifdef CONFIG_DCBNL +#ifdef CONFIG_DCB /* Data Center Bridging netlink ops */ struct dcbnl_rtnl_ops *dcbnl_ops; #endif -- cgit v1.2.3 From b74ca3a896b9ab5f952bc440154758e708c48884 Mon Sep 17 00:00:00 2001 From: Wang Chen Date: Mon, 8 Dec 2008 01:14:16 -0800 Subject: netdevice: Kill netdev->priv This is the last shoot of this series. After I removing all directly reference of netdev->priv, I am killing "priv" of "struct net_device" and fixing relative comments/docs. Anyone will not be allowed to reference netdev->priv directly. If you want to reference the memory of private data, use netdev_priv() instead. If the private data is not allocted when alloc_netdev(), use netdev->ml_priv to point that memory after you creating that private data. Signed-off-by: Wang Chen Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0df0db068ac3..47e731528315 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -785,7 +785,6 @@ struct net_device /* * One part is mostly used on xmit path (device) */ - void *priv; /* pointer to private data */ /* These may be needed for future network-power-down code. */ unsigned long trans_start; /* Time (in jiffies) of last Tx */ -- cgit v1.2.3 From 7b363e440021a1cf9ed76944b2685f48dacefb3e Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 9 Dec 2008 23:22:26 -0800 Subject: netpoll: fix race on poll_list resulting in garbage entry A few months back a race was discused between the netpoll napi service path, and the fast path through net_rx_action: http://kerneltrap.org/mailarchive/linux-netdev/2007/10/16/345470 A patch was submitted for that bug, but I think we missed a case. Consider the following scenario: INITIAL STATE CPU0 has one napi_struct A on its poll_list CPU1 is calling netpoll_send_skb and needs to call poll_napi on the same napi_struct A that CPU0 has on its list CPU0 CPU1 net_rx_action poll_napi !list_empty (returns true) locks poll_lock for A poll_one_napi napi->poll netif_rx_complete __napi_complete (removes A from poll_list) list_entry(list->next) In the above scenario, net_rx_action assumes that the per-cpu poll_list is exclusive to that cpu. netpoll of course violates that, and because the netpoll path can dequeue from the poll list, its possible for CPU0 to detect a non-empty list at the top of the while loop in net_rx_action, but have it become empty by the time it calls list_entry. Since the poll_list isn't surrounded by any other structure, the returned data from that list_entry call in this situation is garbage, and any number of crashes can result based on what exactly that garbage is. Given that its not fasible for performance reasons to place exclusive locks arround each cpus poll list to provide that mutal exclusion, I think the best solution is modify the netpoll path in such a way that we continue to guarantee that the poll_list for a cpu is in fact exclusive to that cpu. To do this I've implemented the patch below. It adds an additional bit to the state field in the napi_struct. When executing napi->poll from the netpoll_path, this bit will be set. When a driver calls netif_rx_complete, if that bit is set, it will not remove the napi_struct from the poll_list. That work will be saved for the next iteration of net_rx_action. I've tested this and it seems to work well. About the biggest drawback I can see to it is the fact that it might result in an extra loop through net_rx_action in the event that the device is actually contended for (i.e. the netpoll path actually preforms all the needed work no the device, and the call to net_rx_action winds up doing nothing, except removing the napi_struct from the poll_list. However I think this is probably a small price to pay, given that the alternative is a crash. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9d77b1d7dca8..e26f54952892 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -319,6 +319,7 @@ enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_DISABLE, /* Disable pending */ + NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ }; extern void __napi_schedule(struct napi_struct *n); @@ -1497,6 +1498,12 @@ static inline void netif_rx_complete(struct net_device *dev, { unsigned long flags; + /* + * don't let napi dequeue from the cpu poll list + * just in case its running on a different cpu + */ + if (unlikely(test_bit(NAPI_STATE_NPSVC, &napi->state))) + return; local_irq_save(flags); __netif_rx_complete(dev, napi); local_irq_restore(flags); -- cgit v1.2.3 From 1a881f27c50b4fbd6858a8696a189263621136b0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:27:47 -0800 Subject: net: Add frag_list support to GSO This patch allows GSO to handle frag_list in a limited way for the purposes of allowing packets merged by GRO to be refragmented on output. Most hardware won't (and aren't expected to) support handling GRO frag_list packets directly. Therefore we will perform GSO in software for those cases. However, for drivers that can support it (such as virtual NICs) we may not have to segment the packets at all. Whether the added overhead of GRO/GSO is worthwhile for bridges and routers when weighed against the benefit of potentially increasing the MTU within the host is still an open question. However, for the case of host nodes this is undoubtedly a win. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b60c26b7d31c..bdf5465deb91 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1858,6 +1858,8 @@ static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { return skb_is_gso(skb) && (!skb_gso_ok(skb, dev->features) || + (skb_shinfo(skb)->frag_list && + !(dev->features & NETIF_F_FRAGLIST)) || unlikely(skb->ip_summed != CHECKSUM_PARTIAL)); } -- cgit v1.2.3 From d565b0a1a9b6ee7dff46e1f68b26b526ac11ae50 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 15 Dec 2008 23:38:52 -0800 Subject: net: Add Generic Receive Offload infrastructure This patch adds the top-level GRO (Generic Receive Offload) infrastructure. This is pretty similar to LRO except that this is protocol-independent. Instead of holding packets in an lro_mgr structure, they're now held in napi_struct. For drivers that intend to use this, they can set the NETIF_F_GRO bit and call napi_gro_receive instead of netif_receive_skb or just call netif_rx. The latter will call napi_receive_skb automatically. When napi_gro_receive is used, the driver must either call napi_complete/napi_rx_complete, or call napi_gro_flush in softirq context if the driver uses the primitives __napi_complete/__napi_rx_complete. Protocols will set the gro_receive and gro_complete function pointers in order to participate in this scheme. In addition to the packet, gro_receive will get a list of currently held packets. Each packet in the list has a same_flow field which is non-zero if it is a potential match for the new packet. For each packet that may match, they also have a flush field which is non-zero if the held packet must not be merged with the new packet. Once gro_receive has determined that the new skb matches a held packet, the held packet may be processed immediately if the new skb cannot be merged with it. In this case gro_receive should return the pointer to the existing skb in gro_list. Otherwise the new skb should be merged into the existing packet and NULL should be returned, unless the new skb makes it impossible for any further merges to be made (e.g., FIN packet) where the merged skb should be returned. Whenever the skb is merged into an existing entry, the gro_receive function should set NAPI_GRO_CB(skb)->same_flow. Note that if an skb merely matches an existing entry but can't be merged with it, then this shouldn't be set. If gro_receive finds it pointless to hold the new skb for future merging, it should set NAPI_GRO_CB(skb)->flush. Held packets will be flushed by napi_gro_flush which is called by napi_complete and napi_rx_complete. Currently held packets are stored in a singly liked list just like LRO. The list is limited to a maximum of 8 entries. In future, this may be expanded to use a hash table to allow more flows to be held for merging. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/netdevice.h | 80 +++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 52 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index bdf5465deb91..58856b6737fb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -314,8 +314,9 @@ struct napi_struct { spinlock_t poll_lock; int poll_owner; struct net_device *dev; - struct list_head dev_list; #endif + struct list_head dev_list; + struct sk_buff *gro_list; }; enum @@ -376,22 +377,8 @@ static inline int napi_reschedule(struct napi_struct *napi) * * Mark NAPI processing as complete. */ -static inline void __napi_complete(struct napi_struct *n) -{ - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - list_del(&n->poll_list); - smp_mb__before_clear_bit(); - clear_bit(NAPI_STATE_SCHED, &n->state); -} - -static inline void napi_complete(struct napi_struct *n) -{ - unsigned long flags; - - local_irq_save(flags); - __napi_complete(n); - local_irq_restore(flags); -} +extern void __napi_complete(struct napi_struct *n); +extern void napi_complete(struct napi_struct *n); /** * napi_disable - prevent NAPI from scheduling @@ -640,9 +627,7 @@ struct net_device unsigned long state; struct list_head dev_list; -#ifdef CONFIG_NETPOLL struct list_head napi_list; -#endif /* Net device features */ unsigned long features; @@ -661,6 +646,7 @@ struct net_device #define NETIF_F_LLTX 4096 /* LockLess TX - deprecated. Please */ /* do not use LLTX in new drivers */ #define NETIF_F_NETNS_LOCAL 8192 /* Does not change network namespaces */ +#define NETIF_F_GRO 16384 /* Generic receive offload */ #define NETIF_F_LRO 32768 /* large receive offload */ /* Segmentation offload features */ @@ -984,22 +970,8 @@ static inline void *netdev_priv(const struct net_device *dev) * netif_napi_add() must be used to initialize a napi context prior to calling * *any* of the other napi related functions. */ -static inline void netif_napi_add(struct net_device *dev, - struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), - int weight) -{ - INIT_LIST_HEAD(&napi->poll_list); - napi->poll = poll; - napi->weight = weight; -#ifdef CONFIG_NETPOLL - napi->dev = dev; - list_add(&napi->dev_list, &dev->napi_list); - spin_lock_init(&napi->poll_lock); - napi->poll_owner = -1; -#endif - set_bit(NAPI_STATE_SCHED, &napi->state); -} +void netif_napi_add(struct net_device *dev, struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), int weight); /** * netif_napi_del - remove a napi context @@ -1007,12 +979,20 @@ static inline void netif_napi_add(struct net_device *dev, * * netif_napi_del() removes a napi context from the network device napi list */ -static inline void netif_napi_del(struct napi_struct *napi) -{ -#ifdef CONFIG_NETPOLL - list_del(&napi->dev_list); -#endif -} +void netif_napi_del(struct napi_struct *napi); + +struct napi_gro_cb { + /* This is non-zero if the packet may be of the same flow. */ + int same_flow; + + /* This is non-zero if the packet cannot be merged with the new skb. */ + int flush; + + /* Number of segments aggregated. */ + int count; +}; + +#define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb) struct packet_type { __be16 type; /* This is really htons(ether_type). */ @@ -1024,6 +1004,9 @@ struct packet_type { struct sk_buff *(*gso_segment)(struct sk_buff *skb, int features); int (*gso_send_check)(struct sk_buff *skb); + struct sk_buff **(*gro_receive)(struct sk_buff **head, + struct sk_buff *skb); + int (*gro_complete)(struct sk_buff *skb); void *af_packet_priv; struct list_head list; }; @@ -1377,6 +1360,9 @@ extern int netif_rx(struct sk_buff *skb); extern int netif_rx_ni(struct sk_buff *skb); #define HAVE_NETIF_RECEIVE_SKB 1 extern int netif_receive_skb(struct sk_buff *skb); +extern void napi_gro_flush(struct napi_struct *napi); +extern int napi_gro_receive(struct napi_struct *napi, + struct sk_buff *skb); extern void netif_nit_deliver(struct sk_buff *skb); extern int dev_valid_name(const char *name); extern int dev_ioctl(struct net *net, unsigned int cmd, void __user *); @@ -1621,17 +1607,7 @@ static inline void __netif_rx_complete(struct net_device *dev, static inline void netif_rx_complete(struct net_device *dev, struct napi_struct *napi) { - unsigned long flags; - - /* - * don't let napi dequeue from the cpu poll list - * just in case its running on a different cpu - */ - if (unlikely(test_bit(NAPI_STATE_NPSVC, &napi->state))) - return; - local_irq_save(flags); - __netif_rx_complete(dev, napi); - local_irq_restore(flags); + napi_complete(napi); } static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) -- cgit v1.2.3 From 908a7a16b852ffd618a9127be8d62432182d81b4 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 22 Dec 2008 20:43:12 -0800 Subject: net: Remove unused netdev arg from some NAPI interfaces. When the napi api was changed to separate its 1:1 binding to the net_device struct, the netif_rx_[prep|schedule|complete] api failed to remove the now vestigual net_device structure parameter. This patch cleans up that api by properly removing it.. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux/netdevice.h') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 58856b6737fb..41e1224651cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1555,8 +1555,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) } /* Test if receive needs to be scheduled but only if up */ -static inline int netif_rx_schedule_prep(struct net_device *dev, - struct napi_struct *napi) +static inline int netif_rx_schedule_prep(struct napi_struct *napi) { return napi_schedule_prep(napi); } @@ -1564,27 +1563,24 @@ static inline int netif_rx_schedule_prep(struct net_device *dev, /* Add interface to tail of rx poll list. This assumes that _prep has * already been called and returned 1. */ -static inline void __netif_rx_schedule(struct net_device *dev, - struct napi_struct *napi) +static inline void __netif_rx_schedule(struct napi_struct *napi) { __napi_schedule(napi); } /* Try to reschedule poll. Called by irq handler. */ -static inline void netif_rx_schedule(struct net_device *dev, - struct napi_struct *napi) +static inline void netif_rx_schedule(struct napi_struct *napi) { - if (netif_rx_schedule_prep(dev, napi)) - __netif_rx_schedule(dev, napi); + if (netif_rx_schedule_prep(napi)) + __netif_rx_schedule(napi); } /* Try to reschedule poll. Called by dev->poll() after netif_rx_complete(). */ -static inline int netif_rx_reschedule(struct net_device *dev, - struct napi_struct *napi) +static inline int netif_rx_reschedule(struct napi_struct *napi) { if (napi_schedule_prep(napi)) { - __netif_rx_schedule(dev, napi); + __netif_rx_schedule(napi); return 1; } return 0; @@ -1593,8 +1589,7 @@ static inline int netif_rx_reschedule(struct net_device *dev, /* same as netif_rx_complete, except that local_irq_save(flags) * has already been issued */ -static inline void __netif_rx_complete(struct net_device *dev, - struct napi_struct *napi) +static inline void __netif_rx_complete(struct napi_struct *napi) { __napi_complete(napi); } @@ -1604,8 +1599,7 @@ static inline void __netif_rx_complete(struct net_device *dev, * it completes the work. The device cannot be out of poll list at this * moment, it is BUG(). */ -static inline void netif_rx_complete(struct net_device *dev, - struct napi_struct *napi) +static inline void netif_rx_complete(struct napi_struct *napi) { napi_complete(napi); } -- cgit v1.2.3