From d594e987c6f5417cc63dd7e107a2a03a7eeee03f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Jun 2012 03:50:35 +0000 Subject: sock_diag: add SK_MEMINFO_BACKLOG Adding socket backlog len in INET_DIAG_SKMEMINFO is really useful to diagnose various TCP problems. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock_diag.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 5fd146720f39..0d934ce1075f 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -46,6 +46,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; return 0; -- cgit v1.2.3 From 35b2a113cb0298d4f9a1263338b456094a414057 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 16 May 2012 23:40:18 +0200 Subject: wireless: remove wext sysfs The only user of this was hal prior to its 0.5.12 release which happened over two years ago, so I'm sure this can be removed without issues. Signed-off-by: Johannes Berg Signed-off-by: John W. 
Linville --- net/core/net-sysfs.c | 74 ---------------------------------------------------- 1 file changed, 74 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fdf9e61d0651..72607174ea5a 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -417,72 +417,6 @@ static struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, }; - -#ifdef CONFIG_WIRELESS_EXT_SYSFS -/* helper function that does all the locking etc for wireless stats */ -static ssize_t wireless_show(struct device *d, char *buf, - ssize_t (*format)(const struct iw_statistics *, - char *)) -{ - struct net_device *dev = to_net_dev(d); - const struct iw_statistics *iw; - ssize_t ret = -EINVAL; - - if (!rtnl_trylock()) - return restart_syscall(); - if (dev_isalive(dev)) { - iw = get_wireless_stats(dev); - if (iw) - ret = (*format)(iw, buf); - } - rtnl_unlock(); - - return ret; -} - -/* show function template for wireless fields */ -#define WIRELESS_SHOW(name, field, format_string) \ -static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \ -{ \ - return sprintf(buf, format_string, iw->field); \ -} \ -static ssize_t show_iw_##name(struct device *d, \ - struct device_attribute *attr, char *buf) \ -{ \ - return wireless_show(d, buf, format_iw_##name); \ -} \ -static DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL) - -WIRELESS_SHOW(status, status, fmt_hex); -WIRELESS_SHOW(link, qual.qual, fmt_dec); -WIRELESS_SHOW(level, qual.level, fmt_dec); -WIRELESS_SHOW(noise, qual.noise, fmt_dec); -WIRELESS_SHOW(nwid, discard.nwid, fmt_dec); -WIRELESS_SHOW(crypt, discard.code, fmt_dec); -WIRELESS_SHOW(fragment, discard.fragment, fmt_dec); -WIRELESS_SHOW(misc, discard.misc, fmt_dec); -WIRELESS_SHOW(retries, discard.retries, fmt_dec); -WIRELESS_SHOW(beacon, miss.beacon, fmt_dec); - -static struct attribute *wireless_attrs[] = { - &dev_attr_status.attr, - &dev_attr_link.attr, - &dev_attr_level.attr, - 
&dev_attr_noise.attr, - &dev_attr_nwid.attr, - &dev_attr_crypt.attr, - &dev_attr_fragment.attr, - &dev_attr_retries.attr, - &dev_attr_misc.attr, - &dev_attr_beacon.attr, - NULL -}; - -static struct attribute_group wireless_group = { - .name = "wireless", - .attrs = wireless_attrs, -}; -#endif #endif /* CONFIG_SYSFS */ #ifdef CONFIG_RPS @@ -1463,14 +1397,6 @@ int netdev_register_kobject(struct net_device *net) groups++; *groups++ = &netstat_group; -#ifdef CONFIG_WIRELESS_EXT_SYSFS - if (net->ieee80211_ptr) - *groups++ = &wireless_group; -#ifdef CONFIG_WIRELESS_EXT - else if (net->wireless_handlers) - *groups++ = &wireless_group; -#endif -#endif #endif /* CONFIG_SYSFS */ error = device_add(dev); -- cgit v1.2.3 From 94b6042cfed02229b05e04002ab00085b60f8213 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 6 Jun 2012 15:23:37 +0000 Subject: net: Update kernel-doc for __alloc_skb() __alloc_skb() now extends tailroom to allow the use of padding added by the heap allocator. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 016694d62484..1d74cea22aaa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -160,8 +160,8 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) * @node: numa node to allocate memory on * * Allocate a new &sk_buff. The returned buffer has no headroom and a - * tail room of size bytes. The object has a reference count of one. - * The return is the buffer. On a failure the return is %NULL. + * tail room of at least size bytes. The object has a reference count + * of one. The return is the buffer. On a failure the return is %NULL. * * Buffers may only be allocated from interrupts using a @gfp_mask of * %GFP_ATOMIC. 
-- cgit v1.2.3 From 80f12eccce775dc6bb93dba9b52529740f929237 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 6 Jun 2012 17:13:06 +0000 Subject: Added kernel support in EEE Ethtool commands This patch extends the kernel's ethtool interface by adding support for 2 new EEE commands - get_eee and set_eee. Thanks goes to Giuseppe Cavallaro for his original patch adding this support. Signed-off-by: Yuval Mintz Signed-off-by: Eilon Greenstein Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 9c2afb480270..c73d0a59212c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -729,6 +729,40 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) return dev->ethtool_ops->set_wol(dev, &wol); } +static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_eee edata; + int rc; + + if (!dev->ethtool_ops->get_eee) + return -EOPNOTSUPP; + + memset(&edata, 0, sizeof(struct ethtool_eee)); + edata.cmd = ETHTOOL_GEEE; + rc = dev->ethtool_ops->get_eee(dev, &edata); + + if (rc) + return rc; + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + + return 0; +} + +static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_eee edata; + + if (!dev->ethtool_ops->set_eee) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + return dev->ethtool_ops->set_eee(dev, &edata); +} + static int ethtool_nway_reset(struct net_device *dev) { if (!dev->ethtool_ops->nway_reset) @@ -1471,6 +1505,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_set_value_void(dev, useraddr, dev->ethtool_ops->set_msglevel); break; + case ETHTOOL_GEEE: + rc = ethtool_get_eee(dev, useraddr); + break; + case ETHTOOL_SEEE: + rc = ethtool_set_eee(dev, 
useraddr); + break; case ETHTOOL_NWAY_RST: rc = ethtool_nway_reset(dev); break; -- cgit v1.2.3 From 95603e2293de556de7e82221649bfd7fd98b64a3 Mon Sep 17 00:00:00 2001 From: Michel Machado Date: Tue, 12 Jun 2012 10:16:35 +0000 Subject: net-next: add dev_loopback_xmit() to avoid duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dev_loopback_xmit() in order to deduplicate functions ip_dev_loopback_xmit() (in net/ipv4/ip_output.c) and ip6_dev_loopback_xmit() (in net/ipv6/ip6_output.c). I was about to reinvent the wheel when I noticed that ip_dev_loopback_xmit() and ip6_dev_loopback_xmit() do exactly what I need and are not IP-only functions, but they were not available to reuse elsewhere. ip6_dev_loopback_xmit() does not have line "skb_dst_force(skb);", but I understand that this is harmless, and should be in dev_loopback_xmit(). Signed-off-by: Michel Machado CC: "David S. Miller" CC: Alexey Kuznetsov CC: James Morris CC: Hideaki YOSHIFUJI CC: Patrick McHardy CC: Eric Dumazet CC: Jiri Pirko CC: "Michał Mirosław" CC: Ben Hutchings Signed-off-by: David S. 
Miller --- net/core/dev.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index cd0981977f5c..c6e29ea65bd9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2475,6 +2475,23 @@ static void skb_update_prio(struct sk_buff *skb) static DEFINE_PER_CPU(int, xmit_recursion); #define RECURSION_LIMIT 10 +/** + * dev_loopback_xmit - loop back @skb + * @skb: buffer to transmit + */ +int dev_loopback_xmit(struct sk_buff *skb) +{ + skb_reset_mac_header(skb); + __skb_pull(skb, skb_network_offset(skb)); + skb->pkt_type = PACKET_LOOPBACK; + skb->ip_summed = CHECKSUM_UNNECESSARY; + WARN_ON(!skb_dst(skb)); + skb_dst_force(skb); + netif_rx_ni(skb); + return 0; +} +EXPORT_SYMBOL(dev_loopback_xmit); + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit -- cgit v1.2.3 From 2da45db2bdd432a9dca825099c791f5c851f92b9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 12 Jun 2012 13:05:41 +0000 Subject: ethtool: Make more commands available to unprivileged processes 'Get' commands should generally not require CAP_NET_ADMIN, with the exception of those that expose internal state. Signed-off-by: Ben Hutchings Signed-off-by: David S. 
Miller --- net/core/ethtool.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c73d0a59212c..cbf033dcaf1f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1443,6 +1443,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: + case ETHTOOL_GLINK: case ETHTOOL_GCOALESCE: case ETHTOOL_GRINGPARAM: case ETHTOOL_GPAUSEPARAM: @@ -1451,6 +1452,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GSG: case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: + case ETHTOOL_GSTATS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: case ETHTOOL_GUFO: @@ -1463,8 +1465,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: + case ETHTOOL_GRXFHINDIR: case ETHTOOL_GFEATURES: + case ETHTOOL_GCHANNELS: case ETHTOOL_GET_TS_INFO: + case ETHTOOL_GEEE: break; default: if (!capable(CAP_NET_ADMIN)) -- cgit v1.2.3 From 41063e9dd11956f2d285e12e4342e1d232ba0ea2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 19 Jun 2012 21:22:05 -0700 Subject: ipv4: Early TCP socket demux. Input packet processing for local sockets involves two major demuxes. One for the route and one for the socket. But we can optimize this down to one demux for certain kinds of local sockets. Currently we only do this for established TCP sockets, but it could at least in theory be expanded to other kinds of connections. If a TCP socket is established then its identity is fully specified. This means that whatever input route was used during the three-way handshake must work equally well for the rest of the connection since the keys will not change. Once we move to established state, we cache the receive packet's input route to use later.
Like the existing cached route in sk->sk_dst_cache used for output packets, we have to check for route invalidations using dst->obsolete and dst->ops->check(). Early demux occurs outside of a socket locked section, so when a route invalidation occurs we defer the fixup of sk->sk_rx_dst until we are actually inside of established state packet processing and thus have the socket locked. Signed-off-by: David S. Miller --- net/core/sock.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 9e5b71fda6ec..929bdcc2383b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1465,6 +1465,11 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +void sock_edemux(struct sk_buff *skb) +{ + sock_put(skb->sk); +} +EXPORT_SYMBOL(sock_edemux); int sock_i_uid(struct sock *sk) { -- cgit v1.2.3 From 7b46866dd0a6fe38ecee523eb27eda9c8f484dc5 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 26 Jun 2012 23:36:11 +0000 Subject: sock_diag: Do not use RTA_PUT() macros Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- net/core/sock_diag.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 0d934ce1075f..ff2967acbfae 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include @@ -35,9 +34,7 @@ EXPORT_SYMBOL_GPL(sock_diag_save_cookie); int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) { - __u32 *mem; - - mem = RTA_DATA(__RTA_PUT(skb, attrtype, SK_MEMINFO_VARS * sizeof(__u32))); + u32 mem[SK_MEMINFO_VARS]; mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; @@ -48,10 +45,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; - return 0; - -rtattr_failure: - return -EMSGSIZE; + return nla_put(skb, attrtype, sizeof(mem), &mem); } EXPORT_SYMBOL_GPL(sock_diag_put_meminfo); @@ -121,7 +115,7 @@ static inline void sock_diag_unlock_handler(const struct sock_diag_handler *h) static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { int err; - struct sock_diag_req *req = NLMSG_DATA(nlh); + struct sock_diag_req *req = nlmsg_data(nlh); const struct sock_diag_handler *hndl; if (nlmsg_len(nlh) < sizeof(*req)) -- cgit v1.2.3 From 4c3af034fafeb7269176bf1310c9bcff0b9fd9bb Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 26 Jun 2012 23:36:16 +0000 Subject: netlink: Get rid of obsolete rtnetlink macros Removes all RTA_GET*() and RTA_PUT*() variations, as well as the unused rtattr_strcmp(). Get rid of rtm_get_table() by moving it to its only user decnet. Signed-off-by: Thomas Graf Signed-off-by: David S.
Miller --- net/core/rtnetlink.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 21318d15bbc3..bc8a1cdaac98 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -541,19 +541,6 @@ static const int rta_max[RTM_NR_FAMILIES] = [RTM_FAM(RTM_NEWACTION)] = TCAA_MAX, }; -void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) -{ - struct rtattr *rta; - int size = RTA_LENGTH(attrlen); - - rta = (struct rtattr *)skb_put(skb, RTA_ALIGN(size)); - rta->rta_type = attrtype; - rta->rta_len = size; - memcpy(RTA_DATA(rta), data, attrlen); - memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size); -} -EXPORT_SYMBOL(__rta_fill); - int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; -- cgit v1.2.3 From 22911fc581f6a241e2897a7a8603e97344a6ec82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Jun 2012 00:23:44 +0000 Subject: net: skb_free_datagram_locked() doesnt drop all packets dropwatch wrongly diagnose all received UDP packets as drops. This patch removes trace_kfree_skb() done in skb_free_datagram_locked(). Locations calling skb_free_datagram_locked() should do it on their own. As a result, drops are accounted on the right function. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/datagram.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index ae6acf6a3dea..0337e2b76862 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -248,7 +248,6 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) unlock_sock_fast(sk, slow); /* skb is now orphaned, can be freed outside of locked section */ - trace_kfree_skb(skb, skb_free_datagram_locked); __kfree_skb(skb); } EXPORT_SYMBOL(skb_free_datagram_locked); -- cgit v1.2.3 From 7a9bc9b81a5bc6e44ebc80ef781332e4385083f2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 29 Jun 2012 01:32:45 -0700 Subject: ipv4: Elide fib_validate_source() completely when possible. If rpfilter is off (or the SKB has an IPSEC path) and there are not tclassid users, we don't have to do anything at all when fib_validate_source() is invoked besides setting the itag to zero. We monitor tclassid uses with a counter (modified only under RTNL and marked __read_mostly) and we protect the fib_validate_source() real work with a test against this counter and whether rpfilter is to be done. Having a way to know whether we need no tclassid processing or not also opens the door for future optimized rpfilter algorithms that do not perform full FIB lookups. Signed-off-by: David S. 
Miller --- net/core/fib_rules.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 72cceb79d0d4..ab7db83236c9 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -151,6 +151,8 @@ static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) list_for_each_entry_safe(rule, tmp, &ops->rules_list, list) { list_del_rcu(&rule->list); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); } } @@ -499,6 +501,8 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid); + if (ops->delete) + ops->delete(rule); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); -- cgit v1.2.3 From a31f2d17b331db970259e875b7223d3aba7e3821 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 Jun 2012 06:15:21 +0000 Subject: netlink: add netlink_kernel_cfg parameter to netlink_kernel_create This patch adds the following structure: struct netlink_kernel_cfg { unsigned int groups; void (*input)(struct sk_buff *skb); struct mutex *cb_mutex; }; That can be passed to netlink_kernel_create to set optional configurations for netlink kernel sockets. I've populated this structure by looking for NULL and zero parameters at the existing code. The remaining parameters that always need to be set are still left in the original interface. That includes optional parameters for the netlink socket creation. This allows easy extensibility of this interface in the future. This patch also adapts all callers to use this new interface. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 9 +++++++-- net/core/sock_diag.c | 8 ++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index bc8a1cdaac98..2b325c340b44 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2353,8 +2353,13 @@ static struct notifier_block rtnetlink_dev_notifier = { static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; - sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX, - rtnetlink_rcv, &rtnl_mutex, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .groups = RTNLGRP_MAX, + .input = rtnetlink_rcv, + .cb_mutex = &rtnl_mutex, + }; + + sk = netlink_kernel_create(net, NETLINK_ROUTE, THIS_MODULE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index ff2967acbfae..07a29eb34a41 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -171,8 +171,12 @@ EXPORT_SYMBOL_GPL(sock_diag_nlsk); static int __init sock_diag_init(void) { - sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0, - sock_diag_rcv, NULL, THIS_MODULE); + struct netlink_kernel_cfg cfg = { + .input = sock_diag_rcv, + }; + + sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, + THIS_MODULE, &cfg); return sock_diag_nlsk == NULL ? -ENOMEM : 0; } -- cgit v1.2.3 From a263b3093641fb1ec377582c90986a7fd0625184 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 02:02:15 -0700 Subject: ipv4: Make neigh lookups directly in output packet path. Do not use the dst cached neigh, we'll be getting rid of that. Signed-off-by: David S. 
Miller --- net/core/neighbour.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d81d026138f0..a793af9af150 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -474,8 +474,8 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, - struct net_device *dev) +struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, bool want_ref) { u32 hash_val; int key_len = tbl->key_len; @@ -535,14 +535,16 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, n1 = rcu_dereference_protected(n1->next, lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { - neigh_hold(n1); + if (want_ref) + neigh_hold(n1); rc = n1; goto out_tbl_unlock; } } n->dead = 0; - neigh_hold(n); + if (want_ref) + neigh_hold(n); rcu_assign_pointer(n->next, rcu_dereference_protected(nht->hash_buckets[hash_val], lockdep_is_held(&tbl->lock))); @@ -558,7 +560,7 @@ out_neigh_release: neigh_release(n); goto out; } -EXPORT_SYMBOL(neigh_create); +EXPORT_SYMBOL(__neigh_create); static u32 pneigh_hash(const void *pkey, int key_len) { -- cgit v1.2.3 From 5110effee8fde2edfacac9cd12a9960ab2dc39ea Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 02:21:03 -0700 Subject: net: Do delayed neigh confirmation. When a dst_confirm() happens, mark the confirmation as pending in the dst. Then on the next packet out, when we have the neigh in-hand, do the update. This removes the dependency in dst_confirm() of dst's having an attached neigh. While we're here, remove the explicit 'dst' NULL check, all except 2 or 3 call sites ensure it's not NULL. So just fix those cases up. Signed-off-by: David S. 
Miller --- net/core/dst.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 43d94cedbf7c..a6e19a23a745 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -152,7 +152,7 @@ EXPORT_SYMBOL(dst_discard); const u32 dst_default_metrics[RTAX_MAX]; void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, int flags) + int initial_ref, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; @@ -188,6 +188,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; + dst->pending_confirm = 0; dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); -- cgit v1.2.3 From 13a43d94ab026c423dc8902170ef27c2bd36aa87 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 22:15:37 -0700 Subject: neigh: Convert over to dst_neigh_lookup_skb(). Signed-off-by: David S. Miller --- net/core/neighbour.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a793af9af150..117afaf51268 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1201,10 +1201,23 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, write_unlock_bh(&neigh->lock); rcu_read_lock(); - /* On shaper/eql skb->dst->neighbour != neigh :( */ - if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL) - n1 = n2; + + /* Why not just use 'neigh' as-is? The problem is that + * things such as shaper, eql, and sch_teql can end up + * using alternative, different, neigh objects to output + * the packet in the output path. So what we need to do + * here is re-lookup the top-level neigh in the path so + * we can reinject the packet there. 
+ */ + n2 = NULL; + if (dst) { + n2 = dst_neigh_lookup_skb(dst, skb); + if (n2) + n1 = n2; + } n1->output(n1, skb); + if (n2) + neigh_release(n2); rcu_read_unlock(); write_lock_bh(&neigh->lock); -- cgit v1.2.3 From 36bdbcae2fa2a6dfa99344d4190fcea0aa7b7c25 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Jul 2012 22:58:02 -0700 Subject: net: Kill dst->_neighbour, accessors, and final uses. No longer used. Signed-off-by: David S. Miller --- net/core/dst.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index a6e19a23a745..07bacff84aa4 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -171,7 +171,6 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst_init_metrics(dst, dst_default_metrics, true); dst->expires = 0UL; dst->path = dst; - RCU_INIT_POINTER(dst->_neighbour, NULL); #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -225,19 +224,12 @@ EXPORT_SYMBOL(__dst_free); struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child; - struct neighbour *neigh; smp_rmb(); again: - neigh = rcu_dereference_protected(dst->_neighbour, 1); child = dst->child; - if (neigh) { - RCU_INIT_POINTER(dst->_neighbour, NULL); - neigh_release(neigh); - } - if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); @@ -361,19 +353,9 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, if (!unregister) { dst->input = dst->output = dst_discard; } else { - struct neighbour *neigh; - dst->dev = dev_net(dst->dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); - rcu_read_lock(); - neigh = dst_get_neighbour_noref(dst); - if (neigh && neigh->dev == dev) { - neigh->dev = dst->dev; - dev_hold(dst->dev); - dev_put(dev); - } - rcu_read_unlock(); } } -- cgit v1.2.3 From 16917b87a23b429226527f393270047069d665e9 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Sun, 1 Jul 2012 03:18:50 +0000 Subject: net-next: Add netif_get_num_default_rss_queues Most 
multi-queue networking drivers consider the number of online cpus when configuring RSS queues. This patch adds a wrapper to the number of cpus, setting an upper limit on the number of cpus a driver should consider (by default) when allocating resources for its queues. Signed-off-by: Yuval Mintz Signed-off-by: Eilon Greenstein Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ed674e212b7a..69f7a1a393d8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1793,6 +1793,17 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif +/* netif_get_num_default_rss_queues - default number of RSS queues + * + * This routine should set an upper limit on the number of RSS queues + * used by default by multiqueue devices. + */ +int netif_get_num_default_rss_queues() +{ + return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); +} +EXPORT_SYMBOL(netif_get_num_default_rss_queues); + static inline void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; -- cgit v1.2.3 From 87a50699cb6d169591cc776fb82683a2c77cecac Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 10 Jul 2012 05:06:14 -0700 Subject: rtnetlink: Remove ts/tsage args to rtnl_put_cacheinfo(). Nobody provides non-zero values any longer. Signed-off-by: David S.
Miller --- net/core/rtnetlink.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2b325c340b44..64127eee786d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -615,7 +615,7 @@ nla_put_failure: EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, - u32 ts, u32 tsage, long expires, u32 error) + long expires, u32 error) { struct rta_cacheinfo ci = { .rta_lastuse = jiffies_to_clock_t(jiffies - dst->lastuse), @@ -623,8 +623,6 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_clntref = atomic_read(&(dst->__refcnt)), .rta_error = error, .rta_id = id, - .rta_ts = ts, - .rta_tsage = tsage, }; if (expires) -- cgit v1.2.3 From a55b138b1da3d25c04f66f8df03d659dfd46c950 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 10 Jul 2012 10:54:38 +0000 Subject: net: Properly define functions with no parameters Defining a function with no parameters as 'T foo()' is the deprecated K&R style, and is not strictly equivalent to defining it as 'T foo(void)'. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 69f7a1a393d8..9c21548e5b31 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1798,7 +1798,7 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues); * This routine should set an upper limit on the number of RSS queues * used by default by multiqueue devices. 
*/ -int netif_get_num_default_rss_queues() +int netif_get_num_default_rss_queues(void) { return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); } -- cgit v1.2.3 From 2c53040f018b6c36a46eec75b9b937aaa5f78e6d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 10 Jul 2012 10:55:09 +0000 Subject: net: Fix (nearly-)kernel-doc comments for various functions Fix incorrect start markers, wrapped summary lines, missing section breaks, incorrect separators, and some name mismatches. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 8 +++++--- net/core/rtnetlink.c | 2 +- net/core/skbuff.c | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9c21548e5b31..5ab6f4b37c0c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1691,7 +1691,8 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_unlock(); } -/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change +/** + * netif_setup_tc - Handle tc mappings on real_num_tx_queues change * @dev: Network device * @txq: number of queues available * @@ -1793,7 +1794,8 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif -/* netif_get_num_default_rss_queues - default number of RSS queues +/** + * netif_get_num_default_rss_queues - default number of RSS queues * * This routine should set an upper limit on the number of RSS queues * used by default by multiqueue devices. @@ -5670,7 +5672,7 @@ int netdev_refcnt_read(const struct net_device *dev) } EXPORT_SYMBOL(netdev_refcnt_read); -/* +/** * netdev_wait_allrefs - wait until all references are gone. * * This is called when unregistering network devices. 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 64127eee786d..045db8ad87c8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2174,7 +2174,7 @@ skip: } /** - * ndo_dflt_fdb_dump: default netdevice operation to dump an FDB table. + * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table. * @nlh: netlink message header * @dev: netdevice * diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a789a807ec3..506f678e9d95 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -713,7 +713,8 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); -/* skb_copy_ubufs - copy userspace skb frags buffers to kernel +/** + * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify * @gfp_mask: allocation priority * @@ -2614,7 +2615,7 @@ unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, EXPORT_SYMBOL(skb_find_text); /** - * skb_append_datato_frags: - append the user data to a skb + * skb_append_datato_frags - append the user data to a skb * @sk: sock structure * @skb: skb structure to be appened with user data. * @getfrag: call back function to be used for getting the user data -- cgit v1.2.3 From 46d3ceabd8d98ed0ad10f20c595ca784e34786c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Jul 2012 05:50:31 +0000 Subject: tcp: TCP Small Queues This introduce TSQ (TCP Small Queues) TSQ goal is to reduce number of TCP packets in xmit queues (qdisc & device queues), to reduce RTT and cwnd bias, part of the bufferbloat problem. sk->sk_wmem_alloc not allowed to grow above a given limit, allowing no more than ~128KB [1] per tcp socket in qdisc/dev layers at a given time. TSO packets are sized/capped to half the limit, so that we have two TSO packets in flight, allowing better bandwidth use. 
As a side effect, setting the limit to 40000 automatically reduces the standard gso max limit (65536) to 40000/2 : It can help to reduce latencies of high prio packets, having smaller TSO packets. This means we divert sock_wfree() to a tcp_wfree() handler, to queue/send following frames when skb_orphan() [2] is called for the already queued skbs. Results on my dev machines (tg3/ixgbe nics) are really impressive, using standard pfifo_fast, and with or without TSO/GSO. Without reduction of nominal bandwidth, we have reduction of buffering per bulk sender : < 1ms on Gbit (instead of 50ms with TSO) < 8ms on 100Mbit (instead of 132 ms) I no longer have 4 MBytes backlogged in qdisc by a single netperf session, and both side socket autotuning no longer use 4 Mbytes. As skb destructor cannot restart xmit itself ( as qdisc lock might be taken at this point ), we delegate the work to a tasklet. We use one tasklet per cpu for performance reasons. If tasklet finds a socket owned by the user, it sets TSQ_OWNED flag. This flag is tested in a new protocol method called from release_sock(), to eventually send new segments. [1] New /proc/sys/net/ipv4/tcp_limit_output_bytes tunable [2] skb_orphan() is usually called at TX completion time, but some drivers call it in their start_xmit() handler. These drivers should at least use BQL, or else a single TCP session can still fill the whole NIC TX ring, since TSQ will have no effect. Signed-off-by: Eric Dumazet Cc: Dave Taht Cc: Tom Herbert Cc: Matt Mathis Cc: Yuchung Cheng Cc: Nandita Dukkipati Signed-off-by: David S. 
Miller --- net/core/sock.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 929bdcc2383b..24039ac12426 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2159,6 +2159,10 @@ void release_sock(struct sock *sk) spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); + + if (sk->sk_prot->release_cb) + sk->sk_prot->release_cb(sk); + sk->sk_lock.owned = 0; if (waitqueue_active(&sk->sk_lock.wq)) wake_up(&sk->sk_lock.wq); -- cgit v1.2.3 From 540eb7bf0bbedb65277d68ab89ae43cdec3fd6ba Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 12 Jul 2012 14:23:50 +0000 Subject: net: Update alloc frag to reduce get/put page usage and recycle pages This patch is meant to help improve performance by reducing the number of locked operations required to allocate a frag on x86 and other platforms. This is accomplished by using atomic_set operations on the page count instead of calling get_page and put_page. It is based on work originally provided by Eric Dumazet. In addition it also helps to reduce memory overhead when using TCP. This is done by recycling the page if the only holder of the frame is the netdev_alloc_frag call itself. This can occur when skb heads are stolen by either GRO or TCP and the driver providing the packets is using paged frags to store all of the data for the packets. Cc: Eric Dumazet Signed-off-by: Alexander Duyck Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 506f678e9d95..8b6d38fdb443 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -296,9 +296,12 @@ EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { struct page *page; unsigned int offset; + unsigned int pagecnt_bias; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); +#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) + /** * netdev_alloc_frag - allocate a page fragment * @fragsz: fragment size @@ -317,17 +320,26 @@ void *netdev_alloc_frag(unsigned int fragsz) if (unlikely(!nc->page)) { refill: nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD); + if (unlikely(!nc->page)) + goto end; +recycle: + atomic_set(&nc->page->_count, NETDEV_PAGECNT_BIAS); + nc->pagecnt_bias = NETDEV_PAGECNT_BIAS; nc->offset = 0; } - if (likely(nc->page)) { - if (nc->offset + fragsz > PAGE_SIZE) { - put_page(nc->page); - goto refill; - } - data = page_address(nc->page) + nc->offset; - nc->offset += fragsz; - get_page(nc->page); + + if (nc->offset + fragsz > PAGE_SIZE) { + /* avoid unnecessary locked operations if possible */ + if ((atomic_read(&nc->page->_count) == nc->pagecnt_bias) || + atomic_sub_and_test(nc->pagecnt_bias, &nc->page->_count)) + goto recycle; + goto refill; } + + data = page_address(nc->page) + nc->offset; + nc->offset += fragsz; + nc->pagecnt_bias--; +end: local_irq_restore(flags); return data; } -- cgit v1.2.3 From 51d7cccf07238f5236c5b9269231a30dd5f8e714 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 16 Jul 2012 04:28:49 +0000 Subject: net: make sock diag per-namespace Before this patch sock_diag works for init_net only and dumps information about sockets from all namespaces. This patch expands sock_diag for all name-spaces. It creates a netlink kernel socket for each netns and filters data during dumping. 
v2: filter according to netns in all places; remove an unused variable. Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Cc: Pavel Emelyanov CC: Eric Dumazet Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Andrew Vagin Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/core/sock_diag.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 07a29eb34a41..9d8755e4a7a5 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -166,23 +166,36 @@ static void sock_diag_rcv(struct sk_buff *skb) mutex_unlock(&sock_diag_mutex); } -struct sock *sock_diag_nlsk; -EXPORT_SYMBOL_GPL(sock_diag_nlsk); - -static int __init sock_diag_init(void) +static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { .input = sock_diag_rcv, }; - sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, + net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, THIS_MODULE, &cfg); - return sock_diag_nlsk == NULL ? -ENOMEM : 0; + return net->diag_nlsk == NULL ? -ENOMEM : 0; +} + +static void __net_exit diag_net_exit(struct net *net) +{ + netlink_kernel_release(net->diag_nlsk); + net->diag_nlsk = NULL; +} + +static struct pernet_operations diag_net_ops = { + .init = diag_net_init, + .exit = diag_net_exit, +}; + +static int __init sock_diag_init(void) +{ + return register_pernet_subsys(&diag_net_ops); } static void __exit sock_diag_exit(void) { - netlink_kernel_release(sock_diag_nlsk); + unregister_pernet_subsys(&diag_net_ops); } module_init(sock_diag_init); -- cgit v1.2.3 From 30fdd8a082a00126a6feec994e43e8dc12f5bccb Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 17 Jul 2012 05:22:35 +0000 Subject: netpoll: move np->dev and np->dev_name init into __netpoll_setup() Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/netpoll.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index f9f40b932e4b..b4c90e42b443 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -715,14 +715,16 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { - struct net_device *ndev = np->dev; struct netpoll_info *npinfo; const struct net_device_ops *ops; unsigned long flags; int err; + np->dev = ndev; + strlcpy(np->dev_name, ndev->name, IFNAMSIZ); + if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) || !ndev->netdev_ops->ndo_poll_controller) { np_err(np, "%s doesn't support polling, aborting\n", @@ -851,13 +853,11 @@ int netpoll_setup(struct netpoll *np) np_info(np, "local IP %pI4\n", &np->local_ip); } - np->dev = ndev; - /* fill up the skb queue */ refill_skbs(); rtnl_lock(); - err = __netpoll_setup(np); + err = __netpoll_setup(np, ndev); rtnl_unlock(); if (err) -- cgit v1.2.3 From 02756ed4a79f15e4f265c1f6fbc634ce9966f153 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Tue, 17 Jul 2012 02:05:29 +0000 Subject: skbuff: Use correct allocation in skb_copy_ubufs Use correct allocation flags during copy of user space fragments to the kernel. Also "improve" couple of for loops. Signed-off-by: Krishna Kumar Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8b6d38fdb443..c011d7fab62d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -751,7 +751,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) u8 *vaddr; skb_frag_t *f = &skb_shinfo(skb)->frags[i]; - page = alloc_page(GFP_ATOMIC); + page = alloc_page(gfp_mask); if (!page) { while (head) { struct page *next = (struct page *)head->private; @@ -769,15 +769,15 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) } /* skb frags release userspace buffers */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + for (i = 0; i < num_frags; i++) skb_frag_unref(skb, i); uarg->callback(uarg); /* skb frags point to kernel buffers */ - for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) { - __skb_fill_page_desc(skb, i-1, head, 0, - skb_shinfo(skb)->frags[i - 1].size); + for (i = num_frags - 1; i >= 0; i--) { + __skb_fill_page_desc(skb, i, head, 0, + skb_shinfo(skb)->frags[i].size); head = (struct page *)head->private; } -- cgit v1.2.3 From ddbe503203855939946430e39bae58de11b70b69 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Jul 2012 08:11:12 +0000 Subject: ipv6: add ipv6_addr_hash() helper Introduce ipv6_addr_hash() helper doing a XOR on all bits of an IPv6 address, with an optimized x86_64 version. Use it in flow dissector, as suggested by Andrew McGregor, to reduce hash collision probabilities in fq_codel (and other users of flow dissector) Use it in ip6_tunnel.c and use more bit shuffling, as suggested by David Laight, as existing hash was ignoring most of them. Use it in sunrpc and use more bit shuffling, using hash_32(). Use it in net/ipv6/addrconf.c, using hash_32() as well. As a cleanup, use it in net/ipv4/tcp_metrics.c Signed-off-by: Eric Dumazet Reported-by: Andrew McGregor Cc: Dave Taht Cc: Tom Herbert Cc: David Laight Cc: Joe Perches Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index a225089df5b6..466820b6e344 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -55,8 +56,8 @@ ipv6: return false; ip_proto = iph->nexthdr; - flow->src = iph->saddr.s6_addr32[3]; - flow->dst = iph->daddr.s6_addr32[3]; + flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); + flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); nhoff += sizeof(struct ipv6hdr); break; } -- cgit v1.2.3 From d40156aa5ecbd51fed932ed4813df82b56e5ff4d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 20 Jul 2012 02:28:47 +0000 Subject: rtnl: allow to specify different num for rx and tx queue count Also cut out unused function parameters and possible err in return value. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 045db8ad87c8..db5a8ad8a79b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1624,17 +1624,17 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, { int err; struct net_device *dev; - unsigned int num_queues = 1; + unsigned int num_tx_queues = 1; + unsigned int num_rx_queues = 1; - if (ops->get_tx_queues) { - err = ops->get_tx_queues(src_net, tb); - if (err < 0) - goto err; - num_queues = err; - } + if (ops->get_num_tx_queues) + num_tx_queues = ops->get_num_tx_queues(); + if (ops->get_num_rx_queues) + num_rx_queues = ops->get_num_rx_queues(); err = -ENOMEM; - dev = alloc_netdev_mq(ops->priv_size, ifname, ops->setup, num_queues); + dev = alloc_netdev_mqs(ops->priv_size, ifname, ops->setup, + num_tx_queues, num_rx_queues); if (!dev) goto err; -- cgit v1.2.3 From 76ff5cc91935c51fcf1a6a99ffa28b97a6e7a884 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 20 Jul 2012 02:28:48 +0000 Subject: rtnl: allow to specify number of rx and tx queues on device creation This patch introduces IFLA_NUM_TX_QUEUES and IFLA_NUM_RX_QUEUES by which userspace can set number of rx and/or tx queues to be allocated for newly created netdevice. This overrides ops->get_num_[tr]x_queues() Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index db5a8ad8a79b..5bb1ebca2eb0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -771,6 +771,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(ext_filter_mask @@ -889,6 +891,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || + nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || + nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || (dev->ifindex != dev->iflink && nla_put_u32(skb, IFLA_LINK, dev->iflink)) || (dev->master && @@ -1106,6 +1110,8 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_AF_SPEC] = { .type = NLA_NESTED }, [IFLA_EXT_MASK] = { .type = NLA_U32 }, [IFLA_PROMISCUITY] = { .type = NLA_U32 }, + [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, + [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, }; EXPORT_SYMBOL(ifla_policy); @@ -1627,9 +1633,14 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; - if (ops->get_num_tx_queues) + if (tb[IFLA_NUM_TX_QUEUES]) + num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); + else if (ops->get_num_tx_queues) num_tx_queues = ops->get_num_tx_queues(); - if (ops->get_num_rx_queues) + + if (tb[IFLA_NUM_RX_QUEUES]) + num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); + else if (ops->get_num_rx_queues) num_rx_queues = 
ops->get_num_rx_queues(); err = -ENOMEM; -- cgit v1.2.3 From f5b0a8743601a4477419171f5046bd07d1c080a0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 19 Jul 2012 12:31:33 -0700 Subject: net: Document dst->obsolete better. Add a big comment explaining how the field works, and use defines instead of magic constants for the values assigned to it. Suggested by Joe Perches. Signed-off-by: David S. Miller --- net/core/dst.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dst.c b/net/core/dst.c index 07bacff84aa4..069d51d29414 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -94,7 +94,7 @@ loop: * But we do not have state "obsoleted, but * referenced by parent", so it is right. */ - if (dst->obsolete > 1) + if (dst->obsolete > 0) continue; ___dst_free(dst); @@ -202,7 +202,7 @@ static void ___dst_free(struct dst_entry *dst) */ if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) dst->input = dst->output = dst_discard; - dst->obsolete = 2; + dst->obsolete = DST_OBSOLETE_DEAD; } void __dst_free(struct dst_entry *dst) -- cgit v1.2.3 From 1d69c2b343c7e1dc9584b7aa446f40dbab4c4f80 Mon Sep 17 00:00:00 2001 From: "Mark A. Greer" Date: Fri, 20 Jul 2012 13:35:13 +0000 Subject: rtnl: Add #ifdef CONFIG_RPS around num_rx_queues reference Commit 76ff5cc91935c51fcf1a6a99ffa28b97a6e7a884 (rtnl: allow to specify number of rx and tx queues on device creation) added a reference to the net_device structure's 'num_rx_queues' member in net/core/rtnetlink.c:rtnl_fill_ifinfo() However, the definition for 'num_rx_queues' is surrounded by an '#ifdef CONFIG_RPS' while the new reference to it is not. This causes a compile error when CONFIG_RPS is not defined. Fix the compile error by surrounding the new reference to 'num_rx_queues' by an '#ifdef CONFIG_RPS'. CC: Jiri Pirko Signed-off-by: Mark A. Greer Signed-off-by: David S. 
Miller --- net/core/rtnetlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5bb1ebca2eb0..334b930e0de3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -892,7 +892,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || +#ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || +#endif (dev->ifindex != dev->iflink && nla_put_u32(skb, IFLA_LINK, dev->iflink)) || (dev->master && -- cgit v1.2.3 From 70008aa50e927670ceee7f0c87e159ca7b1517a2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 Jul 2012 09:23:10 +0000 Subject: skbuff: convert to skb_orphan_frags Reduce code duplication a bit using the new helper. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/skbuff.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ccfcb7d8711e..438bbc5fd898 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -804,10 +804,8 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) { struct sk_buff *n; - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, gfp_mask)) - return NULL; - } + if (skb_orphan_frags(skb, gfp_mask)) + return NULL; n = skb + 1; if (skb->fclone == SKB_FCLONE_ORIG && @@ -927,12 +925,10 @@ struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) if (skb_shinfo(skb)->nr_frags) { int i; - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, gfp_mask)) { - kfree_skb(n); - n = NULL; - goto out; - } + if (skb_orphan_frags(skb, gfp_mask)) { + kfree_skb(n); + n = NULL; + goto out; } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; @@ -1005,10 +1001,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, */ if (skb_cloned(skb)) { /* copy this zero copy skb frags */ - if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { - if (skb_copy_ubufs(skb, gfp_mask)) - goto nofrags; - } + if (skb_orphan_frags(skb, gfp_mask)) + goto nofrags; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); -- cgit v1.2.3 From 1080e512d44d4f67b8beb8edf25a1bbcb1066dc7 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 Jul 2012 09:23:17 +0000 Subject: net: orphan frags on receive zero copy packets are normally sent to the outside network, but bridging, tun etc might loop them back to host networking stack. If this happens destructors will never be called, so orphan the frags immediately on receive. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d70e4a3a49f2..cca02ae7a844 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1632,6 +1632,8 @@ static inline int deliver_skb(struct sk_buff *skb, struct packet_type *pt_prev, struct net_device *orig_dev) { + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + return -ENOMEM; atomic_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } @@ -3262,7 +3264,10 @@ ncls: } if (pt_prev) { - ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); + if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) + ret = -ENOMEM; + else + ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); -- cgit v1.2.3 From dcc0fb782b3a6e2abfeaaeb45dd88ed09596be0f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 Jul 2012 09:23:20 +0000 Subject: skbuff: export skb_copy_ubufs Export skb_copy_ubufs so that modules can orphan frags. 
Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 438bbc5fd898..368f65c15e4f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -784,7 +784,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask) skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY; return 0; } - +EXPORT_SYMBOL_GPL(skb_copy_ubufs); /** * skb_clone - duplicate an sk_buff -- cgit v1.2.3 From 406a3c638ce8b17d9704052c07955490f732c2b8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 20 Jul 2012 10:39:25 +0000 Subject: net: netprio_cgroup: rework update socket logic Instead of updating the sk_cgrp_prioidx struct field on every send this only updates the field when a task is moved via cgroup infrastructure. This allows sockets that may be used by a kernel worker thread to be managed. For example in the iscsi case today a user can put iscsid in a netprio cgroup and control traffic will be sent with the correct sk_cgrp_prioidx value set but as soon as data is sent the kernel worker thread issues a send and sk_cgrp_prioidx is updated with the kernel worker thread's value which is the default case. It seems more correct to only update the field when the user explicitly sets it via control group infrastructure. This allows the users to manage sockets that may be used with other threads. Signed-off-by: John Fastabend Acked-by: Neil Horman Signed-off-by: David S. 
Miller --- net/core/netprio_cgroup.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/sock.c | 6 +++--- 2 files changed, 56 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index b2e9caa1ad1a..63d15e8f80e9 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -25,6 +25,8 @@ #include #include +#include + #define PRIOIDX_SZ 128 static unsigned long prioidx_map[PRIOIDX_SZ]; @@ -272,6 +274,56 @@ out_free_devname: return ret; } +void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + struct task_struct *p; + char *tmp = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); + + if (!tmp) { + pr_warn("Unable to attach cgrp due to alloc failure!\n"); + return; + } + + cgroup_taskset_for_each(p, cgrp, tset) { + unsigned int fd; + struct fdtable *fdt; + struct files_struct *files; + + task_lock(p); + files = p->files; + if (!files) { + task_unlock(p); + continue; + } + + rcu_read_lock(); + fdt = files_fdtable(files); + for (fd = 0; fd < fdt->max_fds; fd++) { + char *path; + struct file *file; + struct socket *sock; + unsigned long s; + int rv, err = 0; + + file = fcheck_files(files, fd); + if (!file) + continue; + + path = d_path(&file->f_path, tmp, PAGE_SIZE); + rv = sscanf(path, "socket:[%lu]", &s); + if (rv <= 0) + continue; + + sock = sock_from_file(file, &err); + if (!err) + sock_update_netprioidx(sock->sk, p); + } + rcu_read_unlock(); + task_unlock(p); + } + kfree(tmp); +} + static struct cftype ss_files[] = { { .name = "prioidx", @@ -289,6 +341,7 @@ struct cgroup_subsys net_prio_subsys = { .name = "net_prio", .create = cgrp_create, .destroy = cgrp_destroy, + .attach = net_prio_attach, #ifdef CONFIG_NETPRIO_CGROUP .subsys_id = net_prio_subsys_id, #endif diff --git a/net/core/sock.c b/net/core/sock.c index 24039ac12426..2676a88f533e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1180,12 +1180,12 @@ void sock_update_classid(struct sock *sk) } 
EXPORT_SYMBOL(sock_update_classid); -void sock_update_netprioidx(struct sock *sk) +void sock_update_netprioidx(struct sock *sk, struct task_struct *task) { if (in_interrupt()) return; - sk->sk_cgrp_prioidx = task_netprioidx(current); + sk->sk_cgrp_prioidx = task_netprioidx(task); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif @@ -1215,7 +1215,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); - sock_update_netprioidx(sk); + sock_update_netprioidx(sk, current); } return sk; -- cgit v1.2.3 From b68581778cd0051a3fb9a2b614dee7eccb5127ff Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Jul 2012 16:27:54 -0700 Subject: net: Make skb->skb_iif always track skb->dev Make it follow device decapsulation, from things such as VLAN and bonding. The stuff that actually cares about pre-demuxed device pointers, is handled by the "orig_dev" variable in __netif_receive_skb(). And the only consumer of that is the po->origdev feature of AF_PACKET sockets. Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index cca02ae7a844..0ebaea16632f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3173,8 +3173,6 @@ static int __netif_receive_skb(struct sk_buff *skb) if (netpoll_receive_skb(skb)) return NET_RX_DROP; - if (!skb->skb_iif) - skb->skb_iif = skb->dev->ifindex; orig_dev = skb->dev; skb_reset_network_header(skb); @@ -3186,6 +3184,7 @@ static int __netif_receive_skb(struct sk_buff *skb) rcu_read_lock(); another_round: + skb->skb_iif = skb->dev->ifindex; __this_cpu_inc(softnet_data.processed); -- cgit v1.2.3