From 76620aafd66f0004829764940c5466144969cffc Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 16 Apr 2009 02:02:07 -0700 Subject: gro: New frags interface to avoid copying shinfo It turns out that copying a 16-byte area at ~800k times a second can be really expensive :) This patch redesigns the frags GRO interface to avoid copying that area twice. The two disciples of the frags interface have been converted. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 81 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 45 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 91d792d17e09..619fa141b8f5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2519,16 +2519,10 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) } EXPORT_SYMBOL(napi_reuse_skb); -struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, - struct napi_gro_fraginfo *info) +struct sk_buff *napi_get_frags(struct napi_struct *napi) { struct net_device *dev = napi->dev; struct sk_buff *skb = napi->skb; - struct ethhdr *eth; - skb_frag_t *frag; - int i; - - napi->skb = NULL; if (!skb) { skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN); @@ -2536,47 +2530,14 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi, goto out; skb_reserve(skb, NET_IP_ALIGN); - } - - BUG_ON(info->nr_frags > MAX_SKB_FRAGS); - frag = &info->frags[info->nr_frags - 1]; - for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) { - skb_fill_page_desc(skb, i, frag->page, frag->page_offset, - frag->size); - frag++; + napi->skb = skb; } - skb_shinfo(skb)->nr_frags = info->nr_frags; - - skb->data_len = info->len; - skb->len += info->len; - skb->truesize += info->len; - - skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); - - eth = skb_gro_header(skb, sizeof(*eth)); - if (!eth) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; - } - - skb_gro_pull(skb, sizeof(*eth)); - - /* - * This works because the only protocols we care about don't require - * special handling. We'll fix it up properly at the end. - */ - skb->protocol = eth->h_proto; - - skb->ip_summed = info->ip_summed; - skb->csum = info->csum; out: return skb; } -EXPORT_SYMBOL(napi_fraginfo_skb); +EXPORT_SYMBOL(napi_get_frags); int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) { @@ -2606,9 +2567,39 @@ int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) } EXPORT_SYMBOL(napi_frags_finish); -int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info) +struct sk_buff *napi_frags_skb(struct napi_struct *napi) +{ + struct sk_buff *skb = napi->skb; + struct ethhdr *eth; + + napi->skb = NULL; + + skb_reset_mac_header(skb); + skb_gro_reset_offset(skb); + + eth = skb_gro_header(skb, sizeof(*eth)); + if (!eth) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } + + skb_gro_pull(skb, sizeof(*eth)); + + /* + * This works because the only protocols we care about don't require + * special handling. We'll fix it up properly at the end. 
+ */ + skb->protocol = eth->h_proto; + +out: + return skb; +} +EXPORT_SYMBOL(napi_frags_skb); + +int napi_gro_frags(struct napi_struct *napi) { - struct sk_buff *skb = napi_fraginfo_skb(napi, info); + struct sk_buff *skb = napi_frags_skb(napi); if (!skb) return NET_RX_DROP; @@ -2712,7 +2703,7 @@ void netif_napi_del(struct napi_struct *napi) struct sk_buff *skb, *next; list_del_init(&napi->dev_list); - kfree_skb(napi->skb); + napi_free_frags(napi); for (skb = napi->gro_list; skb; skb = next) { next = skb->next; -- cgit v1.2.3 From 0a1ec07a67bd8b0033dace237249654d015efa21 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 20 Apr 2009 01:25:46 +0000 Subject: net: skb_copy_datagram_const_iovec() There's an skb_copy_datagram_iovec() to copy out of a paged skb, but it modifies the iovec, and does not support starting at an offset in the destination. We want both in tun.c, so let's add the function. It's a carbon copy of skb_copy_datagram_iovec() with enough changes to be annoying. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/datagram.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++ net/core/iovec.c | 26 +++++++++++++++ 2 files changed, 118 insertions(+) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index d0de644b378d..4dbb05cd572b 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -338,6 +338,98 @@ fault: return -EFAULT; } +/** + * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: io vector to copy to + * @to_offset: offset in the io vector to start copying to + * @len: amount of data to copy from buffer to iovec + * + * Returns 0 or -EFAULT. + * Note: the iovec is not modified during the copy. + */ +int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, + const struct iovec *to, int to_offset, + int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + WARN_ON(start > offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_toiovecend(to, vaddr + frag->page_offset + + offset - start, to_offset, copy); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + WARN_ON(start > offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_const_iovec(list, + offset - start, + to, to_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_const_iovec); + /** * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. 
* @skb: buffer to copy diff --git a/net/core/iovec.c b/net/core/iovec.c index 4c9c0121c9da..a215545c0a34 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -97,6 +97,31 @@ int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) return 0; } +/* + * Copy kernel to iovec. Returns -EFAULT on error. + */ + +int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, + int offset, int len) +{ + int copy; + for (; len > 0; ++iov) { + /* Skip over the finished iovecs */ + if (unlikely(offset >= iov->iov_len)) { + offset -= iov->iov_len; + continue; + } + copy = min_t(unsigned int, iov->iov_len - offset, len); + offset = 0; + if (copy_to_user(iov->iov_base, kdata, copy)) + return -EFAULT; + kdata += copy; + len -= copy; + } + + return 0; +} + /* * Copy iovec to kernel. Returns -EFAULT on error. * @@ -236,3 +261,4 @@ EXPORT_SYMBOL(csum_partial_copy_fromiovecend); EXPORT_SYMBOL(memcpy_fromiovec); EXPORT_SYMBOL(memcpy_fromiovecend); EXPORT_SYMBOL(memcpy_toiovec); +EXPORT_SYMBOL(memcpy_toiovecend); -- cgit v1.2.3 From 6f26c9a7555e5bcca3560919db9b852015077dae Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 20 Apr 2009 01:26:11 +0000 Subject: tun: fix tun_chr_aio_write so that aio works aio_write gets const struct iovec * but tun_chr_aio_write casts this to struct iovec * and modifies the iovec. As a result, attempts to use io_submit to send packets to a tun device fail with weird errors such as EINVAL. Since tun is the only user of skb_copy_datagram_from_iovec, we can fix this simply by changing the later so that it does not touch the iovec passed to it. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/datagram.c | 20 ++++++++++++++------ net/core/iovec.c | 7 ++++--- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 4dbb05cd572b..914d5fa773b4 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -435,13 +435,15 @@ EXPORT_SYMBOL(skb_copy_datagram_const_iovec); * @skb: buffer to copy * @offset: offset in the buffer to start copying to * @from: io vector to copy to + * @from_offset: offset in the io vector to start copying from * @len: amount of data to copy to buffer from iovec * * Returns 0 or -EFAULT. - * Note: the iovec is modified during the copy. + * Note: the iovec is not modified during the copy. */ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, - struct iovec *from, int len) + const struct iovec *from, int from_offset, + int len) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -450,11 +452,12 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (memcpy_fromiovec(skb->data + offset, from, copy)) + if (memcpy_fromiovecend(skb->data + offset, from, 0, copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; + from_offset += copy; } /* Copy paged appendix. Hmm... why does this look so complicated? 
*/ @@ -473,8 +476,9 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > len) copy = len; vaddr = kmap(page); - err = memcpy_fromiovec(vaddr + frag->page_offset + - offset - start, from, copy); + err = memcpy_fromiovecend(vaddr + frag->page_offset + + offset - start, + from, from_offset, copy); kunmap(page); if (err) goto fault; @@ -482,6 +486,7 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (!(len -= copy)) return 0; offset += copy; + from_offset += copy; } start = end; } @@ -500,11 +505,14 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, copy = len; if (skb_copy_datagram_from_iovec(list, offset - start, - from, copy)) + from, + from_offset, + copy)) goto fault; if ((len -= copy) == 0) return 0; offset += copy; + from_offset += copy; } start = end; } diff --git a/net/core/iovec.c b/net/core/iovec.c index a215545c0a34..40a76ce19d9f 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -147,10 +147,11 @@ int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) } /* - * For use with ip_build_xmit + * Copy iovec from kernel. Returns -EFAULT on error. */ -int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, - int len) + +int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, + int offset, int len) { /* Skip over the finished iovecs */ while (offset >= iov->iov_len) { -- cgit v1.2.3 From 683703a26e4677db437a1480682851e27c7a154f Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 27 Apr 2009 03:17:31 -0700 Subject: drop_monitor: Update netlink protocol to include netlink attribute header in alert message When I initially implemented this protocol, I disregarded the use of netlink attribute headers, thinking for my purposes they weren't needed. I've come to find out that, as I'm starting to work with sending down messages with associated data (like config messages), the kernel code spits out warnings about trailing data in a netlink skb that doesn't have an associated header on it. As such, I'm going to start including attribute headers in my netlink transaction, and so for completeness, I should likely include them on messages bound from the kernel to user space. This patch adds that header to the kernel, and bumps the protocol version accordingly Signed-off-by: Neil Horman Signed-off-by: David S. 
Miller --- net/core/drop_monitor.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9fd0dc3cca99..2797b711a978 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -51,7 +51,7 @@ static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, .name = "NET_DM", - .version = 1, + .version = 2, .maxattr = NET_DM_CMD_MAX, }; @@ -65,13 +65,17 @@ static void reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; struct net_dm_alert_msg *msg; + struct nlattr *nla; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); + al += sizeof(struct nlattr); + data->skb = genlmsg_new(al, GFP_KERNEL); genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); - msg = __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_alert_msg)); + nla = nla_reserve(data->skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); + msg = nla_data(nla); memset(msg, 0, al); atomic_set(&data->dm_hit_count, dm_hit_limit); } @@ -115,6 +119,7 @@ static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; + struct nlattr *nla; int i; struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data); @@ -127,7 +132,8 @@ static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) } nlh = (struct nlmsghdr *)data->skb->data; - msg = genlmsg_data(nlmsg_data(nlh)); + nla = genlmsg_data(nlmsg_data(nlh)); + msg = nla_data(nla); for (i = 0; i < msg->entries; i++) { if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { msg->points[i].count++; @@ -139,6 +145,7 @@ static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) * We need to create a new entry */ __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point)); + nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); memcpy(msg->points[msg->entries].pc, &location, sizeof(void *)); msg->points[msg->entries].count = 1; msg->entries++; -- cgit v1.2.3 From edbd9e30306067c3a45c035eb95a6f49daaa2337 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 27 Apr 2009 05:44:29 -0700 Subject: gro: Fix handling of headers that extend over the tail The skb_gro_* code fails to handle the case where a header starts in the linear area but ends in the frags area. Since the goal of skb_gro_* is to optimise the case of completely non-linear packets, we can simply bail out if we have anything in the linear area. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e48c08af76ad..6785b067ad50 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2378,18 +2378,13 @@ void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) unsigned int offset = skb_gro_offset(skb); hlen += offset; - if (hlen <= skb_headlen(skb)) - return skb->data + offset; - - if (unlikely(!skb_shinfo(skb)->nr_frags || - skb_shinfo(skb)->frags[0].size <= - hlen - skb_headlen(skb) || + if (unlikely(skb_headlen(skb) || + skb_shinfo(skb)->frags[0].size < hlen || PageHighMem(skb_shinfo(skb)->frags[0].page))) return pskb_may_pull(skb, hlen) ? 
skb->data + offset : NULL; return page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + - offset - skb_headlen(skb); + skb_shinfo(skb)->frags[0].page_offset + offset; } EXPORT_SYMBOL(skb_gro_header); -- cgit v1.2.3 From 513de11bba246b7a67df4c314d9fc936b6a75d0e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 3 May 2009 14:43:10 -0700 Subject: net: Avoid modulus in skb_tx_hash() for forwarding case. Based almost entirely upon a patch by Eric Dumazet. The common case is to have num-tx-queues <= num_rx_queues and even if num_tx_queues is larger it will not be significantly larger. Therefore, a subtraction loop is always going to be faster than modulus. Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 81442957c5c2..3c8073fe970a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1735,8 +1735,12 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) { u32 hash; - if (skb_rx_queue_recorded(skb)) - return skb_get_rx_queue(skb) % dev->real_num_tx_queues; + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely (hash >= dev->real_num_tx_queues)) + hash -= dev->real_num_tx_queues; + return hash; + } if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; -- cgit v1.2.3 From 4a84822c60afa2b0e2d3370041f69f9526a34757 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 11:11:38 -0700 Subject: netns 1/2: don't get/put old netns on CLONE_NEWNET copy_net_ns() doesn't copy anything, it creates fresh netns, so get/put of old netns isn't needed. Signed-off-by: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: David S. Miller --- net/core/net_namespace.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index e3bebd36f053..4488010d5a52 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -120,10 +120,8 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) struct net *new_net = NULL; int err; - get_net(old_net); - if (!(flags & CLONE_NEWNET)) - return old_net; + return get_net(old_net); err = -ENOMEM; new_net = net_alloc(); @@ -142,7 +140,6 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) if (err) goto out_free; out: - put_net(old_net); return new_net; out_free: -- cgit v1.2.3 From 088eb2d905de9518dad913995bb8aef493d4a7c5 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 11:12:14 -0700 Subject: netns 2/2: extract net_create() net_create() will be used by C/R to create fresh netns on restart. Signed-off-by: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: David S. 
Miller --- net/core/net_namespace.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 4488010d5a52..6b3edc9e6f19 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -115,38 +115,34 @@ static void net_free(struct net *net) kmem_cache_free(net_cachep, net); } -struct net *copy_net_ns(unsigned long flags, struct net *old_net) +static struct net *net_create(void) { - struct net *new_net = NULL; - int err; - - if (!(flags & CLONE_NEWNET)) - return get_net(old_net); - - err = -ENOMEM; - new_net = net_alloc(); - if (!new_net) - goto out_err; + struct net *net; + int rv; + net = net_alloc(); + if (!net) + return ERR_PTR(-ENOMEM); mutex_lock(&net_mutex); - err = setup_net(new_net); - if (!err) { + rv = setup_net(net); + if (rv == 0) { rtnl_lock(); - list_add_tail(&new_net->list, &net_namespace_list); + list_add_tail(&net->list, &net_namespace_list); rtnl_unlock(); } mutex_unlock(&net_mutex); + if (rv < 0) { + net_free(net); + return ERR_PTR(rv); + } + return net; +} - if (err) - goto out_free; -out: - return new_net; - -out_free: - net_free(new_net); -out_err: - new_net = ERR_PTR(err); - goto out; +struct net *copy_net_ns(unsigned long flags, struct net *old_net) +{ + if (!(flags & CLONE_NEWNET)) + return get_net(old_net); + return net_create(); } static void cleanup_net(struct work_struct *work) -- cgit v1.2.3 From f001fde5eadd915f4858d22ed70d7040f48767cf Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 5 May 2009 02:48:28 +0000 Subject: net: introduce a list of device addresses dev_addr_list (v6) v5 -> v6 (current): -removed so far unused static functions -corrected dev_addr_del_multiple to call del instead of add v4 -> v5: -added device address type (suggested by davem) -removed refcounting (better to have simplier code then safe potentially few bytes) v3 -> v4: -changed kzalloc to kmalloc in __hw_addr_add_ii() -ASSERT_RTNL() avoided in dev_addr_flush() and dev_addr_init() v2 -> v3: -removed unnecessary rcu read locking -moved dev_addr_flush() calling to ensure no null dereference of dev_addr v1 -> v2: -added forgotten ASSERT_RTNL to dev_addr_init and dev_addr_flush -removed unnecessary rcu_read locking in dev_addr_init -use compare_ether_addr_64bits instead of compare_ether_addr -use L1_CACHE_BYTES as size for allocating struct netdev_hw_addr -use call_rcu instead of rcu_synchronize -moved is_etherdev_addr into __KERNEL__ ifdef This patch introduces a new list in struct net_device and brings a set of functions to handle the work with device address list. The list is a replacement for the original dev_addr field and because in some situations there is need to carry several device addresses with the net device. To be backward compatible, dev_addr is made to point to the first member of the list so original drivers sees no difference. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/core/dev.c | 250 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3c8073fe970a..637ea71b0a0d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3434,6 +3434,252 @@ void dev_set_rx_mode(struct net_device *dev) netif_addr_unlock_bh(dev); } +/* hw addresses list handling functions */ + +static int __hw_addr_add(struct list_head *list, unsigned char *addr, + int addr_len, unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + int alloc_size; + + if (addr_len > MAX_ADDR_LEN) + return -EINVAL; + + alloc_size = sizeof(*ha); + if (alloc_size < L1_CACHE_BYTES) + alloc_size = L1_CACHE_BYTES; + ha = kmalloc(alloc_size, GFP_ATOMIC); + if (!ha) + return -ENOMEM; + memcpy(ha->addr, addr, addr_len); + ha->type = addr_type; + list_add_tail_rcu(&ha->list, list); + return 0; +} + +static void ha_rcu_free(struct rcu_head *head) +{ + struct netdev_hw_addr *ha; + + ha = container_of(head, struct netdev_hw_addr, rcu_head); + kfree(ha); +} + +static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + struct netdev_hw_addr *ha; + int i = 0; + + list_for_each_entry(ha, list, list) { + if (i++ != ignore_index && + !memcmp(ha->addr, addr, addr_len) && + (ha->type == addr_type || !addr_type)) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + return 0; + } + } + return -ENOENT; +} + +static int __hw_addr_add_multiple_ii(struct list_head *to_list, + struct list_head *from_list, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + int err; + struct netdev_hw_addr *ha, *ha2; + unsigned char type; + + list_for_each_entry(ha, from_list, list) { + type = addr_type ? addr_type : ha->type; + err = __hw_addr_add(to_list, ha->addr, addr_len, type); + if (err) + goto unroll; + } + return 0; + +unroll: + list_for_each_entry(ha2, from_list, list) { + if (ha2 == ha) + break; + type = addr_type ? addr_type : ha2->type; + __hw_addr_del_ii(to_list, ha2->addr, addr_len, type, + ignore_index); + } + return err; +} + +static void __hw_addr_del_multiple_ii(struct list_head *to_list, + struct list_head *from_list, + int addr_len, unsigned char addr_type, + int ignore_index) +{ + struct netdev_hw_addr *ha; + unsigned char type; + + list_for_each_entry(ha, from_list, list) { + type = addr_type ? addr_type : ha->type; + __hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type, + ignore_index); + } +} + +static void __hw_addr_flush(struct list_head *list) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, list, list) { + list_del_rcu(&ha->list); + call_rcu(&ha->rcu_head, ha_rcu_free); + } +} + +/* Device addresses handling functions */ + +static void dev_addr_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->dev_addr_list); + dev->dev_addr = NULL; +} + +static int dev_addr_init(struct net_device *dev) +{ + unsigned char addr[MAX_ADDR_LEN]; + struct netdev_hw_addr *ha; + int err; + + /* rtnl_mutex must be held here */ + + INIT_LIST_HEAD(&dev->dev_addr_list); + memset(addr, 0, sizeof(*addr)); + err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr), + NETDEV_HW_ADDR_T_LAN); + if (!err) { + /* + * Get the first (previously created) address from the list + * and set dev_addr pointer to this location. 
+ */ + ha = list_first_entry(&dev->dev_addr_list, + struct netdev_hw_addr, list); + dev->dev_addr = ha->addr; + } + return err; +} + +/** + * dev_addr_add - Add a device address + * @dev: device + * @addr: address to add + * @addr_type: address type + * + * Add a device address to the device or increase the reference count if + * it already exists. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len, + addr_type); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add); + +/** + * dev_addr_del - Release a device address. + * @dev: device + * @addr: address to delete + * @addr_type: address type + * + * Release reference to a device address and remove it from the device + * if the reference count drops to zero. + * + * The caller must hold the rtnl_mutex. + */ +int dev_addr_del(struct net_device *dev, unsigned char *addr, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, + addr_type, 0); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; +} +EXPORT_SYMBOL(dev_addr_del); + +/** + * dev_addr_add_multiple - Add device addresses from another device + * @to_dev: device to which addresses will be added + * @from_dev: device from which addresses will be added + * @addr_type: address type - 0 means type will be used from from_dev + * + * Add device addresses of the one device to another. + ** + * The caller must hold the rtnl_mutex. + */ +int dev_addr_add_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + int err; + + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type, 0); + if (!err) + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return err; +} +EXPORT_SYMBOL(dev_addr_add_multiple); + +/** + * dev_addr_del_multiple - Delete device addresses by another device + * @to_dev: device where the addresses will be deleted + * @from_dev: device by which addresses the addresses will be deleted + * @addr_type: address type - 0 means type will used from from_dev + * + * Deletes addresses in to device by the list of addresses in from device. + * + * The caller must hold the rtnl_mutex. 
+ */ +int dev_addr_del_multiple(struct net_device *to_dev, + struct net_device *from_dev, + unsigned char addr_type) +{ + ASSERT_RTNL(); + + if (from_dev->addr_len != to_dev->addr_len) + return -EINVAL; + __hw_addr_del_multiple_ii(&to_dev->dev_addr_list, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type, 0); + call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); + return 0; +} +EXPORT_SYMBOL(dev_addr_del_multiple); + +/* unicast and multicast addresses handling functions */ + int __dev_addr_delete(struct dev_addr_list **list, int *count, void *addr, int alen, int glbl) { @@ -4776,6 +5022,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; + dev_addr_init(dev); netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); @@ -4801,6 +5048,9 @@ void free_netdev(struct net_device *dev) kfree(dev->_tx); + /* Flush device addresses */ + dev_addr_flush(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); -- cgit v1.2.3 From 61de71c67caec39df0a854a1ef5be0c6be385c2a Mon Sep 17 00:00:00 2001 From: John Dykstra Date: Fri, 8 May 2009 14:57:01 -0700 Subject: Network Drop Monitor: Fix skb_kill_datagram Commit ead2ceb0ec9f85cff19c43b5cdb2f8a054484431 ("Network Drop Monitor: Adding kfree_skb_clean for non-drops and modifying end-of-line points for skbs") established new conventions for identifying dropped packets. Align skb_kill_datagram() with these conventions so that packets that get dropped just before the copy to userspace are properly tracked. Signed-off-by: John Dykstra Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/datagram.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 22ea437c5023..e2a36f05cdf7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -260,7 +260,9 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) spin_unlock_bh(&sk->sk_receive_queue.lock); } - skb_free_datagram(sk, skb); + kfree_skb(skb); + sk_mem_reclaim_partial(sk); + return err; } -- cgit v1.2.3 From ab9c73ccb52f40576ce017528d542eda3c6ae766 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 8 May 2009 13:30:17 +0000 Subject: net: check retval of dev_addr_init() Add missed checking of dev_addr_init return value in alloc_netdev_mq. Signed-off-by: Jiri Pirko net/core/dev.c | 15 ++++++++++++--- 1 files changed, 12 insertions(+), 3 deletions(-) Signed-off-by: David S. 
Miller --- net/core/dev.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 637ea71b0a0d..14dd725aaab7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5007,13 +5007,16 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, if (!tx) { printk(KERN_ERR "alloc_netdev: Unable to allocate " "tx qdiscs.\n"); - kfree(p); - return NULL; + goto free_p; } dev = (struct net_device *) (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); dev->padded = (char *)dev - (char *)p; + + if (dev_addr_init(dev)) + goto free_tx; + dev_net_set(dev, &init_net); dev->_tx = tx; @@ -5022,13 +5025,19 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; - dev_addr_init(dev); netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); setup(dev); strcpy(dev->name, name); return dev; + +free_tx: + kfree(tx); + +free_p: + kfree(p); + return NULL; } EXPORT_SYMBOL(alloc_netdev_mq); -- cgit v1.2.3 From 8b3521eeb7598c3b10c7e14361a7974464527702 Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Mon, 11 May 2009 05:52:49 +0000 Subject: ipv4: remove an unused parameter from configure method of fib_rules_ops. Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 98691e1466b8..17d9f497b797 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -299,7 +299,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) } else if (rule->action == FR_ACT_GOTO) goto errout_free; - err = ops->configure(rule, skb, nlh, frh, tb); + err = ops->configure(rule, skb, frh, tb); if (err < 0) goto errout_free; -- cgit v1.2.3 From 9dc20c5f78c53bf57fb7874b6e942842e1db20d3 Mon Sep 17 00:00:00 2001 From: John Dykstra Date: Tue, 12 May 2009 15:34:50 +0000 Subject: tcp: tcp_prequeue() can use keyed wakeups When TCP frees up write buffer space, avoid waking up tasks that have done a poll() or select() on the same socket specifying read-side events. This is an extension of a read-side patch by Eric Dumazet. Signed-off-by: John Dykstra Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/stream.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/stream.c b/net/core/stream.c index 8727cead64ad..a37debfeb1b2 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -33,7 +33,8 @@ void sk_stream_write_space(struct sock *sk) clear_bit(SOCK_NOSPACE, &sock->flags); if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); + wake_up_interruptible_poll(sk->sk_sleep, POLLOUT | + POLLWRNORM | POLLWRBAND); if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT); } -- cgit v1.2.3 From 7004bf252c53da18f6b55103e0c92f777f846806 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 May 2009 00:34:33 +0000 Subject: net: add tx_packets/tx_bytes/tx_dropped counters in struct netdev_queue offsetof(struct net_device, features)=0x44 offsetof(struct net_device, stats.tx_packets)=0x54 offsetof(struct net_device, stats.tx_bytes)=0x5c offsetof(struct net_device, stats.tx_dropped)=0x6c Network drivers that touch dev->stats.tx_packets/stats.tx_bytes in their tx path can slow down SMP operations, since they dirty a cache line that should stay shared (dev->features is needed in rx and tx paths) We could move away stats field in net_device but it wont help that much. (Two cache lines dirtied in tx path, we can do one only) Better solution is to add tx_packets/tx_bytes/tx_dropped in struct netdev_queue because this structure is already touched in tx path and counters updates will then be free (no increase in size) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 14dd725aaab7..6d3630d16271 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4943,13 +4943,30 @@ void netdev_run_todo(void) * the internal statistics structure is used. */ const struct net_device_stats *dev_get_stats(struct net_device *dev) - { +{ const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_get_stats) return ops->ndo_get_stats(dev); - else - return &dev->stats; + else { + unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; + struct net_device_stats *stats = &dev->stats; + unsigned int i; + struct netdev_queue *txq; + + for (i = 0; i < dev->num_tx_queues; i++) { + txq = netdev_get_tx_queue(dev, i); + tx_bytes += txq->tx_bytes; + tx_packets += txq->tx_packets; + tx_dropped += txq->tx_dropped; + } + if (tx_bytes || tx_packets || tx_dropped) { + stats->tx_bytes = tx_bytes; + stats->tx_packets = tx_packets; + stats->tx_dropped = tx_dropped; + } + return stats; + } } EXPORT_SYMBOL(dev_get_stats); -- cgit v1.2.3 From 336ca57c3b4e2b58ea3273e6d978ab3dfa387b4c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 13 May 2009 16:57:25 +0000 Subject: net-sysfs: Use rtnl_trylock in sysfs methods. The earlier patch to fix the deadlock between a network device going away and writing to sysfs attributes was incomplete. - It did not set signal_pending so we would leak ERSTARTSYS to user space. - It used ERESTARTSYS which only restarts if sigaction configures it to. - It did not cover store and show for ifalias. So fix all of these up and use the new helper restart_syscall so we get the details correct on what it takes. Signed-off-by: Eric W. Biederman Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 2da59a0ac4ac..b9641e816eee 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -78,7 +78,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, goto err; if (!rtnl_trylock()) - return -ERESTARTSYS; + return restart_syscall(); if (dev_isalive(net)) { if ((ret = (*set)(net, new)) == 0) @@ -225,7 +225,8 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - rtnl_lock(); + if (!rtnl_trylock()) + return restart_syscall(); ret = dev_set_alias(netdev, buf, count); rtnl_unlock(); @@ -238,7 +239,8 @@ static ssize_t show_ifalias(struct device *dev, const struct net_device *netdev = to_net_dev(dev); ssize_t ret = 0; - rtnl_lock(); + if (!rtnl_trylock()) + return restart_syscall(); if (netdev->ifalias) ret = sprintf(buf, "%s\n", netdev->ifalias); rtnl_unlock(); -- cgit v1.2.3 From 93f154b594fe47e4a7e5358b309add449a046cd3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 May 2009 22:19:19 -0700 Subject: net: release dst entry in dev_hard_start_xmit() One point of contention in high network loads is the dst_release() performed when a transmited skb is freed. This is because NIC tx completion calls dev_kree_skb() long after original call to dev_queue_xmit(skb). CPU cache is cold and the atomic op in dst_release() stalls. On SMP, this is quite visible if one CPU is 100% handling softirqs for a network device, since dst_clone() is done by other cpus, involving cache line ping pongs. It seems right place to release dst is in dev_hard_start_xmit(), for most devices but ones that are virtual, and some exceptions. David Miller suggested to define a new device flag, set in alloc_netdev_mq() (so that most devices set it at init time), and carefuly unset in devices which dont want a NULL skb->dst in their ndo_start_xmit(). List of devices that must clear this flag is : - loopback device, because it calls netif_rx() and quoting Patrick : "ip_route_input() doesn't accept loopback addresses, so loopback packets already need to have a dst_entry attached." - appletalk/ipddp.c : needs skb->dst in its xmit function - And all devices that call again dev_queue_xmit() from their xmit function (as some classifiers need skb->dst) : bonding, vlan, macvlan, eql, ifb, hdlc_fr Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6d3630d16271..92ebeca29901 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1688,6 +1688,14 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, goto gso; } + /* + * If device doesnt need skb->dst, release it right now while + * its hot in this cpu cache + */ + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) && skb->dst) { + dst_release(skb->dst); + skb->dst = NULL; + } rc = ops->ndo_start_xmit(skb, dev); /* * TODO: if skb_orphan() was called by @@ -5045,6 +5053,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, netdev_init_queues(dev); INIT_LIST_HEAD(&dev->napi_list); + dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); return dev; -- cgit v1.2.3 From 04af8cf6f320031090ab6fa4600b912b0c18fb4b Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Wed, 20 May 2009 17:26:23 -0700 Subject: net: Remove unused parameter from fill method in fib_rules_ops. The netlink message header (struct nlmsghdr) is an unused parameter in fill method of fib_rules_ops struct. This patch removes this parameter from this method and fixes the places where this method is called. (include/net/fib_rules.h) Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 17d9f497b797..bd309384f8b8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -500,7 +500,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->target) NLA_PUT_U32(skb, FRA_GOTO, rule->target); - if (ops->fill(rule, skb, nlh, frh) < 0) + if (ops->fill(rule, skb, frh) < 0) goto nla_put_failure; return nlmsg_end(skb, nlh); -- cgit v1.2.3 From 1f7a2bb4eff75c56a71b3896c36a34e787c5e4e5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 21 May 2009 15:10:05 -0700 Subject: netns: remove leftover debugging message Signed-off-by: Stephen Hemminger Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/core/net_namespace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6b3edc9e6f19..69d7549db9a2 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -198,7 +198,6 @@ static int __init net_ns_init(void) struct net_generic *ng; int err; - printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net)); #ifdef CONFIG_NET_NS net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), SMP_CACHE_BYTES, -- cgit v1.2.3 From ca0f31125c5cf0d48f47c2e1a3785a08876a7e87 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 21 May 2009 15:10:31 -0700 Subject: netns: simplify net_ns_init The net_ns_init code can be simplified. No need to save error code if it is only going to panic if it is set 4 lines later. Signed-off-by: Stephen Hemminger Acked-by: "Eric W. Biederman" Signed-off-by: David S. 
Miller --- net/core/net_namespace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 69d7549db9a2..b7292a2719dc 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -196,7 +196,6 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) static int __init net_ns_init(void) { struct net_generic *ng; - int err; #ifdef CONFIG_NET_NS net_cachep = kmem_cache_create("net_namespace", sizeof(struct net), @@ -216,15 +215,14 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); mutex_lock(&net_mutex); - err = setup_net(&init_net); + if (setup_net(&init_net)) + panic("Could not setup the initial network namespace"); rtnl_lock(); list_add_tail(&init_net.list, &net_namespace_list); rtnl_unlock(); mutex_unlock(&net_mutex); - if (err) - panic("Could not setup the initial network namespace"); return 0; } -- cgit v1.2.3 From 4ea7e38696c7e798c47ebbecadfd392f23f814f9 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 21 May 2009 07:36:08 +0000 Subject: dropmon: add ability to detect when hardware dropsrxpackets Patch to add the ability to detect drops in hardware interfaces via dropwatch. Adds a tracepoint to net_rx_action to signal everytime a napi instance is polled. The dropmon code then periodically checks to see if the rx_frames counter has changed, and if so, adds a drop notification to the netlink protocol, using the reserved all-0's vector to indicate the drop location was in hardware, rather than somewhere in the code. Signed-off-by: Neil Horman include/linux/net_dropmon.h | 8 ++ include/trace/napi.h | 11 +++ net/core/dev.c | 5 + net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 + net/core/netpoll.c | 2 6 files changed, 149 insertions(+), 5 deletions(-) Signed-off-by: David S. Miller --- net/core/dev.c | 5 +- net/core/drop_monitor.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++-- net/core/net-traces.c | 4 ++ net/core/netpoll.c | 2 + 4 files changed, 130 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 92ebeca29901..3942266d1f6c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -126,6 +126,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -2771,8 +2772,10 @@ static void net_rx_action(struct softirq_action *h) * accidently calling ->poll() when NAPI is not scheduled. 
*/ work = 0; - if (test_bit(NAPI_STATE_SCHED, &n->state)) + if (test_bit(NAPI_STATE_SCHED, &n->state)) { work = n->poll(n, weight); + trace_napi_poll(n); + } WARN_ON_ONCE(work > weight); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 2797b711a978..a6c2ac2828fb 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -22,8 +22,10 @@ #include #include #include +#include #include +#include #include @@ -38,7 +40,8 @@ static void send_dm_alert(struct work_struct *unused); * and the work handle that will send up * netlink alerts */ -struct sock *dm_sock; +static int trace_state = TRACE_OFF; +static spinlock_t trace_state_lock = SPIN_LOCK_UNLOCKED; struct per_cpu_dm_data { struct work_struct dm_alert_work; @@ -47,6 +50,13 @@ struct per_cpu_dm_data { struct timer_list send_timer; }; +struct dm_hw_stat_delta { + struct net_device *dev; + struct list_head list; + struct rcu_head rcu; + unsigned long last_drop_val; +}; + static struct genl_family net_drop_monitor_family = { .id = GENL_ID_GENERATE, .hdrsize = 0, @@ -59,7 +69,8 @@ static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; - +static unsigned long dm_hw_check_delta = 2*HZ; +static LIST_HEAD(hw_stats_list); static void reset_per_cpu_data(struct per_cpu_dm_data *data) { @@ -115,7 +126,7 @@ static void sched_send_work(unsigned long unused) schedule_work(&data->dm_alert_work); } -static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct nlmsghdr *nlh; @@ -159,24 +170,80 @@ out: return; } +static void trace_kfree_skb_hit(struct sk_buff *skb, void *location) +{ + trace_drop_common(skb, location); +} + +static void trace_napi_poll_hit(struct napi_struct *napi) +{ + struct dm_hw_stat_delta *new_stat; + + /* + * Ratelimit our check time to dm_hw_check_delta jiffies + */ + if (!time_after(jiffies, napi->dev->last_rx + dm_hw_check_delta)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(new_stat, &hw_stats_list, list) { + if ((new_stat->dev == napi->dev) && + (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) { + trace_drop_common(NULL, NULL); + new_stat->last_drop_val = napi->dev->stats.rx_dropped; + break; + } + } + rcu_read_unlock(); +} + + +static void free_dm_hw_stat(struct rcu_head *head) +{ + struct dm_hw_stat_delta *n; + n = container_of(head, struct dm_hw_stat_delta, rcu); + kfree(n); +} + static int set_all_monitor_traces(int state) { int rc = 0; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *temp; + + spin_lock(&trace_state_lock); switch (state) { case TRACE_ON: rc |= register_trace_kfree_skb(trace_kfree_skb_hit); + rc |= register_trace_napi_poll(trace_napi_poll_hit); break; case TRACE_OFF: rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit); + rc |= unregister_trace_napi_poll(trace_napi_poll_hit); tracepoint_synchronize_unregister(); + + /* + * Clean the device list + */ + list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) { + if (new_stat->dev == NULL) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + } + } break; default: rc = 1; break; } + if (!rc) + trace_state = state; + + spin_unlock(&trace_state_lock); + if (rc) return -EINPROGRESS; return rc; @@ -204,6 +271,44 @@ static int net_dm_cmd_trace(struct sk_buff *skb, return -ENOTSUPP; } +static int dropmon_net_event(struct notifier_block *ev_block, + unsigned long event, void *ptr) +{ + struct 
net_device *dev = ptr; + struct dm_hw_stat_delta *new_stat = NULL; + struct dm_hw_stat_delta *tmp; + + switch (event) { + case NETDEV_REGISTER: + new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL); + + if (!new_stat) + goto out; + + new_stat->dev = dev; + INIT_RCU_HEAD(&new_stat->rcu); + spin_lock(&trace_state_lock); + list_add_rcu(&new_stat->list, &hw_stats_list); + spin_unlock(&trace_state_lock); + break; + case NETDEV_UNREGISTER: + spin_lock(&trace_state_lock); + list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) { + if (new_stat->dev == dev) { + new_stat->dev = NULL; + if (trace_state == TRACE_OFF) { + list_del_rcu(&new_stat->list); + call_rcu(&new_stat->rcu, free_dm_hw_stat); + break; + } + } + } + spin_unlock(&trace_state_lock); + break; + } +out: + return NOTIFY_DONE; +} static struct genl_ops dropmon_ops[] = { { @@ -220,6 +325,10 @@ static struct genl_ops dropmon_ops[] = { }, }; +static struct notifier_block dropmon_net_notifier = { + .notifier_call = dropmon_net_event +}; + static int __init init_net_drop_monitor(void) { int cpu; @@ -243,12 +352,18 @@ static int __init init_net_drop_monitor(void) ret = genl_register_ops(&net_drop_monitor_family, &dropmon_ops[i]); if (ret) { - printk(KERN_CRIT "failed to register operation %d\n", + printk(KERN_CRIT "Failed to register operation %d\n", dropmon_ops[i].cmd); goto out_unreg; } } + rc = register_netdevice_notifier(&dropmon_net_notifier); + if (rc < 0) { + printk(KERN_CRIT "Failed to register netdevice notifier\n"); + goto out_unreg; + } + rc = 0; for_each_present_cpu(cpu) { @@ -259,6 +374,7 @@ static int __init init_net_drop_monitor(void) data->send_timer.data = cpu; data->send_timer.function = sched_send_work; } + goto out; out_unreg: diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb45665e4f..b07b25bd2cde 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,3 +28,6 @@ DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); + +DEFINE_TRACE(napi_poll); +EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 64f51eec6576..00b14e2c50ed 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * We maintain a small pool of fully-sized skbs, to make sure the @@ -137,6 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); + trace_napi_poll(napi->dev); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); -- cgit v1.2.3 From 7d18f114897ff4358419b14d551e704a4299a440 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 21 May 2009 23:30:09 -0700 Subject: net: Fix arg to trace_napi_poll() in netpoll. Reproted by Stephen Rothwell. Signed-off-by: David S. 
Miller --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 00b14e2c50ed..67b4f3e3d4a5 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -138,7 +138,7 @@ static int poll_one_napi(struct netpoll_info *npinfo, set_bit(NAPI_STATE_NPSVC, &napi->state); work = napi->poll(napi, budget); - trace_napi_poll(napi->dev); + trace_napi_poll(napi); clear_bit(NAPI_STATE_NPSVC, &napi->state); atomic_dec(&trapped); -- cgit v1.2.3 From 82c49a352e0fd7af7e79a922b863f33f619f3209 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 22 May 2009 22:11:37 +0000 Subject: skbuff: Move new code into __copy_skb_header Hi: skbuff: Move new __skb_clone code into __copy_skb_header It seems that people just keep on adding stuff to __skb_clone instead __copy_skb_header. This is wrong as it means your brand-new attributes won't always get copied as you intended. This patch moves them to the right place, and adds a comment to prevent this from happening again. Signed-off-by: Herbert Xu Thanks, Signed-off-by: David S. Miller --- net/core/skbuff.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d152394b2611..e47afb20b894 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -538,6 +538,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->protocol = old->protocol; new->mark = old->mark; + new->iif = old->iif; __nf_copy(new, old); #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) @@ -550,10 +551,18 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif #endif new->vlan_tci = old->vlan_tci; +#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) + new->do_not_encrypt = old->do_not_encrypt; + new->requeue = old->requeue; +#endif skb_copy_secmark(new, old); } +/* + * You should not add any new code to this function. Add it to + * __copy_skb_header above instead. + */ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) { #define C(x) n->x = skb->x @@ -569,16 +578,11 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->cloned = 1; n->nohdr = 0; n->destructor = NULL; - C(iif); C(tail); C(end); C(head); C(data); C(truesize); -#if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) - C(do_not_encrypt); - C(requeue); -#endif atomic_set(&n->users, 1); atomic_inc(&(skb_shinfo(skb)->dataref)); -- cgit v1.2.3 From 9bcb97cace615a9f57fca0b9d788e7d234d7fc95 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 22 May 2009 22:20:02 +0000 Subject: skbuff: Copy csum instead of csum_start/csum_offset Hi: skbuff: Copy csum instead of csum_start/csum_offset It's easier to copy the u32 csum instead of its two u16 constituents. Signed-off-by: Herbert Xu Cheers, Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e47afb20b894..47fbbb8827d9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -526,8 +526,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->sp = secpath_get(old->sp); #endif memcpy(new->cb, old->cb, sizeof(old->cb)); - new->csum_start = old->csum_start; - new->csum_offset = old->csum_offset; + new->csum = old->csum; new->local_df = old->local_df; new->pkt_type = old->pkt_type; new->ip_summed = old->ip_summed; -- cgit v1.2.3 From e3804cbebb67887879102925961d41b503f7fbe3 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Mon, 25 May 2009 01:53:53 -0700 Subject: net: remove COMPAT_NET_DEV_OPS All drivers are already converted to new net_device_ops API and nobody uses old API anymore. Signed-off-by: Alexander Beregalov Signed-off-by: David S. Miller --- net/core/dev.c | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3942266d1f6c..241613f6dd2f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4580,39 +4580,6 @@ unsigned long netdev_fix_features(unsigned long features, const char *name) } EXPORT_SYMBOL(netdev_fix_features); -/* Some devices need to (re-)set their netdev_ops inside - * ->init() or similar. If that happens, we have to setup - * the compat pointers again. - */ -void netdev_resync_ops(struct net_device *dev) -{ -#ifdef CONFIG_COMPAT_NET_DEV_OPS - const struct net_device_ops *ops = dev->netdev_ops; - - dev->init = ops->ndo_init; - dev->uninit = ops->ndo_uninit; - dev->open = ops->ndo_open; - dev->change_rx_flags = ops->ndo_change_rx_flags; - dev->set_rx_mode = ops->ndo_set_rx_mode; - dev->set_multicast_list = ops->ndo_set_multicast_list; - dev->set_mac_address = ops->ndo_set_mac_address; - dev->validate_addr = ops->ndo_validate_addr; - dev->do_ioctl = ops->ndo_do_ioctl; - dev->set_config = ops->ndo_set_config; - dev->change_mtu = ops->ndo_change_mtu; - dev->neigh_setup = ops->ndo_neigh_setup; - dev->tx_timeout = ops->ndo_tx_timeout; - dev->get_stats = ops->ndo_get_stats; - dev->vlan_rx_register = ops->ndo_vlan_rx_register; - dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid; - dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid; -#ifdef CONFIG_NET_POLL_CONTROLLER - dev->poll_controller = ops->ndo_poll_controller; -#endif -#endif -} -EXPORT_SYMBOL(netdev_resync_ops); - /** * register_netdevice - register a network device * @dev: device to register @@ -4652,23 +4619,6 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; -#ifdef CONFIG_COMPAT_NET_DEV_OPS - /* Netdevice_ops API compatibility support. - * This is temporary until all network devices are converted. - */ - if (dev->netdev_ops) { - netdev_resync_ops(dev); - } else { - char drivername[64]; - pr_info("%s (%s): not using net_device_ops yet\n", - dev->name, netdev_drivername(dev, drivername, 64)); - - /* This works only because net_device_ops and the - compatibility structure are the same. */ - dev->netdev_ops = (void *) &(dev->init); - } -#endif - /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); -- cgit v1.2.3 From a1dcb6628b9489504a3be2515580fc4de891f94a Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Mon, 25 May 2009 22:47:01 -0700 Subject: pkt_sched: gen_estimator: Fix signed integers right-shifts. 
Right-shifts of signed integers are implementation-defined so unportable. With feedback from: Eric Dumazet Signed-off-by: Jarek Poplawski Signed-off-by: David S. Miller --- net/core/gen_estimator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 6d62d4618cfc..78e5bfc454ae 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -128,12 +128,12 @@ static void est_timer(unsigned long arg) npackets = e->bstats->packets; brate = (nbytes - e->last_bytes)<<(7 - idx); e->last_bytes = nbytes; - e->avbps += ((s64)(brate - e->avbps)) >> e->ewma_log; + e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log); e->rate_est->bps = (e->avbps+0xF)>>5; rate = (npackets - e->last_packets)<<(12 - idx); e->last_packets = npackets; - e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; + e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log); e->rate_est->pps = (e->avpps+0x1FF)>>10; skip: read_unlock(&est_lock); -- cgit v1.2.3 From 08baf561083bc27a953aa087dd8a664bb2b88e8e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2009 22:58:01 -0700 Subject: net: txq_trans_update() helper We would like to get rid of netdev->trans_start = jiffies; that about all net drivers have to use in their start_xmit() function, and use txq->trans_start instead. This can be done generically in core network, as suggested by David. Some devices, (particularly loopback) dont need trans_start update, because they dont have transmit watchdog. We could add a new device flag, or rely on fact that txq->tran_start can be updated is txq->xmit_lock_owner is different than -1. Use a helper function to hide our choice. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 3 +++ net/core/netpoll.c | 5 ++++- net/core/pktgen.c | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 241613f6dd2f..5eb3e48ab31d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1698,6 +1698,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb->dst = NULL; } rc = ops->ndo_start_xmit(skb, dev); + if (rc == 0) + txq_trans_update(txq); /* * TODO: if skb_orphan() was called by * dev->hard_start_xmit() (for example, the unmodified @@ -1727,6 +1729,7 @@ gso: skb->next = nskb; return rc; } + txq_trans_update(txq); if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) return NETDEV_TX_BUSY; } while (skb->next); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 67b4f3e3d4a5..7ab31a7576a1 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -302,8 +302,11 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { if (__netif_tx_trylock(txq)) { - if (!netif_tx_queue_stopped(txq)) + if (!netif_tx_queue_stopped(txq)) { status = ops->ndo_start_xmit(skb, dev); + if (status == NETDEV_TX_OK) + txq_trans_update(txq); + } __netif_tx_unlock(txq); if (status == NETDEV_TX_OK) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0666a827bc62..b8ccd3c88d63 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3438,6 +3438,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) retry_now: ret = (*xmit)(pkt_dev->skb, odev); if (likely(ret == NETDEV_TX_OK)) { + txq_trans_update(txq); pkt_dev->last_ok = 1; pkt_dev->sofar++; pkt_dev->seq_num++; -- cgit v1.2.3 From 2b0cc7f78ba55e831c16fb8cb5c204d9d2ecc398 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 26 May 2009 21:05:19 -0700 Subject: net: Remove bogus reference to BUS_ID_SIZE in sysfs code. BUS_ID_SIZE is really no more, and device names are dynamically allocated and thus can be any necessary size. So remove the BUG check here making sure BUS_ID_SIZE is at least as large as IFNAMSIZ. Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b9641e816eee..3994680c08b9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -499,7 +499,6 @@ int netdev_register_kobject(struct net_device *net) dev->platform_data = net; dev->groups = groups; - BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ); dev_set_name(dev, "%s", net->name); #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 42da6994ca6d20ad1d4e30255dee98047db454e7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:19 +0000 Subject: gro: Open-code frags copy in skb_gro_receive gcc does a poor job at generating code for the memcpy of the frags array in skb_gro_receive, which is the primary purpose of that function when merging frags. In particular, it can't utilise the alignment information of the source and destination. This patch open-codes the copy so we process words instead of bytes. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d429c41e0dc4..c88426b51140 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2673,6 +2673,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) if (skb_shinfo(p)->frag_list) goto merge; else if (skb_headlen(skb) <= skb_gro_offset(skb)) { + skb_frag_t *frag; + int i; + if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags > MAX_SKB_FRAGS) return -E2BIG; @@ -2682,9 +2685,9 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) skb_shinfo(skb)->frags[0].size -= skb_gro_offset(skb) - skb_headlen(skb); - memcpy(skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags, - skb_shinfo(skb)->frags, - skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t)); + frag = skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + *frag++ = skb_shinfo(skb)->frags[i]; skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags; skb_shinfo(skb)->nr_frags = 0; -- cgit v1.2.3 From 78a478d0efd9e86e5345b436e130497b4e5846e8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:21 +0000 Subject: gro: Inline skb_gro_header and cache frag0 virtual address The function skb_gro_header is called four times per packet which quickly adds up at 10Gb/s. This patch inlines it to allow better optimisations. Some architectures perform multiplication for page_address, which is done by each skb_gro_header invocation. This patch caches that value in skb->cb to avoid the unnecessary multiplications. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5eb3e48ab31d..bdb1a738193d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2390,21 +2390,6 @@ void napi_gro_flush(struct napi_struct *napi) } EXPORT_SYMBOL(napi_gro_flush); -void *skb_gro_header(struct sk_buff *skb, unsigned int hlen) -{ - unsigned int offset = skb_gro_offset(skb); - - hlen += offset; - if (unlikely(skb_headlen(skb) || - skb_shinfo(skb)->frags[0].size < hlen || - PageHighMem(skb_shinfo(skb)->frags[0].page))) - return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL; - - return page_address(skb_shinfo(skb)->frags[0].page) + - skb_shinfo(skb)->frags[0].page_offset + offset; -} -EXPORT_SYMBOL(skb_gro_header); - int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff **pp = NULL; @@ -2520,6 +2505,18 @@ int napi_skb_finish(int ret, struct sk_buff *skb) } EXPORT_SYMBOL(napi_skb_finish); +void skb_gro_reset_offset(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->data_offset = 0; + NAPI_GRO_CB(skb)->frag0 = NULL; + + if (!skb_headlen(skb) && !PageHighMem(skb_shinfo(skb)->frags[0].page)) + NAPI_GRO_CB(skb)->frag0 = + page_address(skb_shinfo(skb)->frags[0].page) + + skb_shinfo(skb)->frags[0].page_offset; +} +EXPORT_SYMBOL(skb_gro_reset_offset); + int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { skb_gro_reset_offset(skb); -- cgit v1.2.3 From 67147ba99aeb48f2863e03b68e090088a34c1b5d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:22 +0000 Subject: gro: Localise offset/headlen in skb_gro_offset This patch stores the offset/headlen in local variables as they're used repeatedly in skb_gro_offset. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c88426b51140..168e949df6a1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2666,13 +2666,15 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) struct sk_buff *nskb; unsigned int headroom; unsigned int len = skb_gro_len(skb); + unsigned int offset = skb_gro_offset(skb); + unsigned int headlen = skb_headlen(skb); if (p->len + len >= 65536) return -E2BIG; if (skb_shinfo(p)->frag_list) goto merge; - else if (skb_headlen(skb) <= skb_gro_offset(skb)) { + else if (headlen <= offset) { skb_frag_t *frag; int i; @@ -2680,10 +2682,8 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) MAX_SKB_FRAGS) return -E2BIG; - skb_shinfo(skb)->frags[0].page_offset += - skb_gro_offset(skb) - skb_headlen(skb); - skb_shinfo(skb)->frags[0].size -= - skb_gro_offset(skb) - skb_headlen(skb); + skb_shinfo(skb)->frags[0].page_offset += offset - headlen; + skb_shinfo(skb)->frags[0].size -= offset - headlen; frag = skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) @@ -2736,16 +2736,13 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) p = nskb; merge: - if (skb_gro_offset(skb) > skb_headlen(skb)) { - skb_shinfo(skb)->frags[0].page_offset += - skb_gro_offset(skb) - skb_headlen(skb); - skb_shinfo(skb)->frags[0].size -= - skb_gro_offset(skb) - skb_headlen(skb); - skb_gro_reset_offset(skb); - skb_gro_pull(skb, skb_headlen(skb)); + if (offset > headlen) { + skb_shinfo(skb)->frags[0].page_offset += offset - headlen; + skb_shinfo(skb)->frags[0].size -= offset - headlen; + offset = headlen; } - __skb_pull(skb, skb_gro_offset(skb)); + __skb_pull(skb, offset); p->prev->next = skb; p->prev = skb; -- cgit v1.2.3 From 78d3fd0b7de844a6dad56e9620fc9d2271b32ab9 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:23 +0000 Subject: gro: Only use skb_gro_header for completely non-linear packets Currently skb_gro_header is used for packets which put the hardware header in skb->data with the rest in frags. Since the drivers that need this optimisation all provide completely non-linear packets, we can gain extra optimisations by only performing the frag0 optimisation for completely non-linear packets. In particular, we can simply test frag0 (instead of skb_headlen) to see whether the optimisation is in force. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index bdb1a738193d..f9d90c56b6f0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2510,7 +2510,8 @@ void skb_gro_reset_offset(struct sk_buff *skb) NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; - if (!skb_headlen(skb) && !PageHighMem(skb_shinfo(skb)->frags[0].page)) + if (skb->mac_header == skb->tail && + !PageHighMem(skb_shinfo(skb)->frags[0].page)) NAPI_GRO_CB(skb)->frag0 = page_address(skb_shinfo(skb)->frags[0].page) + skb_shinfo(skb)->frags[0].page_offset; -- cgit v1.2.3 From 7489594cb249aeb178287c9a43a9e4f366044259 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:27 +0000 Subject: gro: Optimise length comparison in skb_gro_header By caching frag0_len, we can avoid checking both frag0 and the length separately in skb_gro_header. 
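As a rough sketch of what the cached pair buys (not from any patch in this series; the helper name is invented, and hlen is assumed to be counted from the start of the GRO area, offset included), a header lookup can then be answered from the GRO control block alone in the common case:

/* Illustrative only -- assumes the kernel context of the surrounding
 * patches (NAPI_GRO_CB, skb_gro_offset, pskb_may_pull).
 */
static inline void *gro_header_sketch(struct sk_buff *skb, unsigned int hlen)
{
	unsigned int offset = skb_gro_offset(skb);

	/* Fast path: frag0 is set and frag0_len already covers the
	 * requested bytes, so no shinfo or page_address() work is needed.
	 */
	if (NAPI_GRO_CB(skb)->frag0_len >= hlen)
		return NAPI_GRO_CB(skb)->frag0 + offset;

	/* Slow path: make sure the bytes are in the linear area and read
	 * them from there (frag0 == NULL gives frag0_len == 0, so highmem
	 * and header-split packets always take this path).
	 */
	if (unlikely(!pskb_may_pull(skb, hlen)))
		return NULL;

	return skb->data + offset;
}
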
This helps as skb_gro_header is called four times per packet which amounts to a few million times at 10Gb/s. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index f9d90c56b6f0..b1722a2d1fbe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2509,12 +2509,15 @@ void skb_gro_reset_offset(struct sk_buff *skb) { NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; if (skb->mac_header == skb->tail && - !PageHighMem(skb_shinfo(skb)->frags[0].page)) + !PageHighMem(skb_shinfo(skb)->frags[0].page)) { NAPI_GRO_CB(skb)->frag0 = page_address(skb_shinfo(skb)->frags[0].page) + skb_shinfo(skb)->frags[0].page_offset; + NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; + } } EXPORT_SYMBOL(skb_gro_reset_offset); -- cgit v1.2.3 From a5b1cf288d4200506ab62fbb86cc81ace948a306 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:28 +0000 Subject: gro: Avoid unnecessary comparison after skb_gro_header For the overwhelming majority of cases, skb_gro_header's return value cannot be NULL. Yet we must check it because of its current form. This patch splits it up into multiple functions in order to avoid this. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b1722a2d1fbe..cd29e613bc5a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2590,17 +2590,24 @@ struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; struct ethhdr *eth; + unsigned int hlen; + unsigned int off; napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb); - eth = skb_gro_header(skb, sizeof(*eth)); - if (!eth) { - napi_reuse_skb(napi, skb); - skb = NULL; - goto out; + off = skb_gro_offset(skb); + hlen = off + sizeof(*eth); + eth = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, hlen)) { + eth = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!eth)) { + napi_reuse_skb(napi, skb); + skb = NULL; + goto out; + } } skb_gro_pull(skb, sizeof(*eth)); -- cgit v1.2.3 From cb18978cbf454c236db5e4191a12ef71eef9b3a0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:31 +0000 Subject: gro: Open-code final pskb_may_pull As we know the only packets which need the final pskb_may_pull are completely non-linear, and have all the required bits in frag0, we can perform a straight memcpy instead of going through pskb_may_pull and doing skb_copy_bits. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/dev.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index cd29e613bc5a..ed4550fd9ece 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2452,10 +2452,25 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) ret = GRO_HELD; pull: - if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) { - if (napi->gro_list == skb) - napi->gro_list = skb->next; - ret = GRO_DROP; + if (skb_headlen(skb) < skb_gro_offset(skb)) { + int grow = skb_gro_offset(skb) - skb_headlen(skb); + + BUG_ON(skb->end - skb->tail < grow); + + memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); + + skb->tail += grow; + skb->data_len -= grow; + + skb_shinfo(skb)->frags[0].page_offset += grow; + skb_shinfo(skb)->frags[0].size -= grow; + + if (unlikely(!skb_shinfo(skb)->frags[0].size)) { + put_page(skb_shinfo(skb)->frags[0].page); + memmove(skb_shinfo(skb)->frags, + skb_shinfo(skb)->frags + 1, + --skb_shinfo(skb)->nr_frags); + } } ok: -- cgit v1.2.3 From 66e92fcf1ded5dd0da30d016ed47882eb183ec71 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:32 +0000 Subject: gro: Nasty optimisations for page frags in skb_gro_receive This patch reverses the direction of the frags array copy in skb_gro_receive in order simplify the loop conditional. It also avoids touching the first element of the original frags array. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 168e949df6a1..19afb18abae9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2676,21 +2676,26 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) goto merge; else if (headlen <= offset) { skb_frag_t *frag; - int i; + skb_frag_t *frag2; + int i = skb_shinfo(skb)->nr_frags; + int nr_frags = skb_shinfo(p)->nr_frags + i; + + offset -= headlen; - if (skb_shinfo(p)->nr_frags + skb_shinfo(skb)->nr_frags > - MAX_SKB_FRAGS) + if (nr_frags > MAX_SKB_FRAGS) return -E2BIG; - skb_shinfo(skb)->frags[0].page_offset += offset - headlen; - skb_shinfo(skb)->frags[0].size -= offset - headlen; + skb_shinfo(p)->nr_frags = nr_frags; + skb_shinfo(skb)->nr_frags = 0; - frag = skb_shinfo(p)->frags + skb_shinfo(p)->nr_frags; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - *frag++ = skb_shinfo(skb)->frags[i]; + frag = skb_shinfo(p)->frags + nr_frags; + frag2 = skb_shinfo(skb)->frags + i; + do { + *--frag = *--frag2; + } while (--i); - skb_shinfo(p)->nr_frags += skb_shinfo(skb)->nr_frags; - skb_shinfo(skb)->nr_frags = 0; + frag->page_offset += offset; + frag->size -= offset; skb->truesize -= skb->data_len; skb->len -= skb->data_len; -- cgit v1.2.3 From 9aaa156cf9b7e9d9ed899f254283b91c4e3c36c8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 26 May 2009 18:50:33 +0000 Subject: gro: Store shinfo in local variable in skb_gro_receive This patch stores the two shinfo pointers in local variables because they're used over and over again in skb_gro_receive. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 19afb18abae9..8e815e685f28 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2664,6 +2664,8 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) { struct sk_buff *p = *head; struct sk_buff *nskb; + struct skb_shared_info *skbinfo = skb_shinfo(skb); + struct skb_shared_info *pinfo = skb_shinfo(p); unsigned int headroom; unsigned int len = skb_gro_len(skb); unsigned int offset = skb_gro_offset(skb); @@ -2672,24 +2674,24 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) if (p->len + len >= 65536) return -E2BIG; - if (skb_shinfo(p)->frag_list) + if (pinfo->frag_list) goto merge; else if (headlen <= offset) { skb_frag_t *frag; skb_frag_t *frag2; - int i = skb_shinfo(skb)->nr_frags; - int nr_frags = skb_shinfo(p)->nr_frags + i; + int i = skbinfo->nr_frags; + int nr_frags = pinfo->nr_frags + i; offset -= headlen; if (nr_frags > MAX_SKB_FRAGS) return -E2BIG; - skb_shinfo(p)->nr_frags = nr_frags; - skb_shinfo(skb)->nr_frags = 0; + pinfo->nr_frags = nr_frags; + skbinfo->nr_frags = 0; - frag = skb_shinfo(p)->frags + nr_frags; - frag2 = skb_shinfo(skb)->frags + i; + frag = pinfo->frags + nr_frags; + frag2 = skbinfo->frags + i; do { *--frag = *--frag2; } while (--i); @@ -2726,7 +2728,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); skb_shinfo(nskb)->frag_list = p; - skb_shinfo(nskb)->gso_size = skb_shinfo(p)->gso_size; + skb_shinfo(nskb)->gso_size = pinfo->gso_size; skb_header_release(p); nskb->prev = p; @@ -2742,8 +2744,8 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) merge: if (offset > headlen) { - skb_shinfo(skb)->frags[0].page_offset += offset - headlen; - skb_shinfo(skb)->frags[0].size -= offset - headlen; + skbinfo->frags[0].page_offset += offset - headlen; + skbinfo->frags[0].size -= offset - headlen; offset = headlen; } -- cgit v1.2.3 From 1ce8e7b57b3a4527ef83da1c5c7bd8a6b9d87b56 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2009 04:42:37 +0000 Subject: net: ALIGN/PTR_ALIGN cleanup in alloc_netdev_mq()/netdev_priv() Use ALIGN() and PTR_ALIGN() macros instead of handcoding them. Get rid of NETDEV_ALIGN_CONST ugly define Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ed4550fd9ece..32ceee17896e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4988,18 +4988,18 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, struct netdev_queue *tx; struct net_device *dev; size_t alloc_size; - void *p; + struct net_device *p; BUG_ON(strlen(name) >= sizeof(dev->name)); alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ - alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; + alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); alloc_size += sizeof_priv; } /* ensure 32-byte alignment of whole construct */ - alloc_size += NETDEV_ALIGN_CONST; + alloc_size += NETDEV_ALIGN - 1; p = kzalloc(alloc_size, GFP_KERNEL); if (!p) { @@ -5014,8 +5014,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, goto free_p; } - dev = (struct net_device *) - (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); + dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; if (dev_addr_init(dev)) -- cgit v1.2.3 From 2a91525c20d3aae15b33c189514b9e20e30ef8a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 May 2009 11:30:05 +0000 Subject: net: net/core/sock.c cleanup Pure style cleanup patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 103 ++++++++++++++++++++++++-------------------------------- 1 file changed, 44 insertions(+), 59 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 7dbf3ffb35cc..58dec9dff99a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -212,6 +212,7 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; /* Maximal space eaten by iovec or ancilliary data plus some space */ int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); +EXPORT_SYMBOL(sysctl_optmem_max); static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) { @@ -444,7 +445,7 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) int sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) { - struct sock *sk=sock->sk; + struct sock *sk = sock->sk; int val; int valbool; struct linger ling; @@ -463,15 +464,15 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (get_user(val, (int __user *)optval)) return -EFAULT; - valbool = val?1:0; + valbool = val ? 
1 : 0; lock_sock(sk); - switch(optname) { + switch (optname) { case SO_DEBUG: - if (val && !capable(CAP_NET_ADMIN)) { + if (val && !capable(CAP_NET_ADMIN)) ret = -EACCES; - } else + else sock_valbool_flag(sk, SOCK_DBG, valbool); break; case SO_REUSEADDR: @@ -582,7 +583,7 @@ set_rcvbuf: ret = -EINVAL; /* 1003.1g */ break; } - if (copy_from_user(&ling,optval,sizeof(ling))) { + if (copy_from_user(&ling, optval, sizeof(ling))) { ret = -EFAULT; break; } @@ -690,9 +691,8 @@ set_rcvbuf: case SO_MARK: if (!capable(CAP_NET_ADMIN)) ret = -EPERM; - else { + else sk->sk_mark = val; - } break; /* We implement the SO_SNDLOWAT etc to @@ -704,6 +704,7 @@ set_rcvbuf: release_sock(sk); return ret; } +EXPORT_SYMBOL(sock_setsockopt); int sock_getsockopt(struct socket *sock, int level, int optname, @@ -727,7 +728,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, memset(&v, 0, sizeof(v)); - switch(optname) { + switch (optname) { case SO_DEBUG: v.val = sock_flag(sk, SOCK_DBG); break; @@ -762,7 +763,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_ERROR: v.val = -sock_error(sk); - if (v.val==0) + if (v.val == 0) v.val = xchg(&sk->sk_err_soft, 0); break; @@ -816,7 +817,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_RCVTIMEO: - lv=sizeof(struct timeval); + lv = sizeof(struct timeval); if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { v.tm.tv_sec = 0; v.tm.tv_usec = 0; @@ -827,7 +828,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_SNDTIMEO: - lv=sizeof(struct timeval); + lv = sizeof(struct timeval); if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { v.tm.tv_sec = 0; v.tm.tv_usec = 0; @@ -842,7 +843,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_SNDLOWAT: - v.val=1; + v.val = 1; break; case SO_PASSCRED: @@ -1002,6 +1003,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, return sk; } +EXPORT_SYMBOL(sk_alloc); void sk_free(struct sock *sk) { @@ -1026,6 +1028,7 @@ void sk_free(struct sock *sk) put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } +EXPORT_SYMBOL(sk_free); /* * Last sock_put should drop referrence to sk->sk_net. It has already @@ -1126,7 +1129,6 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) out: return newsk; } - EXPORT_SYMBOL_GPL(sk_clone); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) @@ -1177,6 +1179,7 @@ void sock_wfree(struct sk_buff *skb) sk->sk_write_space(sk); sock_put(sk); } +EXPORT_SYMBOL(sock_wfree); /* * Read buffer destructor automatically called from kfree_skb. @@ -1188,6 +1191,7 @@ void sock_rfree(struct sk_buff *skb) atomic_sub(skb->truesize, &sk->sk_rmem_alloc); sk_mem_uncharge(skb->sk, skb->truesize); } +EXPORT_SYMBOL(sock_rfree); int sock_i_uid(struct sock *sk) @@ -1199,6 +1203,7 @@ int sock_i_uid(struct sock *sk) read_unlock(&sk->sk_callback_lock); return uid; } +EXPORT_SYMBOL(sock_i_uid); unsigned long sock_i_ino(struct sock *sk) { @@ -1209,6 +1214,7 @@ unsigned long sock_i_ino(struct sock *sk) read_unlock(&sk->sk_callback_lock); return ino; } +EXPORT_SYMBOL(sock_i_ino); /* * Allocate a skb from the socket's send buffer. 
@@ -1217,7 +1223,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority) { if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { - struct sk_buff * skb = alloc_skb(size, priority); + struct sk_buff *skb = alloc_skb(size, priority); if (skb) { skb_set_owner_w(skb, sk); return skb; @@ -1225,6 +1231,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, } return NULL; } +EXPORT_SYMBOL(sock_wmalloc); /* * Allocate a skb from the socket's receive buffer. @@ -1261,6 +1268,7 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) } return NULL; } +EXPORT_SYMBOL(sock_kmalloc); /* * Free an option memory block. @@ -1270,11 +1278,12 @@ void sock_kfree_s(struct sock *sk, void *mem, int size) kfree(mem); atomic_sub(size, &sk->sk_omem_alloc); } +EXPORT_SYMBOL(sock_kfree_s); /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. I think, these locks should be removed for datagram sockets. */ -static long sock_wait_for_wmem(struct sock * sk, long timeo) +static long sock_wait_for_wmem(struct sock *sk, long timeo) { DEFINE_WAIT(wait); @@ -1392,6 +1401,7 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, { return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); } +EXPORT_SYMBOL(sock_alloc_send_skb); static void __lock_sock(struct sock *sk) { @@ -1460,7 +1470,6 @@ int sk_wait_data(struct sock *sk, long *timeo) finish_wait(sk->sk_sleep, &wait); return rc; } - EXPORT_SYMBOL(sk_wait_data); /** @@ -1541,7 +1550,6 @@ suppress_allocation: atomic_sub(amt, prot->memory_allocated); return 0; } - EXPORT_SYMBOL(__sk_mem_schedule); /** @@ -1560,7 +1568,6 @@ void __sk_mem_reclaim(struct sock *sk) (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0])) *prot->memory_pressure = 0; } - EXPORT_SYMBOL(__sk_mem_reclaim); @@ -1575,78 +1582,92 @@ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_bind); int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len, int flags) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_connect); int sock_no_socketpair(struct socket *sock1, struct socket *sock2) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_socketpair); int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_accept); int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int *len, int peer) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_getname); -unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt) +unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt) { return 0; } +EXPORT_SYMBOL(sock_no_poll); int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_ioctl); int sock_no_listen(struct socket *sock, int backlog) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_listen); int sock_no_shutdown(struct socket *sock, int how) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_shutdown); int sock_no_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_setsockopt); int sock_no_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_getsockopt); int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t len) { return -EOPNOTSUPP; } 
+EXPORT_SYMBOL(sock_no_sendmsg); int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, size_t len, int flags) { return -EOPNOTSUPP; } +EXPORT_SYMBOL(sock_no_recvmsg); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { /* Mirror missing mmap method error code */ return -ENODEV; } +EXPORT_SYMBOL(sock_no_mmap); ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { @@ -1660,6 +1681,7 @@ ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, siz kunmap(page); return res; } +EXPORT_SYMBOL(sock_no_sendpage); /* * Default Socket Callbacks @@ -1723,6 +1745,7 @@ void sk_send_sigurg(struct sock *sk) if (send_sigurg(&sk->sk_socket->file->f_owner)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } +EXPORT_SYMBOL(sk_send_sigurg); void sk_reset_timer(struct sock *sk, struct timer_list* timer, unsigned long expires) @@ -1730,7 +1753,6 @@ void sk_reset_timer(struct sock *sk, struct timer_list* timer, if (!mod_timer(timer, expires)) sock_hold(sk); } - EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) @@ -1738,7 +1760,6 @@ void sk_stop_timer(struct sock *sk, struct timer_list* timer) if (timer_pending(timer) && del_timer(timer)) __sock_put(sk); } - EXPORT_SYMBOL(sk_stop_timer); void sock_init_data(struct socket *sock, struct sock *sk) @@ -1797,6 +1818,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) atomic_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_drops, 0); } +EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { @@ -1812,7 +1834,6 @@ void lock_sock_nested(struct sock *sk, int subclass) mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); local_bh_enable(); } - EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) @@ -1895,7 +1916,6 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL(sock_common_getsockopt); #ifdef CONFIG_COMPAT @@ -1925,7 +1945,6 @@ int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_namelen = addr_len; return err; } - EXPORT_SYMBOL(sock_common_recvmsg); /* @@ -1938,7 +1957,6 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); } - EXPORT_SYMBOL(sock_common_setsockopt); #ifdef CONFIG_COMPAT @@ -1989,7 +2007,6 @@ void sk_common_release(struct sock *sk) sk_refcnt_debug_release(sk); sock_put(sk); } - EXPORT_SYMBOL(sk_common_release); static DEFINE_RWLOCK(proto_list_lock); @@ -2171,7 +2188,6 @@ out_free_sock_slab: out: return -ENOBUFS; } - EXPORT_SYMBOL(proto_register); void proto_unregister(struct proto *prot) @@ -2198,7 +2214,6 @@ void proto_unregister(struct proto *prot) prot->twsk_prot->twsk_slab = NULL; } } - EXPORT_SYMBOL(proto_unregister); #ifdef CONFIG_PROC_FS @@ -2324,33 +2339,3 @@ static int __init proto_init(void) subsys_initcall(proto_init); #endif /* PROC_FS */ - -EXPORT_SYMBOL(sk_alloc); -EXPORT_SYMBOL(sk_free); -EXPORT_SYMBOL(sk_send_sigurg); -EXPORT_SYMBOL(sock_alloc_send_skb); -EXPORT_SYMBOL(sock_init_data); -EXPORT_SYMBOL(sock_kfree_s); -EXPORT_SYMBOL(sock_kmalloc); -EXPORT_SYMBOL(sock_no_accept); -EXPORT_SYMBOL(sock_no_bind); -EXPORT_SYMBOL(sock_no_connect); -EXPORT_SYMBOL(sock_no_getname); -EXPORT_SYMBOL(sock_no_getsockopt); -EXPORT_SYMBOL(sock_no_ioctl); -EXPORT_SYMBOL(sock_no_listen); -EXPORT_SYMBOL(sock_no_mmap); 
-EXPORT_SYMBOL(sock_no_poll); -EXPORT_SYMBOL(sock_no_recvmsg); -EXPORT_SYMBOL(sock_no_sendmsg); -EXPORT_SYMBOL(sock_no_sendpage); -EXPORT_SYMBOL(sock_no_setsockopt); -EXPORT_SYMBOL(sock_no_shutdown); -EXPORT_SYMBOL(sock_no_socketpair); -EXPORT_SYMBOL(sock_rfree); -EXPORT_SYMBOL(sock_setsockopt); -EXPORT_SYMBOL(sock_wfree); -EXPORT_SYMBOL(sock_wmalloc); -EXPORT_SYMBOL(sock_i_uid); -EXPORT_SYMBOL(sock_i_ino); -EXPORT_SYMBOL(sysctl_optmem_max); -- cgit v1.2.3 From ccffad25b5136958d4769ed6de5e87992dd9c65c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 22 May 2009 23:22:17 +0000 Subject: net: convert unicast addr list This patch converts unicast address list to standard list_head using previously introduced struct netdev_hw_addr. It also relaxes the locking. Original spinlock (still used for multicast addresses) is not needed and is no longer used for a protection of this list. All reading and writing takes place under rtnl (with no changes). I also removed a possibility to specify the length of the address while adding or deleting unicast address. It's always dev->addr_len. The convertion touched especially e1000 and ixgbe codes when the change is not so trivial. Signed-off-by: Jiri Pirko drivers/net/bnx2.c | 13 +-- drivers/net/e1000/e1000_main.c | 24 +++-- drivers/net/ixgbe/ixgbe_common.c | 14 ++-- drivers/net/ixgbe/ixgbe_common.h | 4 +- drivers/net/ixgbe/ixgbe_main.c | 6 +- drivers/net/ixgbe/ixgbe_type.h | 4 +- drivers/net/macvlan.c | 11 +- drivers/net/mv643xx_eth.c | 11 +- drivers/net/niu.c | 7 +- drivers/net/virtio_net.c | 7 +- drivers/s390/net/qeth_l2_main.c | 6 +- drivers/scsi/fcoe/fcoe.c | 16 ++-- include/linux/netdevice.h | 18 ++-- net/8021q/vlan.c | 4 +- net/8021q/vlan_dev.c | 10 +- net/core/dev.c | 195 +++++++++++++++++++++++++++----------- net/dsa/slave.c | 10 +- net/packet/af_packet.c | 4 +- 18 files changed, 227 insertions(+), 137 deletions(-) Signed-off-by: David S. 
Miller --- net/core/dev.c | 195 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 138 insertions(+), 57 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 32ceee17896e..e2fcc5f10177 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3473,8 +3473,9 @@ void dev_set_rx_mode(struct net_device *dev) /* hw addresses list handling functions */ -static int __hw_addr_add(struct list_head *list, unsigned char *addr, - int addr_len, unsigned char addr_type) +static int __hw_addr_add(struct list_head *list, int *delta, + unsigned char *addr, int addr_len, + unsigned char addr_type) { struct netdev_hw_addr *ha; int alloc_size; @@ -3482,6 +3483,15 @@ static int __hw_addr_add(struct list_head *list, unsigned char *addr, if (addr_len > MAX_ADDR_LEN) return -EINVAL; + list_for_each_entry(ha, list, list) { + if (!memcmp(ha->addr, addr, addr_len) && + ha->type == addr_type) { + ha->refcount++; + return 0; + } + } + + alloc_size = sizeof(*ha); if (alloc_size < L1_CACHE_BYTES) alloc_size = L1_CACHE_BYTES; @@ -3490,7 +3500,11 @@ static int __hw_addr_add(struct list_head *list, unsigned char *addr, return -ENOMEM; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; + ha->refcount = 1; + ha->synced = false; list_add_tail_rcu(&ha->list, list); + if (delta) + (*delta)++; return 0; } @@ -3502,29 +3516,30 @@ static void ha_rcu_free(struct rcu_head *head) kfree(ha); } -static int __hw_addr_del_ii(struct list_head *list, unsigned char *addr, - int addr_len, unsigned char addr_type, - int ignore_index) +static int __hw_addr_del(struct list_head *list, int *delta, + unsigned char *addr, int addr_len, + unsigned char addr_type) { struct netdev_hw_addr *ha; - int i = 0; list_for_each_entry(ha, list, list) { - if (i++ != ignore_index && - !memcmp(ha->addr, addr, addr_len) && + if (!memcmp(ha->addr, addr, addr_len) && (ha->type == addr_type || !addr_type)) { + if (--ha->refcount) + return 0; list_del_rcu(&ha->list); call_rcu(&ha->rcu_head, ha_rcu_free); + if (delta) + (*delta)--; return 0; } } return -ENOENT; } -static int __hw_addr_add_multiple_ii(struct list_head *to_list, - struct list_head *from_list, - int addr_len, unsigned char addr_type, - int ignore_index) +static int __hw_addr_add_multiple(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int addr_len, + unsigned char addr_type) { int err; struct netdev_hw_addr *ha, *ha2; @@ -3532,7 +3547,8 @@ static int __hw_addr_add_multiple_ii(struct list_head *to_list, list_for_each_entry(ha, from_list, list) { type = addr_type ? addr_type : ha->type; - err = __hw_addr_add(to_list, ha->addr, addr_len, type); + err = __hw_addr_add(to_list, to_delta, ha->addr, + addr_len, type); if (err) goto unroll; } @@ -3543,27 +3559,69 @@ unroll: if (ha2 == ha) break; type = addr_type ? addr_type : ha2->type; - __hw_addr_del_ii(to_list, ha2->addr, addr_len, type, - ignore_index); + __hw_addr_del(to_list, to_delta, ha2->addr, + addr_len, type); } return err; } -static void __hw_addr_del_multiple_ii(struct list_head *to_list, - struct list_head *from_list, - int addr_len, unsigned char addr_type, - int ignore_index) +static void __hw_addr_del_multiple(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int addr_len, + unsigned char addr_type) { struct netdev_hw_addr *ha; unsigned char type; list_for_each_entry(ha, from_list, list) { type = addr_type ? 
addr_type : ha->type; - __hw_addr_del_ii(to_list, ha->addr, addr_len, addr_type, - ignore_index); + __hw_addr_del(to_list, to_delta, ha->addr, + addr_len, addr_type); + } +} + +static int __hw_addr_sync(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int *from_delta, + int addr_len) +{ + int err = 0; + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, from_list, list) { + if (!ha->synced) { + err = __hw_addr_add(to_list, to_delta, ha->addr, + addr_len, ha->type); + if (err) + break; + ha->synced = true; + ha->refcount++; + } else if (ha->refcount == 1) { + __hw_addr_del(to_list, to_delta, ha->addr, + addr_len, ha->type); + __hw_addr_del(from_list, from_delta, ha->addr, + addr_len, ha->type); + } } + return err; } +static void __hw_addr_unsync(struct list_head *to_list, int *to_delta, + struct list_head *from_list, int *from_delta, + int addr_len) +{ + struct netdev_hw_addr *ha, *tmp; + + list_for_each_entry_safe(ha, tmp, from_list, list) { + if (ha->synced) { + __hw_addr_del(to_list, to_delta, ha->addr, + addr_len, ha->type); + ha->synced = false; + __hw_addr_del(from_list, from_delta, ha->addr, + addr_len, ha->type); + } + } +} + + static void __hw_addr_flush(struct list_head *list) { struct netdev_hw_addr *ha, *tmp; @@ -3594,7 +3652,7 @@ static int dev_addr_init(struct net_device *dev) INIT_LIST_HEAD(&dev->dev_addr_list); memset(addr, 0, sizeof(*addr)); - err = __hw_addr_add(&dev->dev_addr_list, addr, sizeof(*addr), + err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, sizeof(*addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* @@ -3626,7 +3684,7 @@ int dev_addr_add(struct net_device *dev, unsigned char *addr, ASSERT_RTNL(); - err = __hw_addr_add(&dev->dev_addr_list, addr, dev->addr_len, + err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); @@ -3649,11 +3707,20 @@ int dev_addr_del(struct net_device *dev, unsigned char *addr, unsigned char addr_type) { int err; + struct netdev_hw_addr *ha; ASSERT_RTNL(); - err = __hw_addr_del_ii(&dev->dev_addr_list, addr, dev->addr_len, - addr_type, 0); + /* + * We can not remove the first address from the list because + * dev->dev_addr points to that. 
+ */ + ha = list_first_entry(&dev->dev_addr_list, struct netdev_hw_addr, list); + if (ha->addr == dev->dev_addr && ha->refcount == 1) + return -ENOENT; + + err = __hw_addr_del(&dev->dev_addr_list, NULL, addr, dev->addr_len, + addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return err; @@ -3680,9 +3747,9 @@ int dev_addr_add_multiple(struct net_device *to_dev, if (from_dev->addr_len != to_dev->addr_len) return -EINVAL; - err = __hw_addr_add_multiple_ii(&to_dev->dev_addr_list, - &from_dev->dev_addr_list, - to_dev->addr_len, addr_type, 0); + err = __hw_addr_add_multiple(&to_dev->dev_addr_list, NULL, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type); if (!err) call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); return err; @@ -3707,9 +3774,9 @@ int dev_addr_del_multiple(struct net_device *to_dev, if (from_dev->addr_len != to_dev->addr_len) return -EINVAL; - __hw_addr_del_multiple_ii(&to_dev->dev_addr_list, - &from_dev->dev_addr_list, - to_dev->addr_len, addr_type, 0); + __hw_addr_del_multiple(&to_dev->dev_addr_list, NULL, + &from_dev->dev_addr_list, + to_dev->addr_len, addr_type); call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); return 0; } @@ -3779,24 +3846,22 @@ int __dev_addr_add(struct dev_addr_list **list, int *count, * dev_unicast_delete - Release secondary unicast address. * @dev: device * @addr: address to delete - * @alen: length of @addr * * Release reference to a secondary unicast address and remove it * from the device if the reference count drops to zero. * * The caller must hold the rtnl_mutex. */ -int dev_unicast_delete(struct net_device *dev, void *addr, int alen) +int dev_unicast_delete(struct net_device *dev, void *addr) { int err; ASSERT_RTNL(); - netif_addr_lock_bh(dev); - err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0); + err = __hw_addr_del(&dev->uc_list, &dev->uc_count, addr, + dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_delete); @@ -3805,24 +3870,22 @@ EXPORT_SYMBOL(dev_unicast_delete); * dev_unicast_add - add a secondary unicast address * @dev: device * @addr: address to add - * @alen: length of @addr * * Add a secondary unicast address to the device or increase * the reference count if it already exists. * * The caller must hold the rtnl_mutex. */ -int dev_unicast_add(struct net_device *dev, void *addr, int alen) +int dev_unicast_add(struct net_device *dev, void *addr) { int err; ASSERT_RTNL(); - netif_addr_lock_bh(dev); - err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0); + err = __hw_addr_add(&dev->uc_list, &dev->uc_count, addr, + dev->addr_len, NETDEV_HW_ADDR_T_UNICAST); if (!err) __dev_set_rx_mode(dev); - netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(dev_unicast_add); @@ -3879,8 +3942,7 @@ void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, * @from: source device * * Add newly added addresses to the destination device and release - * addresses that have no users left. The source device must be - * locked by netif_tx_lock_bh. + * addresses that have no users left. * * This function is intended to be called from the dev->set_rx_mode * function of layered software devices. 
@@ -3889,12 +3951,15 @@ int dev_unicast_sync(struct net_device *to, struct net_device *from) { int err = 0; - netif_addr_lock_bh(to); - err = __dev_addr_sync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count); + ASSERT_RTNL(); + + if (to->addr_len != from->addr_len) + return -EINVAL; + + err = __hw_addr_sync(&to->uc_list, &to->uc_count, + &from->uc_list, &from->uc_count, to->addr_len); if (!err) __dev_set_rx_mode(to); - netif_addr_unlock_bh(to); return err; } EXPORT_SYMBOL(dev_unicast_sync); @@ -3910,18 +3975,33 @@ EXPORT_SYMBOL(dev_unicast_sync); */ void dev_unicast_unsync(struct net_device *to, struct net_device *from) { - netif_addr_lock_bh(from); - netif_addr_lock(to); + ASSERT_RTNL(); - __dev_addr_unsync(&to->uc_list, &to->uc_count, - &from->uc_list, &from->uc_count); - __dev_set_rx_mode(to); + if (to->addr_len != from->addr_len) + return; - netif_addr_unlock(to); - netif_addr_unlock_bh(from); + __hw_addr_unsync(&to->uc_list, &to->uc_count, + &from->uc_list, &from->uc_count, to->addr_len); + __dev_set_rx_mode(to); } EXPORT_SYMBOL(dev_unicast_unsync); +static void dev_unicast_flush(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + __hw_addr_flush(&dev->uc_list); + dev->uc_count = 0; +} + +static void dev_unicast_init(struct net_device *dev) +{ + /* rtnl_mutex must be held here */ + + INIT_LIST_HEAD(&dev->uc_list); +} + + static void __dev_addr_discard(struct dev_addr_list **list) { struct dev_addr_list *tmp; @@ -3940,9 +4020,6 @@ static void dev_addr_discard(struct net_device *dev) { netif_addr_lock_bh(dev); - __dev_addr_discard(&dev->uc_list); - dev->uc_count = 0; - __dev_addr_discard(&dev->mc_list); dev->mc_count = 0; @@ -4535,6 +4612,7 @@ static void rollback_registered(struct net_device *dev) /* * Flush the unicast and multicast chains */ + dev_unicast_flush(dev); dev_addr_discard(dev); if (dev->netdev_ops->ndo_uninit) @@ -5020,6 +5098,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, if (dev_addr_init(dev)) goto free_tx; + dev_unicast_init(dev); + dev_net_set(dev, &init_net); dev->_tx = tx; @@ -5223,6 +5303,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* * Flush the unicast and multicast chains */ + dev_unicast_flush(dev); dev_addr_discard(dev); netdev_unregister_kobject(dev); -- cgit v1.2.3 From adf30907d63893e4208dfe3f5c88ae12bc2f25d5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jun 2009 05:19:30 +0000 Subject: net: skb->dst accessors Define three accessors to get/set dst attached to a skb struct dst_entry *skb_dst(const struct sk_buff *skb) void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) void skb_dst_drop(struct sk_buff *skb) This one should replace occurrences of : dst_release(skb->dst) skb->dst = NULL; Delete skb->dst field Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 7 +++---- net/core/neighbour.c | 11 +++++------ net/core/skbuff.c | 4 ++-- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index e2fcc5f10177..34b49a6a22fd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1693,10 +1693,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, * If device doesnt need skb->dst, release it right now while * its hot in this cpu cache */ - if ((dev->priv_flags & IFF_XMIT_DST_RELEASE) && skb->dst) { - dst_release(skb->dst); - skb->dst = NULL; - } + if (dev->priv_flags & IFF_XMIT_DST_RELEASE) + skb_dst_drop(skb); + rc = ops->ndo_start_xmit(skb, dev); if (rc == 0) txq_trans_update(txq); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a1cbce7fdae5..c54229befcfe 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1088,8 +1088,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, struct neighbour *n1 = neigh; write_unlock_bh(&neigh->lock); /* On shaper/eql skb->dst->neighbour != neigh :( */ - if (skb->dst && skb->dst->neighbour) - n1 = skb->dst->neighbour; + if (skb_dst(skb) && skb_dst(skb)->neighbour) + n1 = skb_dst(skb)->neighbour; n1->output(skb); write_lock_bh(&neigh->lock); } @@ -1182,7 +1182,7 @@ EXPORT_SYMBOL(neigh_compat_output); int neigh_resolve_output(struct sk_buff *skb) { - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh; int rc = 0; @@ -1229,7 +1229,7 @@ EXPORT_SYMBOL(neigh_resolve_output); int neigh_connected_output(struct sk_buff *skb) { int err; - struct dst_entry *dst = skb->dst; + struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; @@ -1298,8 +1298,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } - dst_release(skb->dst); - skb->dst = NULL; + skb_dst_drop(skb); dev_hold(skb->dev); __skb_queue_tail(&tbl->proxy_queue, skb); mod_timer(&tbl->proxy_timer, sched_next); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 8e815e685f28..6adf19ec95cc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -381,7 +381,7 @@ static void kfree_skbmem(struct sk_buff *skb) static void skb_release_head_state(struct sk_buff *skb) { - dst_release(skb->dst); + skb_dst_drop(skb); #ifdef CONFIG_XFRM secpath_put(skb->sp); #endif @@ -521,7 +521,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->transport_header = old->transport_header; new->network_header = old->network_header; new->mac_header = old->mac_header; - new->dst = dst_clone(old->dst); + skb_dst_set(new, dst_clone(skb_dst(old))); #ifdef CONFIG_XFRM new->sp = secpath_get(old->sp); #endif -- cgit v1.2.3 From 3b8bcfd5d31ea0fec58681d035544ace707d2536 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 30 May 2009 01:39:53 +0200 Subject: net: introduce pre-up netdev notifier NETDEV_UP is called after the device is set UP, but sometimes it is useful to be able to veto the device UP. Introduce a new NETDEV_PRE_UP notifier that can be used for exactly this. The first use case will be cfg80211 denying interfaces to be set UP if the device is known to be rfkill'ed. Signed-off-by: Johannes Berg Acked-by: David S. Miller Signed-off-by: John W. 
Linville --- net/core/dev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 34b49a6a22fd..1f38401fc028 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1048,7 +1048,7 @@ void dev_load(struct net *net, const char *name) int dev_open(struct net_device *dev) { const struct net_device_ops *ops = dev->netdev_ops; - int ret = 0; + int ret; ASSERT_RTNL(); @@ -1065,6 +1065,11 @@ int dev_open(struct net_device *dev) if (!netif_device_present(dev)) return -ENODEV; + ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); + ret = notifier_to_errno(ret); + if (ret) + return ret; + /* * Call device private open method */ -- cgit v1.2.3 From 5ff8dda3035d95df5bf6979136eb82b0e301726b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 4 Jun 2009 01:22:01 +0000 Subject: net: Ensure partial checksum offset is inside the skb head On Thu, Jun 04, 2009 at 09:06:00PM +1000, Herbert Xu wrote: > > tun: Optimise handling of bogus gso->hdr_len > > As all current versions of virtio_net generate a value for the > header length that's too small, we should optimise this so that > we don't copy it twice. This can be done by ensuring that it is > at least as large as the place where we'll write the checksum. > > Signed-off-by: Herbert Xu With this applied we can strengthen the partial checksum check: In skb_partial_csum_set we check to see if the checksum offset is within the packet. However, we really should check that it is within the skb head as that's the only bit we can modify without copying. Signed-off-by: Herbert Xu Acked-by: Rusty Russell Signed-off-by: David S. Miller --- net/core/skbuff.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6adf19ec95cc..a2473b1600e3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3026,12 +3026,12 @@ EXPORT_SYMBOL_GPL(skb_tstamp_tx); */ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) { - if (unlikely(start > skb->len - 2) || - unlikely((int)start + off > skb->len - 2)) { + if (unlikely(start > skb_headlen(skb)) || + unlikely((int)start + off > skb_headlen(skb) - 2)) { if (net_ratelimit()) printk(KERN_WARNING "bad partial csum: csum=%u/%u len=%u\n", - start, off, skb->len); + start, off, skb_headlen(skb)); return false; } skb->ip_summed = CHECKSUM_PARTIAL; -- cgit v1.2.3 From eae3f29cc73f83cc3f1891d3ad40021b5172c630 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Jun 2009 04:03:35 +0000 Subject: net: num_dma_maps is not used Get rid of num_dma_maps in struct skb_shared_info, as it seems unused. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skb_dma_map.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c index 86234923a3b7..7adb623ef664 100644 --- a/net/core/skb_dma_map.c +++ b/net/core/skb_dma_map.c @@ -30,7 +30,6 @@ int skb_dma_map(struct device *dev, struct sk_buff *skb, goto unwind; sp->dma_maps[i + 1] = map; } - sp->num_dma_maps = i + 1; return 0; -- cgit v1.2.3 From 042a53a9e437feaf2230dd2cadcecfae9c7bfe05 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Jun 2009 04:04:16 +0000 Subject: net: skb_shared_info optimization skb_dma_unmap() is quite expensive for small packets, because we use two different cache lines from skb_shared_info. 
One to access nr_frags, one to access dma_maps[0] Instead of dma_maps being an array of MAX_SKB_FRAGS + 1 elements, let dma_head alone in a new dma_head field, close to nr_frags, to reduce cache lines misses. Tested on my dev machine (bnx2 & tg3 adapters), nice speedup ! Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skb_dma_map.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/skb_dma_map.c b/net/core/skb_dma_map.c index 7adb623ef664..79687dfd6957 100644 --- a/net/core/skb_dma_map.c +++ b/net/core/skb_dma_map.c @@ -20,7 +20,7 @@ int skb_dma_map(struct device *dev, struct sk_buff *skb, if (dma_mapping_error(dev, map)) goto out_err; - sp->dma_maps[0] = map; + sp->dma_head = map; for (i = 0; i < sp->nr_frags; i++) { skb_frag_t *fp = &sp->frags[i]; @@ -28,7 +28,7 @@ int skb_dma_map(struct device *dev, struct sk_buff *skb, fp->size, dir); if (dma_mapping_error(dev, map)) goto unwind; - sp->dma_maps[i + 1] = map; + sp->dma_maps[i] = map; } return 0; @@ -37,10 +37,10 @@ unwind: while (--i >= 0) { skb_frag_t *fp = &sp->frags[i]; - dma_unmap_page(dev, sp->dma_maps[i + 1], + dma_unmap_page(dev, sp->dma_maps[i], fp->size, dir); } - dma_unmap_single(dev, sp->dma_maps[0], + dma_unmap_single(dev, sp->dma_head, skb_headlen(skb), dir); out_err: return -ENOMEM; @@ -53,12 +53,12 @@ void skb_dma_unmap(struct device *dev, struct sk_buff *skb, struct skb_shared_info *sp = skb_shinfo(skb); int i; - dma_unmap_single(dev, sp->dma_maps[0], + dma_unmap_single(dev, sp->dma_head, skb_headlen(skb), dir); for (i = 0; i < sp->nr_frags; i++) { skb_frag_t *fp = &sp->frags[i]; - dma_unmap_page(dev, sp->dma_maps[i + 1], + dma_unmap_page(dev, sp->dma_maps[i], fp->size, dir); } } -- cgit v1.2.3 From d2d27bfd11659675fdd1c20b1c7f8f59873cad60 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 5 Jun 2009 09:35:40 +0000 Subject: net: Fix skb_copy_datagram_from_iovec() to pass the right offset I am working on enabling UFO between KVM guests using virtio-net and i have some patches that i got working with 2.6.30-rc8. When i wanted to try them with net-next-2.6, i noticed that virtio-net is not working with that tree. After some debugging, it turned out to be several bugs in the recent patches to fix aio with tun driver, specifically the following 2 commits. http://git.kernel.org/?p=linux/kernel/git/davem/net-next-2.6.git;a=commitdiff;h=0a1ec07a67bd8b0033dace237249654d015efa21 http://git.kernel.org/?p=linux/kernel/git/davem/net-next-2.6.git;a=commitdiff;h=6f26c9a7555e5bcca3560919db9b852015077dae Fix the call to memcpy_from_iovecend() in skb_copy_datagram_from_iovec to pass the right iovec offset. Signed-off-by: Sridhar Samudrala Acked-by: Michael S. Tsirkin Signed-off-by: David S. 
Miller --- net/core/datagram.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index e2a36f05cdf7..774bcd9119d9 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -466,7 +466,8 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, if (copy > 0) { if (copy > len) copy = len; - if (memcpy_fromiovecend(skb->data + offset, from, 0, copy)) + if (memcpy_fromiovecend(skb->data + offset, from, from_offset, + copy)) goto fault; if ((len -= copy) == 0) return 0; -- cgit v1.2.3 From 2faef52b72958b8b1c08e927b9b0691c314cf6f4 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Fri, 5 Jun 2009 09:35:44 +0000 Subject: net: Fix memcpy_toiovecend() to use the right offset Increment the iovec base by the offset passed in for the initial copy_to_user() in memcpy_to_iovecend(). Signed-off-by: Sridhar Samudrala Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/iovec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/iovec.c b/net/core/iovec.c index 40a76ce19d9f..16ad45d4882b 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -112,9 +112,9 @@ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, continue; } copy = min_t(unsigned int, iov->iov_len - offset, len); - offset = 0; - if (copy_to_user(iov->iov_base, kdata, copy)) + if (copy_to_user(iov->iov_base + offset, kdata, copy)) return -EFAULT; + offset = 0; kdata += copy; len -= copy; } -- cgit v1.2.3 From 1d0ebfe7c9621d43804e9ce9f2b898541a7d9652 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Mon, 8 Jun 2009 00:40:35 -0700 Subject: net pkgen.c:fix no need for check vfree() does its own 'NULL' check, so no need for check before calling it. Signed-off-by: Figo.zhang Signed-off-by: David S. Miller --- net/core/pktgen.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b8ccd3c88d63..19b8c20e98a4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3691,8 +3691,7 @@ out1: #ifdef CONFIG_XFRM free_SAs(pkt_dev); #endif - if (pkt_dev->flows) - vfree(pkt_dev->flows); + vfree(pkt_dev->flows); kfree(pkt_dev); return err; } @@ -3791,8 +3790,7 @@ static int pktgen_remove_device(struct pktgen_thread *t, #ifdef CONFIG_XFRM free_SAs(pkt_dev); #endif - if (pkt_dev->flows) - vfree(pkt_dev->flows); + vfree(pkt_dev->flows); kfree(pkt_dev); return 0; } -- cgit v1.2.3 From 5b1a002ade68173f21b2126a778278df72202ba6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 00:18:15 -0700 Subject: datagram: Use frag list abstraction interfaces. Signed-off-by: David S. Miller --- net/core/datagram.c | 178 ++++++++++++++++++++++++---------------------------- 1 file changed, 83 insertions(+), 95 deletions(-) (limited to 'net/core') diff --git a/net/core/datagram.c b/net/core/datagram.c index 774bcd9119d9..58abee1f1df1 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -282,6 +282,7 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, { int start = skb_headlen(skb); int i, copy = start - offset; + struct sk_buff *frag_iter; /* Copy header. 
*/ if (copy > 0) { @@ -322,28 +323,24 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_iovec(list, - offset - start, - to, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - } - start = end; + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_iovec(frag_iter, + offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; } + start = end; } if (!len) return 0; @@ -369,6 +366,7 @@ int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, { int start = skb_headlen(skb); int i, copy = start - offset; + struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { @@ -411,30 +409,26 @@ int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_const_iovec(list, - offset - start, - to, to_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - start = end; + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_const_iovec(frag_iter, + offset - start, + to, to_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to_offset += copy; } + start = end; } if (!len) return 0; @@ -461,6 +455,7 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, { int start = skb_headlen(skb); int i, copy = start - offset; + struct sk_buff *frag_iter; /* Copy header. */ if (copy > 0) { @@ -506,31 +501,27 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_from_iovec(list, - offset - start, - from, - from_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - from_offset += copy; - } - start = end; + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_from_iovec(frag_iter, + offset - start, + from, + from_offset, + copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from_offset += copy; } + start = end; } if (!len) return 0; @@ -545,8 +536,9 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, __wsum *csump) { int start = skb_headlen(skb); - int pos = 0; int i, copy = start - offset; + struct sk_buff *frag_iter; + int pos = 0; /* Copy header. 
*/ if (copy > 0) { @@ -597,33 +589,29 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list=list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - if (copy > len) - copy = len; - if (skb_copy_and_csum_datagram(list, - offset - start, - to, copy, - &csum2)) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - pos += copy; - } - start = end; + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + __wsum csum2 = 0; + if (copy > len) + copy = len; + if (skb_copy_and_csum_datagram(frag_iter, + offset - start, + to, copy, + &csum2)) + goto fault; + *csump = csum_block_add(*csump, csum2, pos); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + pos += copy; } + start = end; } if (!len) return 0; -- cgit v1.2.3 From 4cf704fbea96075942bd033fd75aa4e76ae1c8a1 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 00:18:51 -0700 Subject: net/core/dev.c: Use frag list abstraction interfaces. Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1f38401fc028..4913089c91dc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1820,7 +1820,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (netif_needs_gso(dev, skb)) goto gso; - if (skb_shinfo(skb)->frag_list && + if (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST) && __skb_linearize(skb)) goto out_kfree_skb; @@ -2407,7 +2407,7 @@ int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (!(skb->dev->features & NETIF_F_GRO)) goto normal; - if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list) + if (skb_is_gso(skb) || skb_has_frags(skb)) goto normal; rcu_read_lock(); -- cgit v1.2.3 From fbb398a832086c370bce47789e155bf5a08774e9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 00:18:59 -0700 Subject: net/core/skbuff.c: Use frag list abstraction interfaces. Signed-off-by: David S. 
Miller --- net/core/skbuff.c | 230 +++++++++++++++++++++++++----------------------------- 1 file changed, 106 insertions(+), 124 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a2473b1600e3..49961ba3c0f6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -210,7 +210,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; shinfo->tx_flags.flags = 0; - shinfo->frag_list = NULL; + skb_frag_list_init(skb); memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); if (fclone) { @@ -323,7 +323,7 @@ static void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; - for (list = skb_shinfo(skb)->frag_list; list; list = list->next) + skb_walk_frags(skb, list) skb_get(list); } @@ -338,7 +338,7 @@ static void skb_release_data(struct sk_buff *skb) put_page(skb_shinfo(skb)->frags[i].page); } - if (skb_shinfo(skb)->frag_list) + if (skb_has_frags(skb)) skb_drop_fraglist(skb); kfree(skb->head); @@ -503,7 +503,7 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size) shinfo->gso_type = 0; shinfo->ip6_frag_id = 0; shinfo->tx_flags.flags = 0; - shinfo->frag_list = NULL; + skb_frag_list_init(skb); memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps)); memset(skb, 0, offsetof(struct sk_buff, tail)); @@ -758,7 +758,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) skb_shinfo(n)->nr_frags = i; } - if (skb_shinfo(skb)->frag_list) { + if (skb_has_frags(skb)) { skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; skb_clone_fraglist(n); } @@ -821,7 +821,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) get_page(skb_shinfo(skb)->frags[i].page); - if (skb_shinfo(skb)->frag_list) + if (skb_has_frags(skb)) skb_clone_fraglist(skb); skb_release_data(skb); @@ -1093,7 +1093,7 @@ drop_pages: for (; i < nfrags; i++) put_page(skb_shinfo(skb)->frags[i].page); - if (skb_shinfo(skb)->frag_list) + if (skb_has_frags(skb)) skb_drop_fraglist(skb); goto done; } @@ -1188,7 +1188,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. */ - if (!skb_shinfo(skb)->frag_list) + if (!skb_has_frags(skb)) goto pull_pages; /* Estimate size of pulled pages. 
*/ @@ -1285,8 +1285,9 @@ EXPORT_SYMBOL(__pskb_pull_tail); int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) { - int i, copy; int start = skb_headlen(skb); + struct sk_buff *frag_iter; + int i, copy; if (offset > (int)skb->len - len) goto fault; @@ -1328,28 +1329,23 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; + skb_walk_frags(skb, frag_iter) { + int end; - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_bits(list, offset - start, - to, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - start = end; + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_bits(frag_iter, offset - start, to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; } + start = end; } if (!len) return 0; @@ -1534,6 +1530,7 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .ops = &sock_pipe_buf_ops, .spd_release = sock_spd_release, }; + struct sk_buff *frag_iter; struct sock *sk = skb->sk; /* @@ -1548,13 +1545,11 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, /* * now see if we have a frag_list to map */ - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list && tlen; list = list->next) { - if (__skb_splice_bits(list, &offset, &tlen, &spd, sk)) - break; - } + skb_walk_frags(skb, frag_iter) { + if (!tlen) + break; + if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk)) + break; } done: @@ -1593,8 +1588,9 @@ done: int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) { - int i, copy; int start = skb_headlen(skb); + struct sk_buff *frag_iter; + int i, copy; if (offset > (int)skb->len - len) goto fault; @@ -1635,28 +1631,24 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len) start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; + skb_walk_frags(skb, frag_iter) { + int end; - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_store_bits(list, offset - start, - from, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - from += copy; - } - start = end; + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_store_bits(frag_iter, offset - start, + from, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + from += copy; } + start = end; } if (!len) return 0; @@ -1673,6 +1665,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, { int start = skb_headlen(skb); int i, copy = start - offset; + struct sk_buff *frag_iter; int pos = 0; /* Checksum header. 
*/ @@ -1712,29 +1705,25 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset, start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; + skb_walk_frags(skb, frag_iter) { + int end; - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - __wsum csum2; - if (copy > len) - copy = len; - csum2 = skb_checksum(list, offset - start, - copy, 0); - csum = csum_block_add(csum, csum2, pos); - if ((len -= copy) == 0) - return csum; - offset += copy; - pos += copy; - } - start = end; + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + __wsum csum2; + if (copy > len) + copy = len; + csum2 = skb_checksum(frag_iter, offset - start, + copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos += copy; } + start = end; } BUG_ON(len); @@ -1749,6 +1738,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, { int start = skb_headlen(skb); int i, copy = start - offset; + struct sk_buff *frag_iter; int pos = 0; /* Copy header. */ @@ -1793,31 +1783,27 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; + skb_walk_frags(skb, frag_iter) { + __wsum csum2; + int end; - for (; list; list = list->next) { - __wsum csum2; - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - csum2 = skb_copy_and_csum_bits(list, - offset - start, - to, copy, 0); - csum = csum_block_add(csum, csum2, pos); - if ((len -= copy) == 0) - return csum; - offset += copy; - to += copy; - pos += copy; - } - start = end; + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + csum2 = skb_copy_and_csum_bits(frag_iter, + offset - start, + to, copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos += copy; } + start = end; } BUG_ON(len); return csum; @@ -2327,8 +2313,7 @@ next_skb: st->frag_data = NULL; } - if (st->root_skb == st->cur_skb && - skb_shinfo(st->root_skb)->frag_list) { + if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; @@ -2639,7 +2624,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) } else skb_get(fskb2); - BUG_ON(skb_shinfo(nskb)->frag_list); + SKB_FRAG_ASSERT(nskb); skb_shinfo(nskb)->frag_list = fskb2; } @@ -2796,6 +2781,7 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) { int start = skb_headlen(skb); int i, copy = start - offset; + struct sk_buff *frag_iter; int elt = 0; if (copy > 0) { @@ -2829,26 +2815,22 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; + skb_walk_frags(skb, frag_iter) { + int end; - WARN_ON(start > offset + len); + WARN_ON(start > offset + len); - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - elt += __skb_to_sgvec(list, sg+elt, offset - start, - copy); - if ((len -= copy) == 0) - return elt; - 
offset += copy; - } - start = end; + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start, + copy); + if ((len -= copy) == 0) + return elt; + offset += copy; } + start = end; } BUG_ON(len); return elt; @@ -2896,7 +2878,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return -ENOMEM; /* Easy case. Most of packets will go this way. */ - if (!skb_shinfo(skb)->frag_list) { + if (!skb_has_frags(skb)) { /* A little of trouble, not enough of space for trailer. * This should not happen, when stack is tuned to generate * good frames. OK, on miss we reallocate and reserve even more @@ -2931,7 +2913,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) if (skb1->next == NULL && tailbits) { if (skb_shinfo(skb1)->nr_frags || - skb_shinfo(skb1)->frag_list || + skb_has_frags(skb1) || skb_tailroom(skb1) < tailbits) ntail = tailbits + 128; } @@ -2940,7 +2922,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) skb_cloned(skb1) || ntail || skb_shinfo(skb1)->nr_frags || - skb_shinfo(skb1)->frag_list) { + skb_has_frags(skb1)) { struct sk_buff *skb2; /* Fuck, we are miserable poor guys... */ -- cgit v1.2.3 From 285e42802bb3da91102967f63fb9e28e61f7831e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Jun 2009 00:19:10 -0700 Subject: net/core/user_dma.c: Use frag list abstraction interfaces. Signed-off-by: David S. Miller --- net/core/user_dma.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 164b090d5ac3..25d717ebc92e 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -51,6 +51,7 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan, { int start = skb_headlen(skb); int i, copy = start - offset; + struct sk_buff *frag_iter; dma_cookie_t cookie = 0; /* Copy header. */ @@ -94,31 +95,28 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan, start = end; } - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - WARN_ON(start > offset + len); - - end = start + list->len; - copy = end - offset; - if (copy > 0) { - if (copy > len) - copy = len; - cookie = dma_skb_copy_datagram_iovec(chan, list, - offset - start, to, copy, - pinned_list); - if (cookie < 0) - goto fault; - len -= copy; - if (len == 0) - goto end; - offset += copy; - } - start = end; + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + copy = end - offset; + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_skb_copy_datagram_iovec(chan, frag_iter, + offset - start, + to, copy, + pinned_list); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; } + start = end; } end: -- cgit v1.2.3 From 0c27922e4933ceb86644f4a9b1af212ffe5aad75 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Jun 2009 03:49:24 +0000 Subject: net: dev_addr_init() fix commit f001fde5eadd915f4858d22ed70d7040f48767cf (net: introduce a list of device addresses dev_addr_list (v6)) added one regression Vegard Nossum found in its testings. With kmemcheck help, Vegard found some uninitialized memory was read and reported to user, potentialy leaking kernel data. 
( thread can be found on http://lkml.org/lkml/2009/5/30/177 ) dev_addr_init() incorrectly uses sizeof() operator. We were initializing one byte instead of MAX_ADDR_LEN bytes. Reported-by: Vegard Nossum Signed-off-by: Eric Dumazet Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4913089c91dc..81b392ef5114 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3655,8 +3655,8 @@ static int dev_addr_init(struct net_device *dev) /* rtnl_mutex must be held here */ INIT_LIST_HEAD(&dev->dev_addr_list); - memset(addr, 0, sizeof(*addr)); - err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, sizeof(*addr), + memset(addr, 0, sizeof(addr)); + err = __hw_addr_add(&dev->dev_addr_list, NULL, addr, sizeof(addr), NETDEV_HW_ADDR_T_LAN); if (!err) { /* -- cgit v1.2.3 From fcb94e422479da52ed90bab230c59617a0462416 Mon Sep 17 00:00:00 2001 From: Sergey Lapin Date: Mon, 8 Jun 2009 12:18:47 +0000 Subject: Add constants for the ieee 802.15.4 stack IEEE 802.15.4 stack requires several constants to be defined/adjusted. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Sergey Lapin Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++-- net/core/sock.c | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 81b392ef5114..11560e3258b5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -269,7 +269,8 @@ static const unsigned short netdev_lock_type[] = ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, - ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE}; + ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, ARPHRD_IEEE802154_PHY, + ARPHRD_VOID, ARPHRD_NONE}; static const char *netdev_lock_name[] = {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", @@ -286,7 +287,8 @@ static const char *netdev_lock_name[] = "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", - "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"}; + "_xmit_PHONET_PIPE", "_xmit_IEEE802154", "_xmit_IEEE802154_PHY", + "_xmit_VOID", "_xmit_NONE"}; static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; diff --git a/net/core/sock.c b/net/core/sock.c index 58dec9dff99a..04e35eb2e736 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -155,6 +155,7 @@ static const char *af_family_key_strings[AF_MAX+1] = { "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" , + "sk_lock-AF_IEEE802154", "sk_lock-AF_MAX" }; static const char *af_family_slock_key_strings[AF_MAX+1] = { @@ -170,6 +171,7 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = { "slock-27" , "slock-28" , "slock-AF_CAN" , "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" , + "slock-AF_IEEE802154", "slock-AF_MAX" }; static const char *af_family_clock_key_strings[AF_MAX+1] = { @@ -185,6 +187,7 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = { "clock-27" , "clock-28" , "clock-AF_CAN" , "clock-AF_TIPC" , "clock-AF_BLUETOOTH", 
"clock-AF_IUCV" , "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , + "clock-AF_IEEE802154", "clock-AF_MAX" }; -- cgit v1.2.3 From 8f77f3849cc3ae2d6df9301785a3d316ea7d7ee1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 7 Jun 2009 21:58:37 +0200 Subject: mac80211: do not pass PS frames out of mac80211 again In order to handle powersave frames properly we had needed to pass these out to the device queues again, and introduce the skb->requeue bit. This, however, also has unnecessary overhead by needing to 'clean up' already tried frames, and this clean-up code is also buggy when software encryption is used. Instead of sending the frames via the master netdev queue again, simply put them into the pending queue. This also fixes a problem where frames for that particular station could be reordered when some were still on the software queues and older ones are re-injected into the software queue after them. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/core/skbuff.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 49961ba3c0f6..b94d777e3eb4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -552,7 +552,6 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->vlan_tci = old->vlan_tci; #if defined(CONFIG_MAC80211) || defined(CONFIG_MAC80211_MODULE) new->do_not_encrypt = old->do_not_encrypt; - new->requeue = old->requeue; #endif skb_copy_secmark(new, old); -- cgit v1.2.3 From 2b85a34e911bf483c27cfdd124aeb1605145dc80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 11 Jun 2009 02:55:43 -0700 Subject: net: No more expensive sock_hold()/sock_put() on each tx One of the problem with sock memory accounting is it uses a pair of sock_hold()/sock_put() for each transmitted packet. This slows down bidirectional flows because the receive path also needs to take a refcount on socket and might use a different cpu than transmit path or transmit completion path. So these two atomic operations also trigger cache line bounces. We can see this in tx or tx/rx workloads (media gateways for example), where sock_wfree() can be in top five functions in profiles. We use this sock_hold()/sock_put() so that sock freeing is delayed until all tx packets are completed. As we also update sk_wmem_alloc, we could offset sk_wmem_alloc by one unit at init time, until sk_free() is called. Once sk_free() is called, we atomic_dec_and_test(sk_wmem_alloc) to decrement initial offset and atomicaly check if any packets are in flight. skb_set_owner_w() doesnt call sock_hold() anymore sock_wfree() doesnt call sock_put() anymore, but check if sk_wmem_alloc reached 0 to perform the final freeing. Drawback is that a skb->truesize error could lead to unfreeable sockets, or even worse, prematurely calling __sk_free() on a live socket. Nice speedups on SMP. tbench for example, going from 2691 MB/s to 2711 MB/s on my 8 cpu dev machine, even if tbench was not really hitting sk_refcnt contention point. 5 % speedup on a UDP transmit workload (depends on number of flows), lowering TX completion cpu usage. Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/sock.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 04e35eb2e736..06e26b77ad9e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1008,7 +1008,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } EXPORT_SYMBOL(sk_alloc); -void sk_free(struct sock *sk) +static void __sk_free(struct sock *sk) { struct sk_filter *filter; @@ -1031,6 +1031,17 @@ void sk_free(struct sock *sk) put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } + +void sk_free(struct sock *sk) +{ + /* + * We substract one from sk_wmem_alloc and can know if + * some packets are still in some tx queue. + * If not null, sock_wfree() will call __sk_free(sk) later + */ + if (atomic_dec_and_test(&sk->sk_wmem_alloc)) + __sk_free(sk); +} EXPORT_SYMBOL(sk_free); /* @@ -1071,7 +1082,10 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; atomic_set(&newsk->sk_rmem_alloc, 0); - atomic_set(&newsk->sk_wmem_alloc, 0); + /* + * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) + */ + atomic_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); @@ -1175,12 +1189,18 @@ void __init sk_init(void) void sock_wfree(struct sk_buff *skb) { struct sock *sk = skb->sk; + int res; /* In case it might be waiting for more memory. */ - atomic_sub(skb->truesize, &sk->sk_wmem_alloc); + res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc); if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) sk->sk_write_space(sk); - sock_put(sk); + /* + * if sk_wmem_alloc reached 0, we are last user and should + * free this sock, as sk_free() call could not do it. + */ + if (res == 0) + __sk_free(sk); } EXPORT_SYMBOL(sock_wfree); @@ -1819,6 +1839,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); atomic_set(&sk->sk_refcnt, 1); + atomic_set(&sk->sk_wmem_alloc, 1); atomic_set(&sk->sk_drops, 0); } EXPORT_SYMBOL(sock_init_data); -- cgit v1.2.3 From 5ef12d98a19254ee5dc851bd83e214b43ec1f725 Mon Sep 17 00:00:00 2001 From: Timo Teras Date: Thu, 11 Jun 2009 04:16:28 -0700 Subject: neigh: fix state transition INCOMPLETE->FAILED via Netlink request The current code errors out the INCOMPLETE neigh entry skb queue only from the timer if maximum probes have been attempted and there has been no reply. This also causes the transtion to FAILED state. However, the neigh entry can be also updated via Netlink to inform that the address is unavailable. Currently, neigh_update() just stops the timers and leaves the pending skb's unreleased. This results that the clean up code in the timer callback is never called, preventing also proper garbage collection. This fixes neigh_update() to process the pending skb queue immediately if INCOMPLETE -> FAILED state transtion occurs due to a Netlink request. Signed-off-by: Timo Teras Signed-off-by: David S. 
Miller --- net/core/neighbour.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index c54229befcfe..163b4f5b0365 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -771,6 +771,28 @@ static __inline__ int neigh_max_probes(struct neighbour *n) p->ucast_probes + p->app_probes + p->mcast_probes); } +static void neigh_invalidate(struct neighbour *neigh) +{ + struct sk_buff *skb; + + NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); + NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + neigh->updated = jiffies; + + /* It is very thin place. report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. --ANK + */ + while (neigh->nud_state == NUD_FAILED && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + write_unlock(&neigh->lock); + neigh->ops->error_report(neigh, skb); + write_lock(&neigh->lock); + } + skb_queue_purge(&neigh->arp_queue); +} + /* Called when a timer expires for a neighbour entry. */ static void neigh_timer_handler(unsigned long arg) @@ -835,26 +857,9 @@ static void neigh_timer_handler(unsigned long arg) if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { - struct sk_buff *skb; - neigh->nud_state = NUD_FAILED; - neigh->updated = jiffies; notify = 1; - NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); - NEIGH_PRINTK2("neigh %p is failed.\n", neigh); - - /* It is very thin place. report_unreachable is very complicated - routine. Particularly, it can hit the same neighbour entry! - - So that, we try to be accurate and avoid dead loop. --ANK - */ - while (neigh->nud_state == NUD_FAILED && - (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { - write_unlock(&neigh->lock); - neigh->ops->error_report(neigh, skb); - write_lock(&neigh->lock); - } - skb_queue_purge(&neigh->arp_queue); + neigh_invalidate(neigh); } if (neigh->nud_state & NUD_IN_TIMER) { @@ -1001,6 +1006,11 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, neigh->nud_state = new; err = 0; notify = old & NUD_VALID; + if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && + (new & NUD_FAILED)) { + neigh_invalidate(neigh); + notify = 1; + } goto out; } -- cgit v1.2.3 From 746e6ad23cd6fec2edce056e014a0eabeffa838c Mon Sep 17 00:00:00 2001 From: John Dykstra Date: Thu, 11 Jun 2009 20:57:21 -0700 Subject: [PATCH] net core: Some interface flags not returned by SIOCGIFFLAGS Commit b00055aacdb172c05067612278ba27265fcd05ce " [NET] core: add RFC2863 operstate" defined new interface flag values. Its documentation specified that these flags could be accessed from user space via SIOCGIFFLAGS. However, this does not work because the new flags do not fit in that ioctl's argument width. Change the documentation to match the code's behavior. Also change the source to explicitly show the truncation. This _should_ have no effect on executable code, and did not with gcc 4.2.4 generating x86 code. A new ioctl could be defined to return all interface flags to user space. However, since this has been broken for three years with no one complaining, there doesn't seem much need. They are still accessible via netlink. Reported-by: "Fredrik Arnerup" Signed-off-by: John Dykstra Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 11560e3258b5..a09bf658970f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4209,7 +4209,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm switch (cmd) { case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = dev_get_flags(dev); + ifr->ifr_flags = (short) dev_get_flags(dev); return 0; case SIOCGIFMETRIC: /* Get the metric on the interface -- cgit v1.2.3 From da6782927de809d9d427bd4bd6a4024243e41f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Miros=C5=82aw?= Date: Fri, 5 Jun 2009 05:35:28 +0000 Subject: bridge: Simplify interface for ATM LANE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch changes FDB entry check for ATM LANE bridge integration. There's no point in holding a FDB entry around SKB building. br_fdb_get()/br_fdb_put() pair are changed into single br_fdb_test_addr() hook that checks if the addr has FDB entry pointing to other port to the one the request arrived on. FDB entry refcounting is removed as it's not used anywhere else. Signed-off-by: Michał Mirosław Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index a09bf658970f..ea00e36f48e1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2071,11 +2071,13 @@ static inline int deliver_skb(struct sk_buff *skb, } #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) -/* These hooks defined here for ATM */ -struct net_bridge; -struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, - unsigned char *addr); -void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly; + +#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) +/* This hook is defined here for ATM LANE */ +int (*br_fdb_test_addr_hook)(struct net_device *dev, + unsigned char *addr) __read_mostly; +EXPORT_SYMBOL(br_fdb_test_addr_hook); +#endif /* * If bridge module is loaded call bridging hook. @@ -2083,6 +2085,8 @@ void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly; */ struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff *skb) __read_mostly; +EXPORT_SYMBOL(br_handle_frame_hook); + static inline struct sk_buff *handle_bridge(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) @@ -5665,12 +5669,6 @@ EXPORT_SYMBOL(net_enable_timestamp); EXPORT_SYMBOL(net_disable_timestamp); EXPORT_SYMBOL(dev_get_flags); -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) -EXPORT_SYMBOL(br_handle_frame_hook); -EXPORT_SYMBOL(br_fdb_get_hook); -EXPORT_SYMBOL(br_fdb_put_hook); -#endif - EXPORT_SYMBOL(dev_load); EXPORT_PER_CPU_SYMBOL(softnet_data); -- cgit v1.2.3
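The four "Use frag list abstraction interfaces" conversions above (datagram.c, dev.c, skbuff.c, user_dma.c) all follow the same pattern: open-coded walks of skb_shinfo(skb)->frag_list become skb_has_frags() plus skb_walk_frags(), and direct NULL assignments become skb_frag_list_init(). The helpers themselves live in include/linux/skbuff.h and are not part of this net/core excerpt; as a rough sketch (names taken from the callers above, exact definitions assumed), they amount to:

	/* Sketch of the frag list helpers assumed by the conversions above. */
	static inline int skb_has_frags(const struct sk_buff *skb)
	{
		return skb_shinfo(skb)->frag_list != NULL;
	}

	static inline void skb_frag_list_init(struct sk_buff *skb)
	{
		skb_shinfo(skb)->frag_list = NULL;
	}

	/* Iterate over each sk_buff chained on skb's frag_list. */
	#define skb_walk_frags(skb, iter) \
		for (iter = skb_shinfo(skb)->frag_list; iter; iter = iter->next)

A typical converted caller then reads:

	struct sk_buff *frag_iter;

	skb_walk_frags(skb, frag_iter) {
		/* each frag_iter is a full sk_buff; recurse or copy frag_iter->len bytes */
	}

which keeps every user out of the frag_list representation itself.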
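Eric Dumazet's dev_addr_init() fix above is, at bottom, the classic sizeof-of-element versus sizeof-of-array mistake. Assuming the local buffer is declared as a fixed-size array, which the "MAX_ADDR_LEN bytes" wording in the changelog implies, the difference is:

	unsigned char addr[MAX_ADDR_LEN];

	memset(addr, 0, sizeof(*addr));	/* sizeof(unsigned char) == 1: clears a single byte */
	memset(addr, 0, sizeof(addr));	/* size of the whole array: clears MAX_ADDR_LEN bytes */

With the first form, the remaining MAX_ADDR_LEN - 1 bytes stayed uninitialized and could later be copied to user space, which is exactly the leak kmemcheck flagged.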
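The sk_wmem_alloc change above ("No more expensive sock_hold()/sock_put() on each tx") relies on a general counting idiom: seed the in-flight counter with one extra unit at init time, so the owner's final put and the last transmit completion can each detect "I was the last user" without a per-packet hold/put pair. A minimal, hypothetical illustration of the same idiom (obj, obj_queue, obj_destroy and friends are invented for the example; only the atomic_t primitives are real kernel API):

	/* hypothetical object using the "offset by one" in-flight counter idiom */
	struct obj {
		atomic_t in_flight;	/* units queued, plus 1 held by the owner */
	};

	static void obj_destroy(struct obj *o);	/* final teardown, defined elsewhere */

	static void obj_init(struct obj *o)
	{
		atomic_set(&o->in_flight, 1);		/* the owner's implicit unit */
	}

	static void obj_queue(struct obj *o, int units)
	{
		atomic_add(units, &o->in_flight);	/* per packet: no extra get/put */
	}

	static void obj_complete(struct obj *o, int units)
	{
		/* completion path: last unit gone and owner already released -> free */
		if (atomic_sub_return(units, &o->in_flight) == 0)
			obj_destroy(o);
	}

	static void obj_release(struct obj *o)
	{
		/* owner drop: remove the initial unit; free only if nothing is queued */
		if (atomic_dec_and_test(&o->in_flight))
			obj_destroy(o);
	}

In the patch, sock_init_data()/sk_clone() play the role of obj_init(), skb_set_owner_w()/sock_wfree() of obj_queue()/obj_complete(), and sk_free() of obj_release(); the drawback noted in the changelog (a skb->truesize accounting error can keep the socket alive forever or free it prematurely) applies to any use of this idiom.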