From bec4596b4e6770c7037f21f6bd27567b152dc0d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Jun 2012 00:18:19 +0000 Subject: drop_monitor: dont sleep in atomic context drop_monitor calls several sleeping functions while in atomic context. BUG: sleeping function called from invalid context at mm/slub.c:943 in_atomic(): 1, irqs_disabled(): 0, pid: 2103, name: kworker/0:2 Pid: 2103, comm: kworker/0:2 Not tainted 3.5.0-rc1+ #55 Call Trace: [] __might_sleep+0xca/0xf0 [] kmem_cache_alloc_node+0x1b3/0x1c0 [] ? queue_delayed_work_on+0x11c/0x130 [] __alloc_skb+0x4b/0x230 [] ? reset_per_cpu_data+0x160/0x160 [drop_monitor] [] reset_per_cpu_data+0x2f/0x160 [drop_monitor] [] send_dm_alert+0x4b/0xb0 [drop_monitor] [] process_one_work+0x130/0x4c0 [] worker_thread+0x159/0x360 [] ? manage_workers.isra.27+0x240/0x240 [] kthread+0x93/0xa0 [] kernel_thread_helper+0x4/0x10 [] ? kthread_freezable_should_stop+0x80/0x80 [] ? gs_change+0xb/0xb Rework the logic to call the sleeping functions in right context. Use standard timer/workqueue api to let system chose any cpu to perform the allocation and netlink send. Also avoid a loop if reset_per_cpu_data() cannot allocate memory : use mod_timer() to wait 1/10 second before next try. Signed-off-by: Eric Dumazet Cc: Neil Horman Reviewed-by: Neil Horman Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 102 ++++++++++++++++-------------------------------- 1 file changed, 33 insertions(+), 69 deletions(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index ea5fb9fcc3f5..d23b6682f4e9 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -36,9 +36,6 @@ #define TRACE_ON 1 #define TRACE_OFF 0 -static void send_dm_alert(struct work_struct *unused); - - /* * Globals, our netlink socket pointer * and the work handle that will send up @@ -48,11 +45,10 @@ static int trace_state = TRACE_OFF; static DEFINE_MUTEX(trace_state_mutex); struct per_cpu_dm_data { - struct work_struct dm_alert_work; - struct sk_buff __rcu *skb; - atomic_t dm_hit_count; - struct timer_list send_timer; - int cpu; + spinlock_t lock; + struct sk_buff *skb; + struct work_struct dm_alert_work; + struct timer_list send_timer; }; struct dm_hw_stat_delta { @@ -78,13 +74,13 @@ static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; static LIST_HEAD(hw_stats_list); -static void reset_per_cpu_data(struct per_cpu_dm_data *data) +static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; struct net_dm_alert_msg *msg; struct nlattr *nla; struct sk_buff *skb; - struct sk_buff *oskb = rcu_dereference_protected(data->skb, 1); + unsigned long flags; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); @@ -99,65 +95,40 @@ static void reset_per_cpu_data(struct per_cpu_dm_data *data) sizeof(struct net_dm_alert_msg)); msg = nla_data(nla); memset(msg, 0, al); - } else - schedule_work_on(data->cpu, &data->dm_alert_work); - - /* - * Don't need to lock this, since we are guaranteed to only - * run this on a single cpu at a time. - * Note also that we only update data->skb if the old and new skb - * pointers don't match. This ensures that we don't continually call - * synchornize_rcu if we repeatedly fail to alloc a new netlink message. - */ - if (skb != oskb) { - rcu_assign_pointer(data->skb, skb); - - synchronize_rcu(); - - atomic_set(&data->dm_hit_count, dm_hit_limit); + } else { + mod_timer(&data->send_timer, jiffies + HZ / 10); } + spin_lock_irqsave(&data->lock, flags); + swap(data->skb, skb); + spin_unlock_irqrestore(&data->lock, flags); + + return skb; } -static void send_dm_alert(struct work_struct *unused) +static void send_dm_alert(struct work_struct *work) { struct sk_buff *skb; - struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data); + struct per_cpu_dm_data *data; - WARN_ON_ONCE(data->cpu != smp_processor_id()); + data = container_of(work, struct per_cpu_dm_data, dm_alert_work); - /* - * Grab the skb we're about to send - */ - skb = rcu_dereference_protected(data->skb, 1); - - /* - * Replace it with a new one - */ - reset_per_cpu_data(data); + skb = reset_per_cpu_data(data); - /* - * Ship it! - */ if (skb) genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL); - - put_cpu_var(dm_cpu_data); } /* * This is the timer function to delay the sending of an alert * in the event that more drops will arrive during the - * hysteresis period. Note that it operates under the timer interrupt - * so we don't need to disable preemption here + * hysteresis period. */ -static void sched_send_work(unsigned long unused) +static void sched_send_work(unsigned long _data) { - struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data); - - schedule_work_on(smp_processor_id(), &data->dm_alert_work); + struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data; - put_cpu_var(dm_cpu_data); + schedule_work(&data->dm_alert_work); } static void trace_drop_common(struct sk_buff *skb, void *location) @@ -167,33 +138,28 @@ static void trace_drop_common(struct sk_buff *skb, void *location) struct nlattr *nla; int i; struct sk_buff *dskb; - struct per_cpu_dm_data *data = &get_cpu_var(dm_cpu_data); - + struct per_cpu_dm_data *data; + unsigned long flags; - rcu_read_lock(); - dskb = rcu_dereference(data->skb); + local_irq_save(flags); + data = &__get_cpu_var(dm_cpu_data); + spin_lock(&data->lock); + dskb = data->skb; if (!dskb) goto out; - if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) { - /* - * we're already at zero, discard this hit - */ - goto out; - } - nlh = (struct nlmsghdr *)dskb->data; nla = genlmsg_data(nlmsg_data(nlh)); msg = nla_data(nla); for (i = 0; i < msg->entries; i++) { if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) { msg->points[i].count++; - atomic_inc(&data->dm_hit_count); goto out; } } - + if (msg->entries == dm_hit_limit) + goto out; /* * We need to create a new entry */ @@ -205,13 +171,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location) if (!timer_pending(&data->send_timer)) { data->send_timer.expires = jiffies + dm_delay * HZ; - add_timer_on(&data->send_timer, smp_processor_id()); + add_timer(&data->send_timer); } out: - rcu_read_unlock(); - put_cpu_var(dm_cpu_data); - return; + spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location) @@ -418,11 +382,11 @@ static int __init init_net_drop_monitor(void) for_each_possible_cpu(cpu) { data = &per_cpu(dm_cpu_data, cpu); - data->cpu = cpu; INIT_WORK(&data->dm_alert_work, send_dm_alert); init_timer(&data->send_timer); - data->send_timer.data = cpu; + data->send_timer.data = (unsigned long)data; data->send_timer.function = sched_send_work; + spin_lock_init(&data->lock); reset_per_cpu_data(data); } -- cgit v1.2.3 From 4bd6683bd400c8b1d2ad544bb155d86a5d10f91c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Jun 2012 04:58:35 +0000 Subject: net: neighbour: fix neigh_dump_info() Denys found out "ip neigh" output was truncated to about 54 neighbours. Signed-off-by: Eric Dumazet Reported-by: Denys Fedoryshchenko Signed-off-by: David S. Miller --- net/core/neighbour.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index eb09f8bbbf07..d81d026138f0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2219,9 +2219,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); - for (h = 0; h < (1 << nht->hash_shift); h++) { - if (h < s_h) - continue; + for (h = s_h; h < (1 << nht->hash_shift); h++) { if (h > s_h) s_idx = 0; for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; @@ -2260,9 +2258,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, read_lock_bh(&tbl->lock); - for (h = 0; h <= PNEIGH_HASHMASK; h++) { - if (h < s_h) - continue; + for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { @@ -2297,7 +2293,7 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) struct neigh_table *tbl; int t, family, s_t; int proxy = 0; - int err = 0; + int err; read_lock(&neigh_tbl_lock); family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family; @@ -2311,7 +2307,7 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tbl = neigh_tables, t = 0; tbl && (err >= 0); + for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { if (t < s_t || (family && tbl->family != family)) continue; @@ -2322,6 +2318,8 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) err = pneigh_dump_table(tbl, skb, cb); else err = neigh_dump_table(tbl, skb, cb); + if (err < 0) + break; } read_unlock(&neigh_tbl_lock); -- cgit v1.2.3 From c6c4b97c6b7003e8082dd43db224c1d1f7a24aa2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Jun 2012 14:01:44 +0000 Subject: net/core: fix kernel-doc warnings Fix kernel-doc warnings in net/core: Warning(net/core/skbuff.c:3368): No description found for parameter 'delta_truesize' Warning(net/core/filter.c:628): No description found for parameter 'pfp' Warning(net/core/filter.c:628): Excess function parameter 'sk' description in 'sk_unattached_filter_create' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++-- net/core/skbuff.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index a3eddb515d1b..d4ce2dc712e3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -616,9 +616,9 @@ static int __sk_prepare_filter(struct sk_filter *fp) /** * sk_unattached_filter_create - create an unattached filter * @fprog: the filter program - * @sk: the socket to use + * @pfp: the unattached filter that is created * - * Create a filter independent ofr any socket. We first run some + * Create a filter independent of any socket. We first run some * sanity checks on it to make sure it does not explode on us later. * If an error occurs or there is insufficient memory for the filter * a negative errno code is returned. On success the return is zero. diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 016694d62484..d78671e9d545 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3361,7 +3361,7 @@ EXPORT_SYMBOL(kfree_skb_partial); * @to: prior buffer * @from: buffer to add * @fragstolen: pointer to boolean - * + * @delta_truesize: how much more was allocated than was requested */ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize) -- cgit v1.2.3 From 954fba0274058d27c7c07b5ea07c41b3b7477894 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Jun 2012 19:30:21 +0000 Subject: netpoll: fix netpoll_send_udp() bugs Bogdan Hamciuc diagnosed and fixed following bug in netpoll_send_udp() : "skb->len += len;" instead of "skb_put(skb, len);" Meaning that _if_ a network driver needs to call skb_realloc_headroom(), only packet headers would be copied, leaving garbage in the payload. However the skb_realloc_headroom() must be avoided as much as possible since it requires memory and netpoll tries hard to work even if memory is exhausted (using a pool of preallocated skbs) It appears netpoll_send_udp() reserved 16 bytes for the ethernet header, which happens to work for typicall drivers but not all. Right thing is to use LL_RESERVED_SPACE(dev) (And also add dev->needed_tailroom of tailroom) This patch combines both fixes. Many thanks to Bogdan for raising this issue. Reported-by: Bogdan Hamciuc Signed-off-by: Eric Dumazet Tested-by: Bogdan Hamciuc Cc: Herbert Xu Cc: Neil Horman Reviewed-by: Neil Horman Reviewed-by: Cong Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 3d84fb9d8873..f9f40b932e4b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -362,22 +362,23 @@ EXPORT_SYMBOL(netpoll_send_skb_on_dev); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { - int total_len, eth_len, ip_len, udp_len; + int total_len, ip_len, udp_len; struct sk_buff *skb; struct udphdr *udph; struct iphdr *iph; struct ethhdr *eth; udp_len = len + sizeof(*udph); - ip_len = eth_len = udp_len + sizeof(*iph); - total_len = eth_len + ETH_HLEN + NET_IP_ALIGN; + ip_len = udp_len + sizeof(*iph); + total_len = ip_len + LL_RESERVED_SPACE(np->dev); - skb = find_skb(np, total_len, total_len - len); + skb = find_skb(np, total_len + np->dev->needed_tailroom, + total_len - len); if (!skb) return; skb_copy_to_linear_data(skb, msg, len); - skb->len += len; + skb_put(skb, len); skb_push(skb, sizeof(*udph)); skb_reset_transport_header(skb); -- cgit v1.2.3 From 62b1a8ab9b3660bb820d8dfe23148ed6cda38574 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Jun 2012 06:42:44 +0000 Subject: net: remove skb_orphan_try() Orphaning skb in dev_hard_start_xmit() makes bonding behavior unfriendly for applications sending big UDP bursts : Once packets pass the bonding device and come to real device, they might hit a full qdisc and be dropped. Without orphaning, the sender is automatically throttled because sk->sk_wmemalloc reaches sk->sk_sndbuf (assuming sk_sndbuf is not too big) We could try to defer the orphaning adding another test in dev_hard_start_xmit(), but all this seems of little gain, now that BQL tends to make packets more likely to be parked in Qdisc queues instead of NIC TX ring, in cases where performance matters. Reverts commits : fc6055a5ba31 net: Introduce skb_orphan_try() 87fd308cfc6b net: skb_tx_hash() fix relative to skb_orphan_try() and removes SKBTX_DRV_NEEDS_SK_REF flag Reported-and-bisected-by: Jean-Michel Hautbois Signed-off-by: Eric Dumazet Tested-by: Oliver Hartkopp Acked-by: Oliver Hartkopp Signed-off-by: David S. Miller --- net/core/dev.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index cd0981977f5c..6df214041a5e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2089,25 +2089,6 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) return 0; } -/* - * Try to orphan skb early, right before transmission by the device. - * We cannot orphan skb if tx timestamp is requested or the sk-reference - * is needed on driver level for other reasons, e.g. see net/can/raw.c - */ -static inline void skb_orphan_try(struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - - if (sk && !skb_shinfo(skb)->tx_flags) { - /* skb_tx_hash() wont be able to get sk. - * We copy sk_hash into skb->rxhash - */ - if (!skb->rxhash) - skb->rxhash = sk->sk_hash; - skb_orphan(skb); - } -} - static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) { return ((features & NETIF_F_GEN_CSUM) || @@ -2193,8 +2174,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, if (!list_empty(&ptype_all)) dev_queue_xmit_nit(skb, dev); - skb_orphan_try(skb); - features = netif_skb_features(skb); if (vlan_tx_tag_present(skb) && @@ -2304,7 +2283,7 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else - hash = (__force u16) skb->protocol ^ skb->rxhash; + hash = (__force u16) skb->protocol; hash = jhash_1word(hash, hashrnd); return (u16) (((u64) hash * qcount) >> 32) + qoffset; -- cgit v1.2.3