From d853ba9cde9afd951d9bc8a86c7e90dfed2ad60b Mon Sep 17 00:00:00 2001
From: Ted Feng
Date: Thu, 8 Dec 2011 00:46:21 +0000
Subject: ipip, sit: copy parms.name after register_netdevice

commit 72b36015ba43a3cca5303f5534d2c3e1899eae29 upstream.

Same fix as 731abb9cb2 for the ipip and sit tunnels.

Commit 1c5cae815d removed an explicit call to dev_alloc_name in
ipip_tunnel_locate and ipip6_tunnel_locate, because register_netdevice
will now create a valid name; however, the tunnel keeps a copy of the
name in the private parms structure. Fix this by copying the name back
after register_netdevice has successfully returned.

This shows up if you do a simple tunnel add, followed by a tunnel show:

$ sudo ip tunnel add mode ipip remote 10.2.20.211
$ ip tunnel
tunl0: ip/ip remote any local any ttl inherit nopmtudisc
tunl%d: ip/ip remote 10.2.20.211 local any ttl inherit

$ sudo ip tunnel add mode sit remote 10.2.20.212
$ ip tunnel
sit0: ipv6/ip remote any local any ttl 64 nopmtudisc 6rd-prefix 2002::/16
sit%d: ioctl 89f8 failed: No such device
sit%d: ipv6/ip remote 10.2.20.212 local any ttl inherit

Signed-off-by: Ted Feng
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/ipip.c | 7 ++++++-
 net/ipv6/sit.c  | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 378b20b7ca6e..6f06f7f39ea2 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -285,6 +285,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 	if (register_netdevice(dev) < 0)
 		goto failed_free;
 
+	strcpy(nt->parms.name, dev->name);
+
 	dev_hold(dev);
 	ipip_tunnel_link(ipn, nt);
 	return nt;
@@ -759,7 +761,6 @@ static int ipip_tunnel_init(struct net_device *dev)
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	tunnel->dev = dev;
-	strcpy(tunnel->parms.name, dev->name);
 
 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
@@ -825,6 +826,7 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
 static int __net_init ipip_init_net(struct net *net)
 {
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
+	struct ip_tunnel *t;
 	int err;
 
 	ipn->tunnels[0] = ipn->tunnels_wc;
@@ -848,6 +850,9 @@ static int __net_init ipip_init_net(struct net *net)
 	if ((err = register_netdev(ipn->fb_tunnel_dev)))
 		goto err_reg_dev;
 
+	t = netdev_priv(ipn->fb_tunnel_dev);
+
+	strcpy(t->parms.name, ipn->fb_tunnel_dev->name);
 	return 0;
 
 err_reg_dev:
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 00b15ac7a702..c1e0d63db2e4 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -263,6 +263,8 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
 	if (register_netdevice(dev) < 0)
 		goto failed_free;
 
+	strcpy(nt->parms.name, dev->name);
+
 	dev_hold(dev);
 
 	ipip6_tunnel_link(sitn, nt);
@@ -1144,7 +1146,6 @@ static int ipip6_tunnel_init(struct net_device *dev)
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
 	tunnel->dev = dev;
-	strcpy(tunnel->parms.name, dev->name);
 
 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
@@ -1207,6 +1208,7 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea
 static int __net_init sit_init_net(struct net *net)
 {
 	struct sit_net *sitn = net_generic(net, sit_net_id);
+	struct ip_tunnel *t;
 	int err;
 
 	sitn->tunnels[0] = sitn->tunnels_wc;
@@ -1231,6 +1233,9 @@ static int __net_init sit_init_net(struct net *net)
 	if ((err = register_netdev(sitn->fb_tunnel_dev)))
 		goto err_reg_dev;
 
+	t = netdev_priv(sitn->fb_tunnel_dev);
+
+	strcpy(t->parms.name, sitn->fb_tunnel_dev->name);
 	return 0;
 
 err_reg_dev:
--
cgit v1.2.3
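
The bug pattern above is a private cached copy of a name that is only
finalized later by device registration. A minimal userspace sketch of
the bug and the fix, with hypothetical stand-in types (the kernel uses
struct net_device and the tunnel's parms structure, not these):

    #include <stdio.h>
    #include <string.h>

    struct parms  { char name[16]; };                  /* private copy */
    struct device { char name[16]; struct parms parms; };

    /* Models register_netdevice(): resolves a "%d" name template. */
    static int register_device(struct device *dev)
    {
        if (strchr(dev->name, '%'))
            snprintf(dev->name, sizeof(dev->name), "tunl%d", 1);
        return 0;
    }

    int main(void)
    {
        struct device dev;

        strcpy(dev.name, "tunl%d");
        strcpy(dev.parms.name, dev.name);  /* copy taken too early */

        register_device(&dev);             /* dev.name becomes "tunl1" */
        printf("stale:  dev=%s parms=%s\n", dev.name, dev.parms.name);

        strcpy(dev.parms.name, dev.name);  /* the fix: re-sync the copy */
        printf("synced: dev=%s parms=%s\n", dev.name, dev.parms.name);
        return 0;
    }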
From c5bebbd132f1d56ef3d99143f5386d58b4da318e Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Wed, 7 Dec 2011 09:02:21 +0100
Subject: mac80211: fix another race in aggregation start

commit 15062e6a8524f5977f2cbdf6e3eb2f144262f74e upstream.

Emmanuel noticed that when mac80211 stops the queues for aggregation
it can leave a packet pending. This packet will be given to the driver
after the AMPDU callback, but as a non-aggregated packet, which messes
up the sequence number etc.

I also noticed by looking at the code that if packets are being
processed while we clear the WANT_START bit, they might see it cleared
already and queue up on tid_tx->pending. If the driver then rejects the
new aggregation session, we leak the packet.

Fix both of these issues by changing this code to not stop the queues
at all. Instead, let packets queue up on the tid_tx->pending queue
instead of letting them get to the driver, and add code to recover
properly in case the driver rejects the session.

(The patch looks large because it has to move two functions to before
their new use.)

Reported-by: Emmanuel Grumbach
Signed-off-by: Johannes Berg
Signed-off-by: John W. Linville
Signed-off-by: Greg Kroah-Hartman
---
 net/mac80211/agg-tx.c | 86 ++++++++++++++++++++++++---------------------------
 1 file changed, 41 insertions(+), 45 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index db7db43ccf42..b7f4f5c1f693 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -304,6 +304,38 @@ ieee80211_wake_queue_agg(struct ieee80211_local *local, int tid)
 	__release(agg_queue);
 }
 
+/*
+ * splice packets from the STA's pending to the local pending,
+ * requires a call to ieee80211_agg_splice_finish later
+ */
+static void __acquires(agg_queue)
+ieee80211_agg_splice_packets(struct ieee80211_local *local,
+			     struct tid_ampdu_tx *tid_tx, u16 tid)
+{
+	int queue = ieee80211_ac_from_tid(tid);
+	unsigned long flags;
+
+	ieee80211_stop_queue_agg(local, tid);
+
+	if (WARN(!tid_tx, "TID %d gone but expected when splicing aggregates"
+			  " from the pending queue\n", tid))
+		return;
+
+	if (!skb_queue_empty(&tid_tx->pending)) {
+		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
+		/* copy over remaining packets */
+		skb_queue_splice_tail_init(&tid_tx->pending,
+					   &local->pending[queue]);
+		spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
+	}
+}
+
+static void __releases(agg_queue)
+ieee80211_agg_splice_finish(struct ieee80211_local *local, u16 tid)
+{
+	ieee80211_wake_queue_agg(local, tid);
+}
+
 void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 {
 	struct tid_ampdu_tx *tid_tx;
@@ -315,19 +347,17 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 	tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
 
 	/*
-	 * While we're asking the driver about the aggregation,
-	 * stop the AC queue so that we don't have to worry
-	 * about frames that came in while we were doing that,
-	 * which would require us to put them to the AC pending
-	 * afterwards which just makes the code more complex.
+	 * Start queuing up packets for this aggregation session.
+	 * We're going to release them once the driver is OK with
+	 * that.
 	 */
-	ieee80211_stop_queue_agg(local, tid);
-
 	clear_bit(HT_AGG_STATE_WANT_START, &tid_tx->state);
 
 	/*
-	 * make sure no packets are being processed to get
-	 * valid starting sequence number
+	 * Make sure no packets are being processed. This ensures that
+	 * we have a valid starting sequence number and that in-flight
+	 * packets have been flushed out and no packets for this TID
+	 * will go into the driver during the ampdu_action call.
 	 */
 	synchronize_net();
 
@@ -341,17 +371,15 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
 					" tid %d\n", tid);
 #endif
 		spin_lock_bh(&sta->lock);
+		ieee80211_agg_splice_packets(local, tid_tx, tid);
 		ieee80211_assign_tid_tx(sta, tid, NULL);
+		ieee80211_agg_splice_finish(local, tid);
 		spin_unlock_bh(&sta->lock);
 
-		ieee80211_wake_queue_agg(local, tid);
 		kfree_rcu(tid_tx, rcu_head);
 		return;
 	}
 
-	/* we can take packets again now */
-	ieee80211_wake_queue_agg(local, tid);
-
 	/* activate the timer for the recipient's addBA response */
 	mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL);
 #ifdef CONFIG_MAC80211_HT_DEBUG
@@ -471,38 +499,6 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
 }
 EXPORT_SYMBOL(ieee80211_start_tx_ba_session);
 
-/*
- * splice packets from the STA's pending to the local pending,
- * requires a call to ieee80211_agg_splice_finish later
- */
-static void __acquires(agg_queue)
-ieee80211_agg_splice_packets(struct ieee80211_local *local,
-			     struct tid_ampdu_tx *tid_tx, u16 tid)
-{
-	int queue = ieee80211_ac_from_tid(tid);
-	unsigned long flags;
-
-	ieee80211_stop_queue_agg(local, tid);
-
-	if (WARN(!tid_tx, "TID %d gone but expected when splicing aggregates"
-			  " from the pending queue\n", tid))
-		return;
-
-	if (!skb_queue_empty(&tid_tx->pending)) {
-		spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
-		/* copy over remaining packets */
-		skb_queue_splice_tail_init(&tid_tx->pending,
-					   &local->pending[queue]);
-		spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
-	}
-}
-
-static void __releases(agg_queue)
-ieee80211_agg_splice_finish(struct ieee80211_local *local, u16 tid)
-{
-	ieee80211_wake_queue_agg(local, tid);
-}
-
 static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
 					 struct sta_info *sta, u16 tid)
 {
--
cgit v1.2.3
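
The recovery path added above parks frames per TID and, when the driver
rejects the session, splices them back onto the device's pending queue
instead of leaking them. A toy userspace sketch of that park-and-splice
idea, with fixed-size array queues standing in for struct sk_buff_head
(names here are illustrative, not mac80211's):

    #include <stdio.h>

    struct queue { int frames[8]; int len; };

    static void enqueue(struct queue *q, int frame)
    {
        q->frames[q->len++] = frame;
    }

    /* Move everything parked on 'pending' to the tail of 'out',
     * leaving 'pending' empty -- skb_queue_splice_tail_init(). */
    static void splice_tail_init(struct queue *pending, struct queue *out)
    {
        for (int i = 0; i < pending->len; i++)
            enqueue(out, pending->frames[i]);
        pending->len = 0;
    }

    int main(void)
    {
        struct queue tid_pending = {0}, dev_pending = {0};
        int driver_rejected = 1;        /* pretend ampdu_action failed */

        /* While the session is negotiated, frames are parked per TID
         * instead of reaching the driver unaggregated. */
        enqueue(&tid_pending, 1);
        enqueue(&tid_pending, 2);

        if (driver_rejected)
            /* Recovery: hand the parked frames back to the device
             * queue so they are transmitted rather than leaked. */
            splice_tail_init(&tid_pending, &dev_pending);

        printf("frames back on device queue: %d\n", dev_pending.len);
        return 0;
    }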
From 9aeeebb28c4bd8c3a2382ff909e82989d22b97a1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Thu, 1 Dec 2011 14:16:17 -0500
Subject: SUNRPC: Ensure we always bump the backlog queue in xprt_free_slot

commit c25573b5134294c0be82bfaecc6d08136835b271 upstream.

Whenever we free a slot, we know that the resulting xprt->num_reqs
will be less than xprt->max_reqs, so we know that we can release at
least one backlogged rpc_task.

Signed-off-by: Trond Myklebust
Signed-off-by: Greg Kroah-Hartman
---
 net/sunrpc/xprt.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index f4385e45a5fc..c64c0ef519b5 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -995,13 +995,11 @@ out_init_req:
 
 static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req)
 {
-	if (xprt_dynamic_free_slot(xprt, req))
-		return;
-
-	memset(req, 0, sizeof(*req));	/* mark unused */
-
 	spin_lock(&xprt->reserve_lock);
-	list_add(&req->rq_list, &xprt->free);
+	if (!xprt_dynamic_free_slot(xprt, req)) {
+		memset(req, 0, sizeof(*req));	/* mark unused */
+		list_add(&req->rq_list, &xprt->free);
+	}
 	rpc_wake_up_next(&xprt->backlog);
 	spin_unlock(&xprt->reserve_lock);
 }
--
cgit v1.2.3

From 3515cb57df914810fc5c0cca5037e2ad3663c6ad Mon Sep 17 00:00:00 2001
From: Alex Juncu
Date: Thu, 15 Dec 2011 23:01:25 +0000
Subject: llc: llc_cmsg_rcv was getting called after sk_eat_skb.

[ Upstream commit 9cef310fcdee12b49b8b4c96fd8f611c8873d284 ]

Received non-stream protocol packets were calling llc_cmsg_rcv, which
used an skb after that skb was released by sk_eat_skb. This caused
received STP packets to generate kernel panics.

Signed-off-by: Alexandru Juncu
Signed-off-by: Kunjan Naik
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/llc/af_llc.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index dfd3a648a551..a18e6c3d36e3 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -833,15 +833,15 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copied += used;
 		len -= used;
 
+		/* For non stream protcols we get one packet per recvmsg call */
+		if (sk->sk_type != SOCK_STREAM)
+			goto copy_uaddr;
+
 		if (!(flags & MSG_PEEK)) {
 			sk_eat_skb(sk, skb, 0);
 			*seq = 0;
 		}
 
-		/* For non stream protcols we get one packet per recvmsg call */
-		if (sk->sk_type != SOCK_STREAM)
-			goto copy_uaddr;
-
 		/* Partial read */
 		if (used + offset < skb->len)
 			continue;
@@ -857,6 +857,12 @@ copy_uaddr:
 	}
 	if (llc_sk(sk)->cmsg_flags)
 		llc_cmsg_rcv(msg, skb);
+
+	if (!(flags & MSG_PEEK)) {
+		sk_eat_skb(sk, skb, 0);
+		*seq = 0;
+	}
+
 	goto out;
 }
--
cgit v1.2.3
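
The llc fix is a classic free-after-last-use reordering: the skb must
not be eaten until the control-message path has read from it. A minimal
userspace illustration of the same ordering bug, with a hypothetical
packet type standing in for the skb:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct packet { char data[32]; };

    /* Stands in for llc_cmsg_rcv()'s read of the skb. */
    static void read_cmsg(const struct packet *p)
    {
        printf("cmsg from packet: %s\n", p->data);
    }

    int main(void)
    {
        struct packet *pkt = malloc(sizeof(*pkt));

        strcpy(pkt->data, "stp-frame");

        /* Buggy ordering (what the old code did):
         *     free(pkt);       -- sk_eat_skb released the skb
         *     read_cmsg(pkt);  -- llc_cmsg_rcv then dereferenced it
         * That is a use-after-free; in the kernel it panics.
         * The fix is simply to do the last read before the free: */
        read_cmsg(pkt);
        free(pkt);
        return 0;
    }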
From ad24a5287dd16619b31b9c30917615ab1fcb56b0 Mon Sep 17 00:00:00 2001
From: Thomas Graf
Date: Thu, 22 Dec 2011 02:05:07 +0000
Subject: mqprio: Avoid panic if no options are provided

[ Upstream commit 7838f2ce36b6ab5c13ef20b1857e3bbd567f1759 ]

Userspace may not provide TCA_OPTIONS; in fact, tc currently does not
do so if no arguments are specified on the command line. Return EINVAL
instead of panicking.

Signed-off-by: Thomas Graf
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/sched/sch_mqprio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index ea17cbed29ef..59b26b8ff4b0 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -106,7 +106,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
 	if (!netif_is_multiqueue(dev))
 		return -EOPNOTSUPP;
 
-	if (nla_len(opt) < sizeof(*qopt))
+	if (!opt || nla_len(opt) < sizeof(*qopt))
 		return -EINVAL;
 
 	qopt = nla_data(opt);
--
cgit v1.2.3

From 77fe0b97b750a3064e67330b131811edeebbc29b Mon Sep 17 00:00:00 2001
From: Gerlando Falauto
Date: Mon, 19 Dec 2011 22:58:04 +0000
Subject: net: have ipconfig not wait if no dev is available

[ Upstream commit cd7816d14953c8af910af5bb92f488b0b277e29d ]

Previous commit 3fb72f1e6e6165c5f495e8dc11c5bbd14c73385c makes
IP-Config wait for carrier on at least one network device.

Before waiting (predefined value 120s), check that at least one device
was successfully brought up. Otherwise (e.g. a buggy bootloader which
does not set the MAC address) there is no point in waiting for carrier.

Cc: Micha Nelissen
Cc: Holger Brunck
Signed-off-by: Gerlando Falauto
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/ipconfig.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 472a8c4f1dc0..004bb74b41c2 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -252,6 +252,10 @@ static int __init ic_open_devs(void)
 		}
 	}
 
+	/* no point in waiting if we could not bring up at least one device */
+	if (!ic_first_dev)
+		goto have_carrier;
+
 	/* wait for a carrier on at least one device */
 	start = jiffies;
 	while (jiffies - start < msecs_to_jiffies(CONF_CARRIER_TIMEOUT)) {
--
cgit v1.2.3

From 79a25778ca9fbc4ea2bd8c1f32d73bd73a1a3817 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 11 Dec 2011 23:42:53 +0000
Subject: sch_gred: should not use GFP_KERNEL while holding a spinlock

[ Upstream commit 3f1e6d3fd37bd4f25e5b19f1c7ca21850426c33f ]

gred_change_vq() is called under sch_tree_lock(sch). This means a
spinlock is held, and we are not allowed to sleep in this context.

We might pre-allocate memory using GFP_KERNEL before taking the
spinlock, but this is not suitable for stable material.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/sched/sch_gred.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index b9493a09a870..6cd8ddfb512d 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -385,7 +385,7 @@ static inline int gred_change_vq(struct Qdisc *sch, int dp,
 	struct gred_sched_data *q;
 
 	if (table->tab[dp] == NULL) {
-		table->tab[dp] = kzalloc(sizeof(*q), GFP_KERNEL);
+		table->tab[dp] = kzalloc(sizeof(*q), GFP_ATOMIC);
 		if (table->tab[dp] == NULL)
 			return -ENOMEM;
 	}
--
cgit v1.2.3
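
The rule the sch_gred change applies: GFP_KERNEL may sleep to reclaim
memory, and sleeping is forbidden while a spinlock is held; GFP_ATOMIC
never sleeps, it either succeeds from reserves or fails immediately. A
minimal kernel-module-style sketch of the pattern in isolation (the
lock and function names are hypothetical, not from sch_gred):

    #include <linux/module.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(demo_lock);

    static int __init demo_init(void)
    {
        void *p;

        spin_lock_bh(&demo_lock);
        /* GFP_KERNEL here could sleep with the lock held, risking
         * deadlock and "scheduling while atomic" warnings; GFP_ATOMIC
         * is the correct choice inside the critical section. */
        p = kzalloc(64, GFP_ATOMIC);
        spin_unlock_bh(&demo_lock);

        kfree(p);
        return 0;
    }

    static void __exit demo_exit(void)
    {
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");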
From 74631fca684d7a475f1eed5f6823c11b62bbd2bc Mon Sep 17 00:00:00 2001
From: Xi Wang
Date: Fri, 16 Dec 2011 12:44:15 +0000
Subject: sctp: fix incorrect overflow check on autoclose

[ Upstream commit 2692ba61a82203404abd7dd2a027bda962861f74 ]

Commit 8ffd3208 voids the previous patches f6778aab and 810c0719 for
limiting the autoclose value. If userspace passes in -1 on a 32-bit
platform, the overflow check didn't work and autoclose would be set
to 0xffffffff.

This patch defines a max_autoclose (in seconds) for limiting the value
and exposes it through sysctl, with the following intentions:

1) Avoid overflowing autoclose * HZ.

2) Keep the default autoclose bound consistent across 32- and 64-bit
   platforms (INT_MAX / HZ in this patch).

3) Keep the autoclose value consistent between setsockopt() and
   getsockopt() calls.

Suggested-by: Vlad Yasevich
Signed-off-by: Xi Wang
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/sctp/associola.c |  2 +-
 net/sctp/protocol.c  |  3 +++
 net/sctp/socket.c    |  2 --
 net/sctp/sysctl.c    | 13 +++++++++++++
 4 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index dc16b90ddb6f..49814827f81f 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -173,7 +173,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay;
 	asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
-		(unsigned long)sp->autoclose * HZ;
+		min_t(unsigned long, sp->autoclose, sctp_max_autoclose) * HZ;
 
 	/* Initializes the timers */
 	for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 91784f44a2e2..48cb7b98b113 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1285,6 +1285,9 @@ SCTP_STATIC __init int sctp_init(void)
 	sctp_max_instreams = SCTP_DEFAULT_INSTREAMS;
 	sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS;
 
+	/* Initialize maximum autoclose timeout. */
+	sctp_max_autoclose = INT_MAX / HZ;
+
 	/* Initialize handle used for association ids. */
 	idr_init(&sctp_assocs_id);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 836aa63ee121..4760f4e65b80 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2199,8 +2199,6 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval,
 		return -EINVAL;
 	if (copy_from_user(&sp->autoclose, optval, optlen))
 		return -EFAULT;
-	/* make sure it won't exceed MAX_SCHEDULE_TIMEOUT */
-	sp->autoclose = min_t(long, sp->autoclose, MAX_SCHEDULE_TIMEOUT / HZ);
 
 	return 0;
 }
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 6b3952961b85..60ffbd067ff7 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -53,6 +53,10 @@ static int sack_timer_min = 1;
 static int sack_timer_max = 500;
 static int addr_scope_max = 3; /* check sctp_scope_policy_t in include/net/sctp/constants.h for max entries */
 static int rwnd_scale_max = 16;
+static unsigned long max_autoclose_min = 0;
+static unsigned long max_autoclose_max =
+	(MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX)
+	? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ;
 
 extern long sysctl_sctp_mem[3];
 extern int sysctl_sctp_rmem[3];
@@ -258,6 +262,15 @@ static ctl_table sctp_table[] = {
 		.extra1		= &one,
 		.extra2		= &rwnd_scale_max,
 	},
+	{
+		.procname	= "max_autoclose",
+		.data		= &sctp_max_autoclose,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.extra1		= &max_autoclose_min,
+		.extra2		= &max_autoclose_max,
+	},
 
 	{ /* sentinel */ }
 };
--
cgit v1.2.3
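
The overflow being guarded against is reproducible in plain C. A small
userspace demonstration, assuming HZ=1000 for the arithmetic and using
int to model the 32-bit "long" of the voided check:

    #include <stdio.h>
    #include <limits.h>

    #define HZ 1000  /* assumed tick rate for the arithmetic */

    int main(void)
    {
        unsigned int autoclose = (unsigned int)-1;  /* userspace passed -1 */

        /* The voided check: on 32-bit, (long)0xffffffff compares as -1,
         * so min_t(long, ...) keeps it and the field is stored back as
         * 0xffffffff. int models the 32-bit long here. */
        int buggy = (int)autoclose < INT_MAX / HZ ?
                    (int)autoclose : INT_MAX / HZ;
        printf("old clamp: %d, stored back as %#x\n",
               buggy, (unsigned int)buggy);

        /* The fix: clamp against an unsigned bound of INT_MAX / HZ, so
         * the later multiplication by HZ can never overflow. */
        unsigned long max_autoclose = INT_MAX / HZ;
        unsigned long fixed = autoclose < max_autoclose ?
                              autoclose : max_autoclose;
        printf("new clamp: %lu s = %lu jiffies\n", fixed, fixed * HZ);
        return 0;
    }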
From fd4199603bbc8851051b45299b7cf73b07b4daac Mon Sep 17 00:00:00 2001
From: Thomas Graf
Date: Mon, 19 Dec 2011 04:11:40 +0000
Subject: sctp: Do not account for sizeof(struct sk_buff) in estimated rwnd

[ Upstream commit a76c0adf60f6ca5ff3481992e4ea0383776b24d2 ]

When checking whether a DATA chunk fits into the estimated rwnd, a
full sizeof(struct sk_buff) is added to the needed chunk size. This
quickly exhausts the available rwnd space and leads to packets being
sent which are far below the PMTU limit. This can lead to much worse
performance.

The reason for this behaviour was to avoid putting too much memory
pressure on the receiver. The concept is not completely irrational,
because a Linux receiver does in fact clone an skb for each DATA chunk
delivered. However, Linux also reserves half the available socket
buffer space for data structures, therefore usage of it is already
accounted for.

When proposing to change this the last time, it was noted that this
behaviour was introduced to solve a performance issue caused by rwnd
overusage in combination with small DATA chunks. Trying to reproduce
this, I found that with the sk_buff overhead removed, the performance
would improve significantly unless socket buffer limits are increased.

The following numbers have been gathered using a patched iperf
supporting SCTP over a live 1 Gbit ethernet network. The -l option
was used to limit DATA chunk sizes. The numbers listed are based on
the average of 3 test runs each. Default values have been used for
sk_(r|w)mem.

Chunk Size    Unpatched        No Overhead
-------------------------------------------
      4       15.2 Kbit [!]    12.2 Mbit [!]
      8       35.8 Kbit [!]    26.0 Mbit [!]
     16       95.5 Kbit [!]    54.4 Mbit [!]
     32      106.7 Mbit       102.3 Mbit
     64      189.2 Mbit       188.3 Mbit
    128      331.2 Mbit       334.8 Mbit
    256      537.7 Mbit       536.0 Mbit
    512      766.9 Mbit       766.6 Mbit
   1024      810.1 Mbit       808.6 Mbit

Signed-off-by: Thomas Graf
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/sctp/output.c   | 8 +-------
 net/sctp/outqueue.c | 6 ++----
 2 files changed, 3 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sctp/output.c b/net/sctp/output.c
index 08b3cead6503..817174eb5f41 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -697,13 +697,7 @@ static void sctp_packet_append_data(struct sctp_packet *packet,
 	/* Keep track of how many bytes are in flight to the receiver. */
 	asoc->outqueue.outstanding_bytes += datasize;
 
-	/* Update our view of the receiver's rwnd. Include sk_buff overhead
-	 * while updating peer.rwnd so that it reduces the chances of a
-	 * receiver running out of receive buffer space even when receive
-	 * window is still open. This can happen when a sender is sending
-	 * sending small messages.
-	 */
-	datasize += sizeof(struct sk_buff);
+	/* Update our view of the receiver's rwnd. */
 	if (datasize < rwnd)
 		rwnd -= datasize;
 	else
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index a6d27bf563a5..6edd7deb1ad8 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -411,8 +411,7 @@ void sctp_retransmit_mark(struct sctp_outq *q,
 				chunk->transport->flight_size -= sctp_data_size(chunk);
 			q->outstanding_bytes -= sctp_data_size(chunk);
-			q->asoc->peer.rwnd += (sctp_data_size(chunk) +
-						sizeof(struct sk_buff));
+			q->asoc->peer.rwnd += sctp_data_size(chunk);
 		}
 		continue;
 	}
@@ -432,8 +431,7 @@ void sctp_retransmit_mark(struct sctp_outq *q,
 			 * (Section 7.2.4)), add the data size of those
 			 * chunks to the rwnd.
 			 */
-			q->asoc->peer.rwnd += (sctp_data_size(chunk) +
-						sizeof(struct sk_buff));
+			q->asoc->peer.rwnd += sctp_data_size(chunk);
 			q->outstanding_bytes -= sctp_data_size(chunk);
 			if (chunk->transport)
 				transport->flight_size -= sctp_data_size(chunk);
--
cgit v1.2.3
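
The effect of the removed overhead is easy to see with a little
arithmetic: for tiny chunks, the old accounting charged mostly sk_buff
overhead against the peer's advertised window. A quick userspace
calculation, assuming an sk_buff size of roughly 208 bytes (the exact
value varies by kernel version and config) and a 64 KiB window:

    #include <stdio.h>

    int main(void)
    {
        const unsigned skb_overhead = 208;       /* assumed */
        const unsigned rwnd = 64 * 1024;         /* typical window */

        for (unsigned chunk = 4; chunk <= 1024; chunk *= 4) {
            /* old model: every chunk costs chunk + sizeof(sk_buff);
             * new model: a chunk costs only its own data size. */
            unsigned old_charge = chunk + skb_overhead;
            printf("chunk %4u B: old model fits %5u chunks, "
                   "new model fits %5u chunks\n",
                   chunk, rwnd / old_charge, rwnd / chunk);
        }
        return 0;
    }

For 4-byte chunks this is roughly a 50x difference in how many chunks
fit into the window before the sender stalls, which matches the shape
of the throughput table above.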
From 6561c4fa576a9e83f4e2faf7b62dbd1d9b598c39 Mon Sep 17 00:00:00 2001
From: Steffen Klassert
Date: Wed, 21 Dec 2011 16:48:08 -0500
Subject: net: Add a flow_cache_flush_deferred function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[ Upstream commit c0ed1c14a72ca9ebacd51fb94a8aca488b0d361e ]

flow_cache_flush() might sleep but can be called from atomic context
via the xfrm garbage collector. So add a flow_cache_flush_deferred()
function and use it if the xfrm garbage collector is invoked from
within the packet path.

Signed-off-by: Steffen Klassert
Acked-by: Timo Teräs
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/core/flow.c        | 12 ++++++++++++
 net/xfrm/xfrm_policy.c | 18 ++++++++++++++----
 2 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/flow.c b/net/core/flow.c
index 555a456efb07..d6968e5ae75b 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -358,6 +358,18 @@ void flow_cache_flush(void)
 	put_online_cpus();
 }
 
+static void flow_cache_flush_task(struct work_struct *work)
+{
+	flow_cache_flush();
+}
+
+static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
+
+void flow_cache_flush_deferred(void)
+{
+	schedule_work(&flow_cache_flush_work);
+}
+
 static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
 {
 	struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 552df27dcf53..7e088c055cfa 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2276,8 +2276,6 @@ static void __xfrm_garbage_collect(struct net *net)
 {
 	struct dst_entry *head, *next;
 
-	flow_cache_flush();
-
 	spin_lock_bh(&xfrm_policy_sk_bundle_lock);
 	head = xfrm_policy_sk_bundles;
 	xfrm_policy_sk_bundles = NULL;
@@ -2290,6 +2288,18 @@ static void __xfrm_garbage_collect(struct net *net)
 	}
 }
 
+static void xfrm_garbage_collect(struct net *net)
+{
+	flow_cache_flush();
+	__xfrm_garbage_collect(net);
+}
+
+static void xfrm_garbage_collect_deferred(struct net *net)
+{
+	flow_cache_flush_deferred();
+	__xfrm_garbage_collect(net);
+}
+
 static void xfrm_init_pmtu(struct dst_entry *dst)
 {
 	do {
@@ -2420,7 +2430,7 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
 		if (likely(dst_ops->neigh_lookup == NULL))
 			dst_ops->neigh_lookup = xfrm_neigh_lookup;
 		if (likely(afinfo->garbage_collect == NULL))
-			afinfo->garbage_collect = __xfrm_garbage_collect;
+			afinfo->garbage_collect = xfrm_garbage_collect_deferred;
 		xfrm_policy_afinfo[afinfo->family] = afinfo;
 	}
 	write_unlock_bh(&xfrm_policy_afinfo_lock);
@@ -2514,7 +2524,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
 
 	switch (event) {
 	case NETDEV_DOWN:
-		__xfrm_garbage_collect(dev_net(dev));
+		xfrm_garbage_collect(dev_net(dev));
 	}
 	return NOTIFY_DONE;
 }
--
cgit v1.2.3
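
The deferral used above is the standard workqueue pattern: a context
that must not sleep schedules the sleeping work to run later in process
context. A minimal module-style sketch of that pattern in isolation
(names are hypothetical, not from net/core/flow.c):

    #include <linux/module.h>
    #include <linux/workqueue.h>
    #include <linux/spinlock.h>
    #include <linux/delay.h>

    static DEFINE_SPINLOCK(demo_lock);

    /* Runs later in process context, where sleeping is allowed. */
    static void demo_work_fn(struct work_struct *work)
    {
        msleep(10);  /* stands in for the sleeping flow_cache_flush() */
    }

    static DECLARE_WORK(demo_work, demo_work_fn);

    static int __init demo_init(void)
    {
        spin_lock_bh(&demo_lock);  /* atomic context begins */
        /* Calling msleep() (or flow_cache_flush()) here would be a
         * bug; instead, punt the work to the shared workqueue and
         * return immediately. */
        schedule_work(&demo_work);
        spin_unlock_bh(&demo_lock);
        return 0;
    }

    static void __exit demo_exit(void)
    {
        cancel_work_sync(&demo_work);  /* don't leave work pending */
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");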
From 4e1a937bf13f758c7be089f614e7e84bdafcc0af Mon Sep 17 00:00:00 2001
From: Weiping Pan
Date: Thu, 1 Dec 2011 15:47:06 +0000
Subject: ipv4: flush route cache after change accept_local

[ Upstream commit d01ff0a049f749e0bf10a35bb23edd012718c8c2 ]

After resetting ipv4_devconf->data[IPV4_DEVCONF_ACCEPT_LOCAL] to 0, we
should flush the route cache, or it will continue to accept packets
with a local source address, which should be dropped.

Signed-off-by: Weiping Pan
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/devinet.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index bc19bd06dd00..070f214b8865 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1490,7 +1490,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
 			     void __user *buffer,
 			     size_t *lenp, loff_t *ppos)
 {
+	int old_value = *(int *)ctl->data;
 	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+	int new_value = *(int *)ctl->data;
 
 	if (write) {
 		struct ipv4_devconf *cnf = ctl->extra1;
@@ -1501,6 +1503,9 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
 
 		if (cnf == net->ipv4.devconf_dflt)
 			devinet_copy_dflt_conf(net, i);
+		if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1)
+			if ((new_value == 0) && (old_value != 0))
+				rt_cache_flush(net, 0);
 	}
 
 	return ret;
--
cgit v1.2.3
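
The shape of this fix is a snapshot-compare wrapper around a generic
handler: capture the value before, let the handler rewrite it, then act
only on the 1-to-0 transition. A standalone userspace sketch of that
pattern (names are illustrative, not the kernel's):

    #include <stdio.h>

    static int config_value = 1;  /* stands in for the sysctl cell */

    /* Generic handler that may rewrite config_value (proc_dointvec). */
    static int generic_handler(int write, int incoming)
    {
        if (write)
            config_value = incoming;
        return 0;
    }

    static int conf_proc(int write, int incoming)
    {
        int old_value = config_value;                 /* snapshot */
        int ret = generic_handler(write, incoming);
        int new_value = config_value;                 /* compare */

        if (write && old_value != 0 && new_value == 0)
            printf("transition to 0: flush the cache here\n");
        return ret;
    }

    int main(void)
    {
        conf_proc(1, 0);  /* a 1 -> 0 write triggers the flush */
        return 0;
    }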
From ea67f1fb5976ad72d27e45dff42017894f30b563 Mon Sep 17 00:00:00 2001
From: "David S. Miller"
Date: Tue, 13 Dec 2011 17:35:06 -0500
Subject: ipv6: Check dest prefix length on original route not copied one in rt6_alloc_cow().

[ Upstream commit bb3c36863e8001fc21a88bebfdead4da4c23e848 ]

After commit 8e2ec639173f325977818c45011ee176ef2b11f6 ("ipv6: don't
use inetpeer to store metrics for routes.") the test in rt6_alloc_cow()
for setting the ANYCAST flag is now wrong.

'rt' will always now have a plen of 128, because it is set explicitly
to 128 by ip6_rt_copy.

So to restore the semantics of the test, check the destination prefix
length of 'ort'.

Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 57b82dc1ae91..f02fe523bd3a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -725,7 +725,7 @@ static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
 		int attempts = !in_softirq();
 
 		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
-			if (rt->rt6i_dst.plen != 128 &&
+			if (ort->rt6i_dst.plen != 128 &&
 			    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
 				rt->rt6i_flags |= RTF_ANYCAST;
 			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
--
cgit v1.2.3

From d23270aae3214c26691f95922a63c70549232c22 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Thu, 22 Dec 2011 04:15:53 +0000
Subject: net: introduce DST_NOPEER dst flag

[ Upstream commit e688a604807647c9450f9c12a7cb6d027150a895 ]

Chris Boot reported crashes occurring in ipv6_select_ident().

[  461.457562] RIP: 0010:[] [] ipv6_select_ident+0x31/0xa7
[  461.578229] Call Trace:
[  461.580742]
[  461.582870] [] ? udp6_ufo_fragment+0x124/0x1a2
[  461.589054] [] ? ipv6_gso_segment+0xc0/0x155
[  461.595140] [] ? skb_gso_segment+0x208/0x28b
[  461.601198] [] ? ipv6_confirm+0x146/0x15e [nf_conntrack_ipv6]
[  461.608786] [] ? nf_iterate+0x41/0x77
[  461.614227] [] ? dev_hard_start_xmit+0x357/0x543
[  461.620659] [] ? nf_hook_slow+0x73/0x111
[  461.626440] [] ? br_parse_ip_options+0x19a/0x19a [bridge]
[  461.633581] [] ? dev_queue_xmit+0x3af/0x459
[  461.639577] [] ? br_dev_queue_push_xmit+0x72/0x76 [bridge]
[  461.646887] [] ? br_nf_post_routing+0x17d/0x18f [bridge]
[  461.653997] [] ? nf_iterate+0x41/0x77
[  461.659473] [] ? br_flood+0xfa/0xfa [bridge]
[  461.665485] [] ? nf_hook_slow+0x73/0x111
[  461.671234] [] ? br_flood+0xfa/0xfa [bridge]
[  461.677299] [] ? nf_bridge_update_protocol+0x20/0x20 [bridge]
[  461.684891] [] ? nf_ct_zone+0xa/0x17 [nf_conntrack]
[  461.691520] [] ? br_flood+0xfa/0xfa [bridge]
[  461.697572] [] ? NF_HOOK.constprop.8+0x3c/0x56 [bridge]
[  461.704616] [] ? nf_bridge_push_encap_header+0x1c/0x26 [bridge]
[  461.712329] [] ? br_nf_forward_finish+0x8a/0x95 [bridge]
[  461.719490] [] ? nf_bridge_pull_encap_header+0x1c/0x27 [bridge]
[  461.727223] [] ? br_nf_forward_ip+0x1c0/0x1d4 [bridge]
[  461.734292] [] ? nf_iterate+0x41/0x77
[  461.739758] [] ? __br_deliver+0xa0/0xa0 [bridge]
[  461.746203] [] ? nf_hook_slow+0x73/0x111
[  461.751950] [] ? __br_deliver+0xa0/0xa0 [bridge]
[  461.758378] [] ? NF_HOOK.constprop.4+0x56/0x56 [bridge]

This is caused by bridge netfilter's special dst_entry (fake_rtable),
a special shared entry, where attaching an inetpeer makes no sense.

Problem is present since commit 87c48fa3b46 (ipv6: make fragment
identifications less predictable).

Introduce DST_NOPEER dst flag and make sure ipv6_select_ident() and
__ip_select_ident() fall back to the 'no peer attached' handling.

Reported-by: Chris Boot
Tested-by: Chris Boot
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/bridge/br_netfilter.c | 2 +-
 net/ipv4/route.c          | 4 ++--
 net/ipv6/ip6_output.c     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index d6ec3720c77e..5693e5f9e03e 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -141,7 +141,7 @@ void br_netfilter_rtable_init(struct net_bridge *br)
 	rt->dst.dev = br->dev;
 	rt->dst.path = &rt->dst;
 	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
-	rt->dst.flags = DST_NOXFRM;
+	rt->dst.flags = DST_NOXFRM | DST_NOPEER;
 	rt->dst.ops = &fake_dst_ops;
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 05ac666c3301..4e9bfc6c0c1d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1272,7 +1272,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 {
 	struct rtable *rt = (struct rtable *) dst;
 
-	if (rt) {
+	if (rt && !(rt->dst.flags & DST_NOPEER)) {
 		if (rt->peer == NULL)
 			rt_bind_peer(rt, rt->rt_dst, 1);
 
@@ -1283,7 +1283,7 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 			iph->id = htons(inet_getid(rt->peer, more));
 			return;
 		}
-	} else
+	} else if (!rt)
 		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
 		       __builtin_return_address(0));
 
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 4c882cf4e8a1..55a35c1dede1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -606,7 +606,7 @@ void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
 	static atomic_t ipv6_fragmentation_id;
 	int old, new;
 
-	if (rt) {
+	if (rt && !(rt->dst.flags & DST_NOPEER)) {
 		struct inet_peer *peer;
 
 		if (!rt->rt6i_peer)
--
cgit v1.2.3

From 44f6d7e64fc76fe710b95afee4fee984c19d0e64 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Wed, 21 Dec 2011 15:47:16 -0500
Subject: ipv4: reintroduce route cache garbage collector

[ Upstream commit 9f28a2fc0bd77511f649c0a788c7bf9a5fd04edb ]

Commit 2c8cec5c10b (ipv4: Cache learned PMTU information in inetpeer)
removed the IP route cache garbage collector a bit too soon, as this
gc was responsible for cleaning up expired routes and releasing their
neighbour references.

As pointed out by Robert Gladewitz, recent kernels can fill and
exhaust their neighbour cache.

Reintroduce the garbage collection, since we'll have to wait until our
neighbour lookups become refcount-less before we can stop depending on
this.

Reported-by: Robert Gladewitz
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/route.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4e9bfc6c0c1d..75fc6d5faa9d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -134,6 +134,9 @@ static int ip_rt_min_advmss __read_mostly = 256;
 static int rt_chain_length_max __read_mostly = 20;
 static int redirect_genid;
 
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
+
 /*
  *	Interface to generic destination cache.
  */
@@ -831,6 +834,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
 	return ONE;
 }
 
+static void rt_check_expire(void)
+{
+	static unsigned int rover;
+	unsigned int i = rover, goal;
+	struct rtable *rth;
+	struct rtable __rcu **rthp;
+	unsigned long samples = 0;
+	unsigned long sum = 0, sum2 = 0;
+	unsigned long delta;
+	u64 mult;
+
+	delta = jiffies - expires_ljiffies;
+	expires_ljiffies = jiffies;
+	mult = ((u64)delta) << rt_hash_log;
+	if (ip_rt_gc_timeout > 1)
+		do_div(mult, ip_rt_gc_timeout);
+	goal = (unsigned int)mult;
+	if (goal > rt_hash_mask)
+		goal = rt_hash_mask + 1;
+	for (; goal > 0; goal--) {
+		unsigned long tmo = ip_rt_gc_timeout;
+		unsigned long length;
+
+		i = (i + 1) & rt_hash_mask;
+		rthp = &rt_hash_table[i].chain;
+
+		if (need_resched())
+			cond_resched();
+
+		samples++;
+
+		if (rcu_dereference_raw(*rthp) == NULL)
+			continue;
+		length = 0;
+		spin_lock_bh(rt_hash_lock_addr(i));
+		while ((rth = rcu_dereference_protected(*rthp,
+					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
+			prefetch(rth->dst.rt_next);
+			if (rt_is_expired(rth)) {
+				*rthp = rth->dst.rt_next;
+				rt_free(rth);
+				continue;
+			}
+			if (rth->dst.expires) {
+				/* Entry is expired even if it is in use */
+				if (time_before_eq(jiffies, rth->dst.expires)) {
+nofree:
+					tmo >>= 1;
+					rthp = &rth->dst.rt_next;
+					/*
+					 * We only count entries on
+					 * a chain with equal hash inputs once
+					 * so that entries for different QOS
+					 * levels, and other non-hash input
+					 * attributes don't unfairly skew
+					 * the length computation
+					 */
+					length += has_noalias(rt_hash_table[i].chain, rth);
+					continue;
+				}
+			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+				goto nofree;
+
+			/* Cleanup aged off entries. */
+			*rthp = rth->dst.rt_next;
+			rt_free(rth);
+		}
+		spin_unlock_bh(rt_hash_lock_addr(i));
+		sum += length;
+		sum2 += length*length;
+	}
+	if (samples) {
+		unsigned long avg = sum / samples;
+		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+		rt_chain_length_max = max_t(unsigned long,
+					    ip_rt_gc_elasticity,
+					    (avg + 4*sd) >> FRACT_BITS);
+	}
+	rover = i;
+}
+
+/*
+ * rt_worker_func() is run in process context.
+ * we call rt_check_expire() to scan part of the hash table
+ */
+static void rt_worker_func(struct work_struct *work)
+{
+	rt_check_expire();
+	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
+}
+
 /*
  * Perturbation of rt_genid by a small quantity [1..256]
  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
@@ -3175,6 +3269,13 @@ static ctl_table ipv4_route_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "gc_interval",
+		.data		= &ip_rt_gc_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_jiffies,
+	},
 	{
 		.procname	= "redirect_load",
 		.data		= &ip_rt_redirect_load,
@@ -3385,6 +3486,11 @@ int __init ip_rt_init(void)
 	devinet_init();
 	ip_fib_init();
 
+	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+	expires_ljiffies = jiffies;
+	schedule_delayed_work(&expires_work,
+		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
+
 	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
--
cgit v1.2.3
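
The scan-rate logic in rt_check_expire() aims to sweep the whole hash
table once per gc_timeout, scanning a slice proportional to the time
elapsed since the last pass: goal = delta * table_size / gc_timeout.
A standalone arithmetic sketch with assumed table-size and interval
values (the real ones are runtime-dependent):

    #include <stdio.h>

    int main(void)
    {
        /* Assumed: 2^17 buckets, gc_timeout of 300 s at HZ=1000. */
        const unsigned rt_hash_log = 17;
        const unsigned long rt_hash_mask = (1UL << rt_hash_log) - 1;
        const unsigned long ip_rt_gc_timeout = 300UL * 1000;  /* jiffies */
        unsigned long delta = 60UL * 1000;  /* jiffies since last run */

        /* mult = delta << rt_hash_log, then divide by gc_timeout,
         * exactly as the kernel does with do_div(). */
        unsigned long long mult = (unsigned long long)delta << rt_hash_log;
        unsigned long goal = (unsigned long)(mult / ip_rt_gc_timeout);
        if (goal > rt_hash_mask)
            goal = rt_hash_mask + 1;

        /* 60 s of a 300 s timeout -> scan one fifth of the table. */
        printf("scan %lu of %lu buckets this pass\n",
               goal, rt_hash_mask + 1);
        return 0;
    }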
From e3b22a8ecc595fd80bc3cd84d32e7d4ee7fe9ad8 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Thu, 22 Dec 2011 17:03:29 +1100
Subject: ipv4: using prefetch requires including prefetch.h

[ Upstream commit b9eda06f80b0db61a73bd87c6b0eb67d8aca55ad ]

Signed-off-by: Stephen Rothwell
Acked-by: Eric Dumazet
Acked-by: David Miller
Signed-off-by: Linus Torvalds
Signed-off-by: Greg Kroah-Hartman
---
 net/ipv4/route.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 75fc6d5faa9d..b5638545debc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -91,6 +91,7 @@
 #include
 #include
 #include
+#include <linux/prefetch.h>
 #include
 #include
 #include
--
cgit v1.2.3

From dd9f9823b61ce894163433380ffcfc28eaf6e9c5 Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Mon, 9 Jan 2012 14:06:46 -0800
Subject: igmp: Avoid zero delay when receiving odd mixture of IGMP queries

commit a8c1f65c79cbbb2f7da782d4c9d15639a9b94b27 upstream.

Commit 5b7c84066733c5dfb0e4016d939757b38de189e4 ('ipv4: correct IGMP
behavior on v3 query during v2-compatibility mode') added yet another
case for query parsing, which can result in max_delay = 0. Substitute
a value of 1, as in the usual v3 case.

Reported-by: Simon McVittie
References: http://bugs.debian.org/654876
Signed-off-by: Ben Hutchings
Signed-off-by: David S. Miller
---
 net/ipv4/igmp.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d577199eabd5..e0d42dbb33fe 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -875,6 +875,8 @@ static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
 		 * to be intended in a v3 query.
 		 */
 		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+		if (!max_delay)
+			max_delay = 1;	/* can't mod w/ 0 */
 	} else { /* v3 */
 		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
 			return;
--
cgit v1.2.3
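
The zero case guarded against here falls straight out of the Max Resp
Code decoding. A userspace re-implementation of the IGMPV3_MRC() macro
(mirroring the decoding in include/linux/igmp.h) showing how a query
code of 0 yields max_delay = 0, which later code uses as a modulus:

    #include <stdio.h>

    /* IGMPv3 Max Resp Code: values below 128 are literal, larger ones
     * are a mantissa/exponent encoding. Re-implemented here purely for
     * illustration. */
    #define IGMPV3_MRC(value) ((value) < 128 ? (value) : \
        ((((value) & 0x0f) | 0x10) << ((((value) >> 4) & 0x07) + 3)))

    #define HZ 1000              /* assumed tick rate */
    #define IGMP_TIMER_SCALE 10  /* timers are in 1/10 s units */

    int main(void)
    {
        unsigned char code = 0;  /* query advertising zero response time */
        int max_delay = IGMPV3_MRC(code) * (HZ / IGMP_TIMER_SCALE);

        if (!max_delay)
            max_delay = 1;  /* the fix: later code computes "% max_delay" */
        printf("max_delay = %d jiffies\n", max_delay);
        return 0;
    }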