From f230d1e891ba1da5953460516960894154f265db Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:06 -0700 Subject: ipv6: Rename the dst_cache helper functions in ip6_tunnel It is a prep work to fix the dst_entry refcnt bugs in ip6_tunnel. This patch rename: 1. ip6_tnl_dst_check() to ip6_tnl_dst_get() to better reflect that it will take a dst refcnt in the next patch. 2. ip6_tnl_dst_store() to ip6_tnl_dst_set() to have a more conventional name matching with ip6_tnl_dst_get(). Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index b8529aa1dae7..979b081a47e8 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -60,9 +60,9 @@ struct ipv6_tlv_tnl_enc_lim { __u8 encap_limit; /* tunnel encapsulation limit */ } __packed; -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t); +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); void ip6_tnl_dst_reset(struct ip6_tnl *t); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst); +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr); int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, -- cgit v1.2.3 From cdf3464e6c6bd764277cbbe992cd12da735b92fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:07 -0700 Subject: ipv6: Fix dst_entry refcnt bugs in ip6_tunnel Problems in the current dst_entry cache in the ip6_tunnel: 1. ip6_tnl_dst_set is racy. There is no lock to protect it: - One major problem is that the dst refcnt gets messed up. F.e. the same dst_cache can be released multiple times and then triggering the infamous dst refcnt < 0 warning message. - Another issue is the inconsistency between dst_cache and dst_cookie. It can be reproduced by adding and removing the ip6gre tunnel while running a super_netperf TCP_CRR test. 2. ip6_tnl_dst_get does not take the dst refcnt before returning the dst. This patch: 1. Create a percpu dst_entry cache in ip6_tnl 2. Use a spinlock to protect the dst_cache operations 3. ip6_tnl_dst_get always takes the dst refcnt before returning Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 979b081a47e8..60b4f402f78c 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -32,6 +32,12 @@ struct __ip6_tnl_parm { __be32 o_key; }; +struct ip6_tnl_dst { + spinlock_t lock; + struct dst_entry *dst; + u32 cookie; +}; + /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ @@ -39,8 +45,7 @@ struct ip6_tnl { struct net *net; /* netns for packet i/o */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct flowi fl; /* flowi template for xmit */ - struct dst_entry *dst_cache; /* cached dst */ - u32 dst_cookie; + struct ip6_tnl_dst __percpu *dst_cache; /* cached dst */ int err_count; unsigned long err_time; @@ -61,6 +66,8 @@ struct ipv6_tlv_tnl_enc_lim { } __packed; struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); +int ip6_tnl_dst_init(struct ip6_tnl *t); +void ip6_tnl_dst_destroy(struct ip6_tnl *t); void ip6_tnl_dst_reset(struct ip6_tnl *t); void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, -- cgit v1.2.3 From 70da5b5c532f0ec8aa76b4f46158da5f010f34b3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 15 Sep 2015 14:30:09 -0700 Subject: ipv6: Replace spinlock with seqlock and rcu in ip6_tunnel This patch uses a seqlock to ensure consistency between idst->dst and idst->cookie. It also makes dst freeing from fib tree to undergo a rcu grace period. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 60b4f402f78c..65c2a9397b3c 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -33,8 +33,8 @@ struct __ip6_tnl_parm { }; struct ip6_tnl_dst { - spinlock_t lock; - struct dst_entry *dst; + seqlock_t lock; + struct dst_entry __rcu *dst; u32 cookie; }; -- cgit v1.2.3 From 37a1d3611c126fd9782ce5235791f898f053e763 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Sun, 13 Sep 2015 10:18:33 -0700 Subject: ipv6: include NLM_F_REPLACE in route replace notifications This patch adds NLM_F_REPLACE flag to ipv6 route replace notifications. This makes nlm_flags in ipv6 replace notifications consistent with ipv4. Signed-off-by: Roopa Prabhu Acked-by: Nicolas Dichtel Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 063d30474cf6..aaf9700fc9e5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -275,7 +275,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc); int fib6_del(struct rt6_info *rt, struct nl_info *info); -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info); +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, + unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); -- cgit v1.2.3 From 58189ca7b27411c3dc9a5cb9eeee0906da684c59 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 15 Sep 2015 15:10:50 -0700 Subject: net: Fix vti use case with oif in dst lookups Steffen reported that the recent change to add oif to dst lookups breaks the VTI use case. The problem is that with the oif set in the flow struct the comparison to the nh_oif is triggered. Fix by splitting the FLOWI_FLAG_VRFSRC into 2 flags -- one that triggers the vrf device cache bypass (FLOWI_FLAG_VRFSRC) and another telling the lookup to not compare nh oif (FLOWI_FLAG_SKIP_NH_OIF). Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") Signed-off-by: David Ahern Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/flow.h | 1 + include/net/route.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/flow.h b/include/net/flow.h index acd6a096250e..9b85db85f13c 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -35,6 +35,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_VRFSRC 0x04 +#define FLOWI_FLAG_SKIP_NH_OIF 0x08 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; diff --git a/include/net/route.h b/include/net/route.h index cc61cb95f059..f46af256880c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -255,7 +255,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 flow_flags |= FLOWI_FLAG_ANYSRC; if (netif_index_is_vrf(sock_net(sk), oif)) - flow_flags |= FLOWI_FLAG_VRFSRC; + flow_flags |= FLOWI_FLAG_VRFSRC | FLOWI_FLAG_SKIP_NH_OIF; flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, protocol, flow_flags, dst, src, dport, sport); -- cgit v1.2.3 From 0315e382704817b279e5693dca8ab9d89aa20b3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nikola=20Forr=C3=B3?= Date: Thu, 17 Sep 2015 16:01:32 +0200 Subject: net: Fix behaviour of unreachable, blackhole and prohibit routes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Man page of ip-route(8) says following about route types: unreachable - these destinations are unreachable. Packets are dis‐ carded and the ICMP message host unreachable is generated. The local senders get an EHOSTUNREACH error. blackhole - these destinations are unreachable. Packets are dis‐ carded silently. The local senders get an EINVAL error. prohibit - these destinations are unreachable. Packets are discarded and the ICMP message communication administratively prohibited is generated. The local senders get an EACCES error. In the inet6 address family, this was correct, except the local senders got ENETUNREACH error instead of EHOSTUNREACH in case of unreachable route. In the inet address family, all three route types generated ICMP message net unreachable, and the local senders got ENETUNREACH error. In both address families all three route types now behave consistently with documentation. Signed-off-by: Nikola Forró Signed-off-by: David S. Miller --- include/net/ip_fib.h | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a37d0432bebd..727d6e9a9685 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -236,8 +236,11 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); - if (tb && !fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF)) - err = 0; + if (tb) + err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); + + if (err == -EAGAIN) + err = -ENETUNREACH; rcu_read_unlock(); @@ -258,7 +261,7 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; - int err; + int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) @@ -268,15 +271,20 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, res->tclassid = 0; - for (err = 0; !err; err = -ENETUNREACH) { - tb = rcu_dereference_rtnl(net->ipv4.fib_main); - if (tb && !fib_table_lookup(tb, flp, res, flags)) - break; + tb = rcu_dereference_rtnl(net->ipv4.fib_main); + if (tb) + err = fib_table_lookup(tb, flp, res, flags); + + if (!err) + goto out; + + tb = rcu_dereference_rtnl(net->ipv4.fib_default); + if (tb) + err = fib_table_lookup(tb, flp, res, flags); - tb = rcu_dereference_rtnl(net->ipv4.fib_default); - if (tb && !fib_table_lookup(tb, flp, res, flags)) - break; - } +out: + if (err == -EAGAIN) + err = -ENETUNREACH; rcu_read_unlock(); -- cgit v1.2.3 From 83cf9a2521b0934a5f9d04082c9bb4f554fddcd4 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 18 Sep 2015 11:47:41 +0200 Subject: ip6tunnel: make rx/tx bytes counters consistent Like the previous patch, which fixes ipv4 tunnels, here is the ipv6 part. Before the patch, the external ipv6 header + gre header were included on tx. After the patch: $ ping -c1 192.168.6.121 ; ip -s l ls dev ip6gre1 PING 192.168.6.121 (192.168.6.121) 56(84) bytes of data. 64 bytes from 192.168.6.121: icmp_req=1 ttl=64 time=1.92 ms --- 192.168.6.121 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 1.923/1.923/1.923/0.000 ms 7: ip6gre1@NONE: mtu 1440 qdisc noqueue state UNKNOWN mode DEFAULT group default link/gre6 20:01:06:60:30:08:c1:c3:00:00:00:00:00:00:01:23 peer 20:01:06:60:30:08:c1:c3:00:00:00:00:00:00:01:21 RX: bytes packets errors dropped overrun mcast 84 1 0 0 0 0 TX: bytes packets errors dropped carrier collsns 84 1 0 0 0 0 $ ping -c1 192.168.1.121 ; ip -s l ls dev ip6tnl1 PING 192.168.1.121 (192.168.1.121) 56(84) bytes of data. 64 bytes from 192.168.1.121: icmp_req=1 ttl=64 time=2.28 ms --- 192.168.1.121 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 2.288/2.288/2.288/0.000 ms 8: ip6tnl1@NONE: mtu 1452 qdisc noqueue state UNKNOWN mode DEFAULT group default link/tunnel6 2001:660:3008:c1c3::123 peer 2001:660:3008:c1c3::121 RX: bytes packets errors dropped overrun mcast 84 1 0 0 0 0 TX: bytes packets errors dropped carrier collsns 84 1 0 0 0 0 Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 65c2a9397b3c..fa915fa0f703 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -86,7 +86,7 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, struct net_device_stats *stats = &dev->stats; int pkt_len, err; - pkt_len = skb->len; + pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out_sk(sk, skb); if (net_xmit_eval(err) == 0) { -- cgit v1.2.3 From ed2e923945892a8372ab70d2f61d364b0b6d9054 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 19 Sep 2015 09:08:34 -0700 Subject: tcp/dccp: fix timewait races in timer handling When creating a timewait socket, we need to arm the timer before allowing other cpus to find it. The signal allowing cpus to find the socket is setting tw_refcnt to non zero value. As we set tw_refcnt in __inet_twsk_hashdance(), we therefore need to call inet_twsk_schedule() first. This also means we need to remove tw_refcnt changes from inet_twsk_schedule() and let the caller handle it. Note that because we use mod_timer_pinned(), we have the guarantee the timer wont expire before we set tw_refcnt as we run in BH context. To make things more readable I introduced inet_twsk_reschedule() helper. When rearming the timer, we can use mod_timer_pending() to make sure we do not rearm a canceled timer. Note: This bug can possibly trigger if packets of a flow can hit multiple cpus. This does not normally happen, unless flow steering is broken somehow. This explains this bug was spotted ~5 months after its introduction. A similar fix is needed for SYN_RECV sockets in reqsk_queue_hash_req(), but will be provided in a separate patch for proper tracking. Fixes: 789f558cfb36 ("tcp/dccp: get rid of central timewait timer") Signed-off-by: Eric Dumazet Reported-by: Ying Cai Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 879d6e5a973b..186f3a1e1b1f 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -110,7 +110,19 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, + bool rearm); + +static void inline inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, false); +} + +static void inline inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo) +{ + __inet_twsk_schedule(tw, timeo, true); +} + void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); void inet_twsk_purge(struct inet_hashinfo *hashinfo, -- cgit v1.2.3 From 63d008a4e9ee86614ca5671b7f3ba447df007190 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 22 Sep 2015 18:12:11 +0200 Subject: ipv4: send arp replies to the correct tunnel When using ip lwtunnels, the additional data for xmit (basically, the actual tunnel to use) are carried in ip_tunnel_info either in dst->lwtstate or in metadata dst. When replying to ARP requests, we need to send the reply to the same tunnel the request came from. This means we need to construct proper metadata dst for ARP replies. We could perform another route lookup to get a dst entry with the correct lwtstate. However, this won't always ensure that the outgoing tunnel is the same as the incoming one, and it won't work anyway for IPv4 duplicate address detection. The only thing to do is to "reverse" the ip_tunnel_info. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9a6a3ba888e8..f6dafec9102c 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -276,6 +276,8 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto); int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, u8 tos, u8 ttl, __be16 df, bool xnet); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, int gso_type_mask); -- cgit v1.2.3