diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-24 16:49:49 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-24 16:49:49 -0700 |
commit | e0456717e483bb8a9431b80a5bdc99a928b9b003 (patch) | |
tree | 5eb5add2bafd1f20326d70f5cb3b711d00a40b10 /net/ipv6/route.c | |
parent | 98ec21a01896751b673b6c731ca8881daa8b2c6d (diff) | |
parent | 1ea2d020ba477cb7011a7174e8501a9e04a325d4 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Add TX fast path in mac80211, from Johannes Berg.
2) Add TSO/GRO support to ibmveth, from Thomas Falcon
3) Move away from cached routes in ipv6, just like ipv4, from Martin
KaFai Lau.
4) Lots of new rhashtable tests, from Thomas Graf.
5) Run ingress qdisc lockless, from Alexei Starovoitov.
6) Allow servers to fetch TCP packet headers for SYN packets of new
connections, for fingerprinting. From Eric Dumazet.
7) Add mode parameter to pktgen, for testing receive. From Alexei
Starovoitov.
8) Cache access optimizations via simplifications of build_skb(), from
Alexander Duyck.
9) Move page frag allocator under mm/, also from Alexander.
10) Add xmit_more support to hv_netvsc, from KY Srinivasan.
11) Add a counter guard in case we try to perform endless reclassify
loops in the packet scheduler.
12) Extern flow dissector to be programmable and use it in new "Flower"
classifier. From Jiri Pirko.
13) AF_PACKET fanout rollover fixes, performance improvements, and new
statistics. From Willem de Bruijn.
14) Add netdev driver for GENEVE tunnels, from John W Linville.
15) Add ingress netfilter hooks and filtering, from Pablo Neira Ayuso.
16) Fix handling of epoll edge triggers in TCP, from Eric Dumazet.
17) Add an ECN retry fallback for the initial TCP handshake, from Daniel
Borkmann.
18) Add tail call support to BPF, from Alexei Starovoitov.
19) Add several pktgen helper scripts, from Jesper Dangaard Brouer.
20) Add zerocopy support to AF_UNIX, from Hannes Frederic Sowa.
21) Favor even port numbers for allocation to connect() requests, and
odd port numbers for bind(0), in an effort to help avoid
ip_local_port_range exhaustion. From Eric Dumazet.
22) Add Cavium ThunderX driver, from Sunil Goutham.
23) Allow bpf programs to access skb_iif and dev->ifindex SKB metadata,
from Alexei Starovoitov.
24) Add support for T6 chips in cxgb4vf driver, from Hariprasad Shenai.
25) Double TCP Small Queues default to 256K to accomodate situations
like the XEN driver and wireless aggregation. From Wei Liu.
26) Add more entropy inputs to flow dissector, from Tom Herbert.
27) Add CDG congestion control algorithm to TCP, from Kenneth Klette
Jonassen.
28) Convert ipset over to RCU locking, from Jozsef Kadlecsik.
29) Track and act upon link status of ipv4 route nexthops, from Andy
Gospodarek.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1670 commits)
bridge: vlan: flush the dynamically learned entries on port vlan delete
bridge: multicast: add a comment to br_port_state_selection about blocking state
net: inet_diag: export IPV6_V6ONLY sockopt
stmmac: troubleshoot unexpected bits in des0 & des1
net: ipv4 sysctl option to ignore routes when nexthop link is down
net: track link-status of ipv4 nexthops
net: switchdev: ignore unsupported bridge flags
net: Cavium: Fix MAC address setting in shutdown state
drivers: net: xgene: fix for ACPI support without ACPI
ip: report the original address of ICMP messages
net/mlx5e: Prefetch skb data on RX
net/mlx5e: Pop cq outside mlx5e_get_cqe
net/mlx5e: Remove mlx5e_cq.sqrq back-pointer
net/mlx5e: Remove extra spaces
net/mlx5e: Avoid TX CQE generation if more xmit packets expected
net/mlx5e: Avoid redundant dev_kfree_skb() upon NOP completion
net/mlx5e: Remove re-assignment of wq type in mlx5e_enable_rq()
net/mlx5e: Use skb_shinfo(skb)->gso_segs rather than counting them
net/mlx5e: Static mapping of netdev priv resources to/from netdev TX queues
net/mlx4_en: Use HW counters for rx/tx bytes/packets in PF device
...
Diffstat (limited to 'net/ipv6/route.c')
-rw-r--r-- | net/ipv6/route.c | 561 |
1 files changed, 387 insertions, 174 deletions
diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c73ae5039e46..1a1122a6bbf5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -72,8 +72,7 @@ enum rt6_nud_state { RT6_NUD_SUCCEED = 1 }; -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest); +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort); static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); static unsigned int ip6_mtu(const struct dst_entry *dst); @@ -92,6 +91,7 @@ static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); +static void rt6_dst_from_metrics_check(struct rt6_info *rt); static int rt6_score_route(struct rt6_info *rt, int oif, int strict); #ifdef CONFIG_IPV6_ROUTE_INFO @@ -104,65 +104,82 @@ static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *gwaddr, int ifindex); #endif -static void rt6_bind_peer(struct rt6_info *rt, int create) +struct uncached_list { + spinlock_t lock; + struct list_head head; +}; + +static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); + +static void rt6_uncached_list_add(struct rt6_info *rt) { - struct inet_peer_base *base; - struct inet_peer *peer; + struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); - base = inetpeer_base_ptr(rt->_rt6i_peer); - if (!base) - return; + rt->dst.flags |= DST_NOCACHE; + rt->rt6i_uncached_list = ul; + + spin_lock_bh(&ul->lock); + list_add_tail(&rt->rt6i_uncached, &ul->head); + spin_unlock_bh(&ul->lock); +} - peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create); - if (peer) { - if (!rt6_set_peer(rt, peer)) - inet_putpeer(peer); +static void rt6_uncached_list_del(struct rt6_info *rt) +{ + if (!list_empty(&rt->rt6i_uncached)) { + struct uncached_list *ul = rt->rt6i_uncached_list; + + spin_lock_bh(&ul->lock); + list_del(&rt->rt6i_uncached); + spin_unlock_bh(&ul->lock); } } -static struct inet_peer *__rt6_get_peer(struct rt6_info *rt, int create) +static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { - if (rt6_has_peer(rt)) - return rt6_peer_ptr(rt); + struct net_device *loopback_dev = net->loopback_dev; + int cpu; + + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + struct rt6_info *rt; + + spin_lock_bh(&ul->lock); + list_for_each_entry(rt, &ul->head, rt6i_uncached) { + struct inet6_dev *rt_idev = rt->rt6i_idev; + struct net_device *rt_dev = rt->dst.dev; + + if (rt_idev && (rt_idev->dev == dev || !dev) && + rt_idev->dev != loopback_dev) { + rt->rt6i_idev = in6_dev_get(loopback_dev); + in6_dev_put(rt_idev); + } - rt6_bind_peer(rt, create); - return (rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL); + if (rt_dev && (rt_dev == dev || !dev) && + rt_dev != loopback_dev) { + rt->dst.dev = loopback_dev; + dev_hold(rt->dst.dev); + dev_put(rt_dev); + } + } + spin_unlock_bh(&ul->lock); + } } -static struct inet_peer *rt6_get_peer_create(struct rt6_info *rt) +static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) { - return __rt6_get_peer(rt, 1); + return dst_metrics_write_ptr(rt->dst.from); } static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) { - struct rt6_info *rt = (struct rt6_info *) dst; - struct inet_peer *peer; - u32 *p = NULL; + struct rt6_info *rt = (struct rt6_info *)dst; - if (!(rt->dst.flags & DST_HOST)) + if (rt->rt6i_flags & RTF_PCPU) + return rt6_pcpu_cow_metrics(rt); + else if (rt->rt6i_flags & RTF_CACHE) + return NULL; + else return dst_cow_metrics_generic(dst, old); - - peer = rt6_get_peer_create(rt); - if (peer) { - u32 *old_p = __DST_METRICS_PTR(old); - unsigned long prev, new; - - p = peer->metrics; - if (inet_metrics_new(peer) || - (old & DST_METRICS_FORCE_OVERWRITE)) - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); - - new = (unsigned long) p; - prev = cmpxchg(&dst->_metrics, old, new); - - if (prev != old) { - p = __DST_METRICS_PTR(prev); - if (prev & DST_METRICS_READ_ONLY) - p = NULL; - } - } - return p; } static inline const void *choose_neigh_daddr(struct rt6_info *rt, @@ -299,10 +316,10 @@ static const struct rt6_info ip6_blk_hole_entry_template = { #endif /* allocate dst with ip6_dst_ops */ -static inline struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags, - struct fib6_table *table) +static struct rt6_info *__ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0, DST_OBSOLETE_FORCE_CHK, flags); @@ -311,21 +328,54 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, struct dst_entry *dst = &rt->dst; memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); - rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); INIT_LIST_HEAD(&rt->rt6i_siblings); + INIT_LIST_HEAD(&rt->rt6i_uncached); + } + return rt; +} + +static struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) +{ + struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table); + + if (rt) { + rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); + if (rt->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **p; + + p = per_cpu_ptr(rt->rt6i_pcpu, cpu); + /* no one shares rt */ + *p = NULL; + } + } else { + dst_destroy((struct dst_entry *)rt); + return NULL; + } } + return rt; } static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; - struct inet6_dev *idev = rt->rt6i_idev; struct dst_entry *from = dst->from; + struct inet6_dev *idev; + + dst_destroy_metrics_generic(dst); - if (!(rt->dst.flags & DST_HOST)) - dst_destroy_metrics_generic(dst); + if (rt->rt6i_pcpu) + free_percpu(rt->rt6i_pcpu); + rt6_uncached_list_del(rt); + + idev = rt->rt6i_idev; if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); @@ -333,11 +383,6 @@ static void ip6_dst_destroy(struct dst_entry *dst) dst->from = NULL; dst_release(from); - - if (rt6_has_peer(rt)) { - struct inet_peer *peer = rt6_peer_ptr(rt); - inet_putpeer(peer); - } } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, @@ -652,15 +697,33 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, u32 metric, int oif, int strict, bool *do_rr) { - struct rt6_info *rt, *match; + struct rt6_info *rt, *match, *cont; int mpri = -1; match = NULL; - for (rt = rr_head; rt && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + cont = NULL; + for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + cont = rt; + break; + } + match = find_match(rt, oif, strict, &mpri, match, do_rr); - for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric; - rt = rt->dst.rt6_next) + } + + for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + if (rt->rt6i_metric != metric) { + cont = rt; + break; + } + + match = find_match(rt, oif, strict, &mpri, match, do_rr); + } + + if (match || !cont) + return match; + + for (rt = cont; rt; rt = rt->dst.rt6_next) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; @@ -694,6 +757,11 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) return match ? match : net->ipv6.ip6_null_entry; } +static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)); +} + #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) @@ -872,9 +940,9 @@ int ip6_ins_rt(struct rt6_info *rt) return __ip6_ins_rt(rt, &info, &mxc); } -static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, - const struct in6_addr *daddr, - const struct in6_addr *saddr) +static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct rt6_info *rt; @@ -882,15 +950,26 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, * Clone the route. */ - rt = ip6_rt_copy(ort, daddr); + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; - if (rt) { + rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, + 0, ort->rt6i_table); + + if (!rt) + return NULL; + + ip6_rt_copy_init(rt, ort); + rt->rt6i_flags |= RTF_CACHE; + rt->rt6i_metric = 0; + rt->dst.flags |= DST_HOST; + rt->rt6i_dst.addr = *daddr; + rt->rt6i_dst.plen = 128; + + if (!rt6_is_gw_or_nonexthop(ort)) { if (ort->rt6i_dst.plen != 128 && ipv6_addr_equal(&ort->rt6i_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; - - rt->rt6i_flags |= RTF_CACHE; - #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { rt->rt6i_src.addr = *saddr; @@ -902,30 +981,65 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, return rt; } -static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, - const struct in6_addr *daddr) +static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) { - struct rt6_info *rt = ip6_rt_copy(ort, daddr); + struct rt6_info *pcpu_rt; - if (rt) - rt->rt6i_flags |= RTF_CACHE; - return rt; + pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev), + rt->dst.dev, rt->dst.flags, + rt->rt6i_table); + + if (!pcpu_rt) + return NULL; + ip6_rt_copy_init(pcpu_rt, rt); + pcpu_rt->rt6i_protocol = rt->rt6i_protocol; + pcpu_rt->rt6i_flags |= RTF_PCPU; + return pcpu_rt; +} + +/* It should be called with read_lock_bh(&tb6_lock) acquired */ +static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +{ + struct rt6_info *pcpu_rt, *prev, **p; + + p = this_cpu_ptr(rt->rt6i_pcpu); + pcpu_rt = *p; + + if (pcpu_rt) + goto done; + + pcpu_rt = ip6_rt_pcpu_alloc(rt); + if (!pcpu_rt) { + struct net *net = dev_net(rt->dst.dev); + + pcpu_rt = net->ipv6.ip6_null_entry; + goto done; + } + + prev = cmpxchg(p, NULL, pcpu_rt); + if (prev) { + /* If someone did it before us, return prev instead */ + dst_destroy(&pcpu_rt->dst); + pcpu_rt = prev; + } + +done: + dst_hold(&pcpu_rt->dst); + rt6_dst_from_metrics_check(pcpu_rt); + return pcpu_rt; } static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt, *nrt; + struct rt6_info *rt; int strict = 0; - int attempts = 3; - int err; strict |= flags & RT6_LOOKUP_F_IFACE; if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; -redo_fib6_lookup_lock: read_lock_bh(&table->tb6_lock); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); @@ -944,51 +1058,52 @@ redo_rt6_select: strict &= ~RT6_LOOKUP_F_REACHABLE; fn = saved_fn; goto redo_rt6_select; - } else { - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - goto out2; } } - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - if (rt->rt6i_flags & RTF_CACHE) - goto out2; + if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); - if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY))) - nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr); - else if (!(rt->dst.flags & DST_HOST)) - nrt = rt6_alloc_clone(rt, &fl6->daddr); - else - goto out2; + rt6_dst_from_metrics_check(rt); + return rt; + } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && + !(rt->rt6i_flags & RTF_GATEWAY))) { + /* Create a RTF_CACHE clone which will not be + * owned by the fib6 tree. It is for the special case where + * the daddr in the skb during the neighbor look-up is different + * from the fl6->daddr used to look-up route here. + */ - ip6_rt_put(rt); - rt = nrt ? : net->ipv6.ip6_null_entry; + struct rt6_info *uncached_rt; - dst_hold(&rt->dst); - if (nrt) { - err = ip6_ins_rt(nrt); - if (!err) - goto out2; - } + dst_use(&rt->dst, jiffies); + read_unlock_bh(&table->tb6_lock); - if (--attempts <= 0) - goto out2; + uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); + dst_release(&rt->dst); - /* - * Race condition! In the gap, when table->tb6_lock was - * released someone could insert this route. Relookup. - */ - ip6_rt_put(rt); - goto redo_fib6_lookup_lock; + if (uncached_rt) + rt6_uncached_list_add(uncached_rt); + else + uncached_rt = net->ipv6.ip6_null_entry; -out2: - rt->dst.lastuse = jiffies; - rt->dst.__use++; + dst_hold(&uncached_rt->dst); + return uncached_rt; - return rt; + } else { + /* Get a percpu copy */ + + struct rt6_info *pcpu_rt; + + rt->dst.lastuse = jiffies; + rt->dst.__use++; + pcpu_rt = rt6_get_pcpu_route(rt); + read_unlock_bh(&table->tb6_lock); + + return pcpu_rt; + } } static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, @@ -1059,7 +1174,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori new = &rt->dst; memset(new + 1, 0, sizeof(*rt) - sizeof(*new)); - rt6_init_peer(rt, net->ipv6.peers); new->__use = 1; new->input = dst_discard; @@ -1093,6 +1207,33 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori * Destination cache support functions */ +static void rt6_dst_from_metrics_check(struct rt6_info *rt) +{ + if (rt->dst.from && + dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from)) + dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); +} + +static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +{ + if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + return NULL; + + if (rt6_check_expired(rt)) + return NULL; + + return &rt->dst; +} + +static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) +{ + if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + return &rt->dst; + else + return NULL; +} + static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct rt6_info *rt; @@ -1103,13 +1244,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) - return NULL; - if (rt6_check_expired(rt)) - return NULL; + rt6_dst_from_metrics_check(rt); - return dst; + if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE)) + return rt6_dst_from_check(rt, cookie); + else + return rt6_check(rt, cookie); } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) @@ -1148,24 +1289,63 @@ static void ip6_link_failure(struct sk_buff *skb) } } -static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, u32 mtu) +static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) +{ + struct net *net = dev_net(rt->dst.dev); + + rt->rt6i_flags |= RTF_MODIFIED; + rt->rt6i_pmtu = mtu; + rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); +} + +static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, + const struct ipv6hdr *iph, u32 mtu) { struct rt6_info *rt6 = (struct rt6_info *)dst; - dst_confirm(dst); - if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { - struct net *net = dev_net(dst->dev); + if (rt6->rt6i_flags & RTF_LOCAL) + return; - rt6->rt6i_flags |= RTF_MODIFIED; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; + dst_confirm(dst); + mtu = max_t(u32, mtu, IPV6_MIN_MTU); + if (mtu >= dst_mtu(dst)) + return; - dst_metric_set(dst, RTAX_MTU, mtu); - rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires); + if (rt6->rt6i_flags & RTF_CACHE) { + rt6_do_update_pmtu(rt6, mtu); + } else { + const struct in6_addr *daddr, *saddr; + struct rt6_info *nrt6; + + if (iph) { + daddr = &iph->daddr; + saddr = &iph->saddr; + } else if (sk) { + daddr = &sk->sk_v6_daddr; + saddr = &inet6_sk(sk)->saddr; + } else { + return; + } + nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); + if (nrt6) { + rt6_do_update_pmtu(nrt6, mtu); + + /* ip6_ins_rt(nrt6) will bump the + * rt6->rt6i_node->fn_sernum + * which will fail the next rt6_check() and + * invalidate the sk->sk_dst_cache. + */ + ip6_ins_rt(nrt6); + } } } +static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, u32 mtu) +{ + __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu); +} + void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark) { @@ -1182,7 +1362,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) - ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu)); + __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); @@ -1341,12 +1521,17 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) static unsigned int ip6_mtu(const struct dst_entry *dst) { + const struct rt6_info *rt = (const struct rt6_info *)dst; + unsigned int mtu = rt->rt6i_pmtu; struct inet6_dev *idev; - unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + goto out; + mtu = IPV6_MIN_MTU; rcu_read_lock(); @@ -1590,10 +1775,8 @@ int ip6_route_add(struct fib6_config *cfg) ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; - if (rt->rt6i_dst.plen == 128) { + if (rt->rt6i_dst.plen == 128) rt->dst.flags |= DST_HOST; - dst_metrics_set_force_overwrite(&rt->dst); - } #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); @@ -1651,6 +1834,16 @@ int ip6_route_add(struct fib6_config *cfg) int gwa_type; gw_addr = &cfg->fc_gateway; + + /* if gw_addr is local we will fail to detect this in case + * address is still TENTATIVE (DAD in progress). rt6_lookup() + * will return already-added prefix route via interface that + * prefix route was assigned to, which might be non-loopback. + */ + err = -EINVAL; + if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0)) + goto out; + rt->rt6i_gateway = *gw_addr; gwa_type = ipv6_addr_type(gw_addr); @@ -1664,7 +1857,6 @@ int ip6_route_add(struct fib6_config *cfg) (SIT, PtP, NBMA NOARP links) it is handy to allow some exceptions. --ANK */ - err = -EINVAL; if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; @@ -1785,6 +1977,9 @@ static int ip6_route_del(struct fib6_config *cfg) if (fn) { for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + if ((rt->rt6i_flags & RTF_CACHE) && + !(cfg->fc_flags & RTF_CACHE)) + continue; if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -1894,7 +2089,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu NEIGH_UPDATE_F_ISROUTER)) ); - nrt = ip6_rt_copy(rt, &msg->dest); + nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); if (!nrt) goto out; @@ -1926,42 +2121,35 @@ out: * Misc support functions */ -static struct rt6_info *ip6_rt_copy(struct rt6_info *ort, - const struct in6_addr *dest) +static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) { - struct net *net = dev_net(ort->dst.dev); - struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0, - ort->rt6i_table); + BUG_ON(from->dst.from); - if (rt) { - rt->dst.input = ort->dst.input; - rt->dst.output = ort->dst.output; - rt->dst.flags |= DST_HOST; - - rt->rt6i_dst.addr = *dest; - rt->rt6i_dst.plen = 128; - dst_copy_metrics(&rt->dst, &ort->dst); - rt->dst.error = ort->dst.error; - rt->rt6i_idev = ort->rt6i_idev; - if (rt->rt6i_idev) - in6_dev_hold(rt->rt6i_idev); - rt->dst.lastuse = jiffies; - - if (ort->rt6i_flags & RTF_GATEWAY) - rt->rt6i_gateway = ort->rt6i_gateway; - else - rt->rt6i_gateway = *dest; - rt->rt6i_flags = ort->rt6i_flags; - rt6_set_from(rt, ort); - rt->rt6i_metric = 0; + rt->rt6i_flags &= ~RTF_EXPIRES; + dst_hold(&from->dst); + rt->dst.from = &from->dst; + dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); +} +static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) +{ + rt->dst.input = ort->dst.input; + rt->dst.output = ort->dst.output; + rt->rt6i_dst = ort->rt6i_dst; + rt->dst.error = ort->dst.error; + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->dst.lastuse = jiffies; + rt->rt6i_gateway = ort->rt6i_gateway; + rt->rt6i_flags = ort->rt6i_flags; + rt6_set_from(rt, ort); + rt->rt6i_metric = ort->rt6i_metric; #ifdef CONFIG_IPV6_SUBTREES - memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); + rt->rt6i_src = ort->rt6i_src; #endif - memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key)); - rt->rt6i_table = ort->rt6i_table; - } - return rt; + rt->rt6i_prefsrc = ort->rt6i_prefsrc; + rt->rt6i_table = ort->rt6i_table; } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -2336,6 +2524,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev) fib6_clean_all(net, fib6_ifdown, &adn); icmp6_clean_all(fib6_ifdown, &adn); + rt6_uncached_list_flush_dev(net, dev); } struct rt6_mtu_change_arg { @@ -2373,11 +2562,20 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) PMTU discouvery. */ if (rt->dst.dev == arg->dev && - !dst_metric_locked(&rt->dst, RTAX_MTU) && - (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6))) { - dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + !dst_metric_locked(&rt->dst, RTAX_MTU)) { + if (rt->rt6i_flags & RTF_CACHE) { + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. + */ + if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) + rt->rt6i_pmtu = arg->mtu; + } else if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); + } } return 0; } @@ -2434,6 +2632,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (rtm->rtm_type == RTN_LOCAL) cfg->fc_flags |= RTF_LOCAL; + if (rtm->rtm_flags & RTM_F_CLONED) + cfg->fc_flags |= RTF_CACHE; + cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); @@ -2608,6 +2809,7 @@ static int rt6_fill_node(struct net *net, int iif, int type, u32 portid, u32 seq, int prefix, int nowait, unsigned int flags) { + u32 metrics[RTAX_MAX]; struct rtmsg *rtm; struct nlmsghdr *nlh; long expires; @@ -2721,7 +2923,10 @@ static int rt6_fill_node(struct net *net, goto nla_put_failure; } - if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); + if (rt->rt6i_pmtu) + metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; + if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (rt->rt6i_flags & RTF_GATEWAY) { @@ -3216,6 +3421,7 @@ static struct notifier_block ip6_route_dev_notifier = { int __init ip6_route_init(void) { int ret; + int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = @@ -3275,6 +3481,13 @@ int __init ip6_route_init(void) if (ret) goto out_register_late_subsys; + for_each_possible_cpu(cpu) { + struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); + + INIT_LIST_HEAD(&ul->head); + spin_lock_init(&ul->lock); + } + out: return ret; |