diff options
author | Artem Bityutskiy <Artem.Bityutskiy@nokia.com> | 2011-03-25 17:41:20 +0200 |
---|---|---|
committer | Artem Bityutskiy <Artem.Bityutskiy@nokia.com> | 2011-03-25 17:41:20 +0200 |
commit | 7bf7e370d5919112c223a269462cd0b546903829 (patch) | |
tree | 03ccc715239df14ae168277dbccc9d9cf4d8a2c8 /net/ipv4 | |
parent | 68b1a1e786f29c900fa1c516a402e24f0ece622a (diff) | |
parent | d39dd11c3e6a7af5c20bfac40594db36cf270f42 (diff) |
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus-1
* 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6: (9356 commits)
[media] rc: update for bitop name changes
fs: simplify iget & friends
fs: pull inode->i_lock up out of writeback_single_inode
fs: rename inode_lock to inode_hash_lock
fs: move i_wb_list out from under inode_lock
fs: move i_sb_list out from under inode_lock
fs: remove inode_lock from iput_final and prune_icache
fs: Lock the inode LRU list separately
fs: factor inode disposal
fs: protect inode->i_state with inode->i_lock
lib, arch: add filter argument to show_mem and fix private implementations
SLUB: Write to per cpu data when allocating it
slub: Fix debugobjects with lockless fastpath
autofs4: Do not potentially dereference NULL pointer returned by fget() in autofs_dev_ioctl_setpipefd()
autofs4 - remove autofs4_lock
autofs4 - fix d_manage() return on rcu-walk
autofs4 - fix autofs4_expire_indirect() traversal
autofs4 - fix dentry leak in autofs4_expire_direct()
autofs4 - reinstate last used update on access
vfs - check non-mountpoint dentry might block in __follow_mount_rcu()
...
NOTE!
This merge commit was created to fix compilation error. The block
tree was merged upstream and removed the 'elv_queue_empty()'
function which the new 'mtdswap' driver is using. So a simple
merge of the mtd tree with upstream does not compile. And the
mtd tree has already be published, so re-basing it is not an option.
To fix this unfortunate situation, I had to merge upstream into the
mtd-2.6.git tree without committing, put the fixup patch on top of
this, and then commit this. The result is that we do not have commits
which do not compile.
In other words, this merge commit "merges" 3 things: the MTD tree, the
upstream tree, and the fixup patch.
Diffstat (limited to 'net/ipv4')
62 files changed, 2130 insertions, 3076 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index a5a1050595d1..cbb505ba9324 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -55,45 +55,9 @@ config IP_ADVANCED_ROUTER If unsure, say N here. -choice - prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" - depends on IP_ADVANCED_ROUTER - default ASK_IP_FIB_HASH - -config ASK_IP_FIB_HASH - bool "FIB_HASH" - ---help--- - Current FIB is very proven and good enough for most users. - -config IP_FIB_TRIE - bool "FIB_TRIE" - ---help--- - Use new experimental LC-trie as FIB lookup algorithm. - This improves lookup performance if you have a large - number of routes. - - LC-trie is a longest matching prefix lookup algorithm which - performs better than FIB_HASH for large routing tables. - But, it consumes more memory and is more complex. - - LC-trie is described in: - - IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson - IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, - June 1999 - - An experimental study of compression methods for dynamic tries - Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - <http://www.csc.kth.se/~snilsson/software/dyntrie2/> - -endchoice - -config IP_FIB_HASH - def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER - config IP_FIB_TRIE_STATS bool "FIB TRIE statistics" - depends on IP_FIB_TRIE + depends on IP_ADVANCED_ROUTER ---help--- Keep track of statistics on structure of FIB TRIE table. Useful for testing and measuring TRIE performance. @@ -140,6 +104,9 @@ config IP_ROUTE_VERBOSE handled by the klogd daemon which is responsible for kernel messages ("man klogd"). +config IP_ROUTE_CLASSID + bool + config IP_PNP bool "IP: kernel level autoconfiguration" help @@ -657,4 +624,3 @@ config TCP_MD5SIG on the Internet. If unsure, say N. - diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4978d22f9a75..0dc772d0d125 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,12 +10,10 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ - fib_frontend.o fib_semantics.o \ + fib_frontend.o fib_semantics.o fib_trie.o \ inet_fragment.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o -obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o -obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f2b61107df6c..807d83c02ef6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -880,6 +880,19 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } EXPORT_SYMBOL(inet_ioctl); +#ifdef CONFIG_COMPAT +int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err = -ENOIOCTLCMD; + + if (sk->sk_prot->compat_ioctl) + err = sk->sk_prot->compat_ioctl(sk, cmd, arg); + + return err; +} +#endif + const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, @@ -903,6 +916,7 @@ const struct proto_ops inet_stream_ops = { #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, + .compat_ioctl = inet_compat_ioctl, #endif }; EXPORT_SYMBOL(inet_stream_ops); @@ -929,6 +943,7 @@ const struct proto_ops inet_dgram_ops = { #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, + .compat_ioctl = inet_compat_ioctl, #endif }; EXPORT_SYMBOL(inet_dgram_ops); @@ -959,6 +974,7 @@ static const struct proto_ops inet_sockraw_ops = { #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, + .compat_ioctl = inet_compat_ioctl, #endif }; @@ -1085,23 +1101,20 @@ int sysctl_ip_dynaddr __read_mostly; static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); - int err; - struct rtable *rt; __be32 old_saddr = inet->inet_saddr; - __be32 new_saddr; __be32 daddr = inet->inet_daddr; + struct rtable *rt; + __be32 new_saddr; if (inet->opt && inet->opt->srr) daddr = inet->opt->faddr; /* Query new route. */ - err = ip_route_connect(&rt, daddr, 0, - RT_CONN_FLAGS(sk), - sk->sk_bound_dev_if, - sk->sk_protocol, - inet->inet_sport, inet->inet_dport, sk, 0); - if (err) - return err; + rt = ip_route_connect(daddr, 0, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, sk->sk_protocol, + inet->inet_sport, inet->inet_dport, sk, false); + if (IS_ERR(rt)) + return PTR_ERR(rt); sk_setup_caps(sk, &rt->dst); @@ -1144,25 +1157,16 @@ int inet_sk_rebuild_header(struct sock *sk) daddr = inet->inet_daddr; if (inet->opt && inet->opt->srr) daddr = inet->opt->faddr; -{ - struct flowi fl = { - .oif = sk->sk_bound_dev_if, - .mark = sk->sk_mark, - .fl4_dst = daddr, - .fl4_src = inet->inet_saddr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet->inet_sport, - .fl_ip_dport = inet->inet_dport, - }; - - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0); -} - if (!err) + rt = ip_route_output_ports(sock_net(sk), sk, daddr, inet->inet_saddr, + inet->inet_dport, inet->inet_sport, + sk->sk_protocol, RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (!IS_ERR(rt)) { + err = 0; sk_setup_caps(sk, &rt->dst); - else { + } else { + err = PTR_ERR(rt); + /* Routing failed... */ sk->sk_route_caps = 0; /* @@ -1215,7 +1219,7 @@ out: return err; } -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct iphdr *iph; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 86961bec70ab..4286fd3cc0e2 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -201,11 +201,14 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->ttl = 0; top_iph->check = 0; - ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + if (x->props.flags & XFRM_STATE_ALIGN4) + ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; + else + ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; - ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, 0, skb->len); @@ -299,9 +302,15 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; - if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && - ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) - goto out; + if (x->props.flags & XFRM_STATE_ALIGN4) { + if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } else { + if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) + goto out; + } if (!pskb_may_pull(skb, ah_hlen)) goto out; @@ -450,8 +459,12 @@ static int ah_init_state(struct xfrm_state *x) BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + - ahp->icv_trunc_len); + if (x->props.flags & XFRM_STATE_ALIGN4) + x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); + else + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 04c8b69fd426..090d273d7865 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -433,14 +433,13 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { - struct flowi fl = { .fl4_dst = sip, - .fl4_src = tip }; struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); - if (ip_route_output_key(net, &rt, &fl) < 0) + rt = ip_route_output(net, sip, tip, 0, 0); + if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); @@ -1017,14 +1016,13 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } - if (__in_dev_get_rcu(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on); + if (__in_dev_get_rtnl(dev)) { + IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); return 0; } return -ENXIO; } -/* must be called with rcu_read_lock() */ static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { @@ -1062,12 +1060,10 @@ static int arp_req_set(struct net *net, struct arpreq *r, if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { - struct flowi fl = { .fl4_dst = ip, - .fl4_tos = RTO_ONLINK }; - struct rtable *rt; - err = ip_route_output_key(net, &rt, &fl); - if (err != 0) - return err; + struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + + if (IS_ERR(rt)) + return PTR_ERR(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) @@ -1178,7 +1174,6 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r, static int arp_req_delete(struct net *net, struct arpreq *r, struct net_device *dev) { - int err; __be32 ip; if (r->arp_flags & ATF_PUBL) @@ -1186,12 +1181,9 @@ static int arp_req_delete(struct net *net, struct arpreq *r, ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (dev == NULL) { - struct flowi fl = { .fl4_dst = ip, - .fl4_tos = RTO_ONLINK }; - struct rtable *rt; - err = ip_route_output_key(net, &rt, &fl); - if (err != 0) - return err; + struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0); + if (IS_ERR(rt)) + return PTR_ERR(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) @@ -1233,10 +1225,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!(r.arp_flags & ATF_NETMASK)) ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = htonl(0xFFFFFFFFUL); - rcu_read_lock(); + rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - dev = dev_get_by_name_rcu(net, r.arp_dev); + dev = __dev_get_by_name(net, r.arp_dev); if (dev == NULL) goto out; @@ -1263,7 +1255,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; } out: - rcu_read_unlock(); + rtnl_unlock(); if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; return err; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 174be6caa5c8..85bd24ca4f6d 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -46,11 +46,12 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!saddr) saddr = inet->mc_addr; } - err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, - RT_CONN_FLAGS(sk), oif, - sk->sk_protocol, - inet->inet_sport, usin->sin_port, sk, 1); - if (err) { + rt = ip_route_connect(usin->sin_addr.s_addr, saddr, + RT_CONN_FLAGS(sk), oif, + sk->sk_protocol, + inet->inet_sport, usin->sin_port, sk, true); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 748cb5b337bd..6d85800daeb7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -51,6 +51,7 @@ #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/slab.h> +#include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -92,6 +93,71 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; +/* inet_addr_hash's shifting is dependent upon this IN4_ADDR_HSIZE + * value. So if you change this define, make appropriate changes to + * inet_addr_hash as well. + */ +#define IN4_ADDR_HSIZE 256 +static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; +static DEFINE_SPINLOCK(inet_addr_hash_lock); + +static inline unsigned int inet_addr_hash(struct net *net, __be32 addr) +{ + u32 val = (__force u32) addr ^ hash_ptr(net, 8); + + return ((val ^ (val >> 8) ^ (val >> 16) ^ (val >> 24)) & + (IN4_ADDR_HSIZE - 1)); +} + +static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) +{ + unsigned int hash = inet_addr_hash(net, ifa->ifa_local); + + spin_lock(&inet_addr_hash_lock); + hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); + spin_unlock(&inet_addr_hash_lock); +} + +static void inet_hash_remove(struct in_ifaddr *ifa) +{ + spin_lock(&inet_addr_hash_lock); + hlist_del_init_rcu(&ifa->hash); + spin_unlock(&inet_addr_hash_lock); +} + +/** + * __ip_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * @devref: if true, take a reference on the found device + * + * If a caller uses devref=false, it should be protected by RCU, or RTNL + */ +struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) +{ + unsigned int hash = inet_addr_hash(net, addr); + struct net_device *result = NULL; + struct in_ifaddr *ifa; + struct hlist_node *node; + + rcu_read_lock(); + hlist_for_each_entry_rcu(ifa, node, &inet_addr_lst[hash], hash) { + struct net_device *dev = ifa->ifa_dev->dev; + + if (!net_eq(dev_net(dev), net)) + continue; + if (ifa->ifa_local == addr) { + result = dev; + break; + } + } + if (result && devref) + dev_hold(result); + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL(__ip_dev_find); + static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); @@ -265,6 +331,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, } if (!do_promote) { + inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, pid); @@ -281,6 +348,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, /* 2. Unlink it */ *ifap = ifa1->ifa_next; + inet_hash_remove(ifa1); /* 3. Announce address deletion */ @@ -368,6 +436,8 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_next = *ifap; *ifap = ifa; + inet_hash_insert(dev_net(in_dev->dev), ifa); + /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ @@ -521,6 +591,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) if (tb[IFA_ADDRESS] == NULL) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; + INIT_HLIST_NODE(&ifa->hash); ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = ifm->ifa_flags; @@ -670,7 +741,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) ifap = &ifa->ifa_next) { if (!strcmp(ifr.ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == - ifa->ifa_address) { + ifa->ifa_local) { break; /* found */ } } @@ -728,6 +799,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { ret = -ENOBUFS; ifa = inet_alloc_ifa(); + INIT_HLIST_NODE(&ifa->hash); if (!ifa) break; if (colon) @@ -1030,6 +1102,21 @@ static inline bool inetdev_valid_mtu(unsigned mtu) return mtu >= 68; } +static void inetdev_send_gratuitous_arp(struct net_device *dev, + struct in_device *in_dev) + +{ + struct in_ifaddr *ifa = in_dev->ifa_list; + + if (!ifa) + return; + + arp_send(ARPOP_REQUEST, ETH_P_ARP, + ifa->ifa_local, dev, + ifa->ifa_local, NULL, + dev->dev_addr, NULL); +} + /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, @@ -1069,6 +1156,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, struct in_ifaddr *ifa = inet_alloc_ifa(); if (ifa) { + INIT_HLIST_NODE(&ifa->hash); ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; @@ -1082,18 +1170,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, } ip_mc_up(in_dev); /* fall through */ - case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEADDR: + if (!IN_DEV_ARP_NOTIFY(in_dev)) + break; + /* fall through */ + case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ - if (IN_DEV_ARP_NOTIFY(in_dev)) { - struct in_ifaddr *ifa = in_dev->ifa_list; - - if (ifa) - arp_send(ARPOP_REQUEST, ETH_P_ARP, - ifa->ifa_address, dev, - ifa->ifa_address, NULL, - dev->dev_addr, NULL); - } + inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); @@ -1710,6 +1793,11 @@ static struct rtnl_af_ops inet_af_ops = { void __init devinet_init(void) { + int i; + + for (i = 0; i < IN4_ADDR_HSIZE; i++) + INIT_HLIST_HEAD(&inet_addr_lst[i]); + register_pernet_subsys(&devinet_ops); register_gifconf(PF_INET, inet_gifconf); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index e42a905180f0..03f994bcf7de 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -33,11 +33,14 @@ static u32 esp4_get_mtu(struct xfrm_state *x, int mtu); * * TODO: Use spare space in skb for this where possible. */ -static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) +static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqhilen) { unsigned int len; - len = crypto_aead_ivsize(aead); + len = seqhilen; + + len += crypto_aead_ivsize(aead); + if (len) { len += crypto_aead_alignmask(aead) & ~(crypto_tfm_ctx_alignment() - 1); @@ -52,10 +55,15 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags) return kmalloc(len, GFP_ATOMIC); } -static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp) +static inline __be32 *esp_tmp_seqhi(void *tmp) +{ + return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); +} +static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) { return crypto_aead_ivsize(aead) ? - PTR_ALIGN((u8 *)tmp, crypto_aead_alignmask(aead) + 1) : tmp; + PTR_ALIGN((u8 *)tmp + seqhilen, + crypto_aead_alignmask(aead) + 1) : tmp + seqhilen; } static inline struct aead_givcrypt_request *esp_tmp_givreq( @@ -122,6 +130,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) int plen; int tfclen; int nfrags; + int assoclen; + int sglists; + int seqhilen; + __be32 *seqhi; /* skb is pure payload to encrypt */ @@ -151,14 +163,25 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) goto error; nfrags = err; - tmp = esp_alloc_tmp(aead, nfrags + 1); + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); if (!tmp) goto error; - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_givreq(aead, iv); asg = esp_givreq_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; /* Fill padding... */ tail = skb_tail_pointer(trailer); @@ -215,19 +238,27 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) } esph->spi = x->id.spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, esph->enc_data + crypto_aead_ivsize(aead) - skb->data, clen + alen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_givcrypt_set_callback(req, 0, esp_output_done, skb); aead_givcrypt_set_crypt(req, sg, sg, clen, iv); - aead_givcrypt_set_assoc(req, asg, sizeof(*esph)); + aead_givcrypt_set_assoc(req, asg, assoclen); aead_givcrypt_set_giv(req, esph->enc_data, - XFRM_SKB_CB(skb)->seq.output); + XFRM_SKB_CB(skb)->seq.output.low); ESP_SKB_CB(skb)->tmp = tmp; err = crypto_aead_givencrypt(req); @@ -346,6 +377,10 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) struct sk_buff *trailer; int elen = skb->len - sizeof(*esph) - crypto_aead_ivsize(aead); int nfrags; + int assoclen; + int sglists; + int seqhilen; + __be32 *seqhi; void *tmp; u8 *iv; struct scatterlist *sg; @@ -362,16 +397,27 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) goto out; nfrags = err; + assoclen = sizeof(*esph); + sglists = 1; + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + sglists += 2; + seqhilen += sizeof(__be32); + assoclen += seqhilen; + } + err = -ENOMEM; - tmp = esp_alloc_tmp(aead, nfrags + 1); + tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); if (!tmp) goto out; ESP_SKB_CB(skb)->tmp = tmp; - iv = esp_tmp_iv(aead, tmp); + seqhi = esp_tmp_seqhi(tmp); + iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); asg = esp_req_sg(aead, req); - sg = asg + 1; + sg = asg + sglists; skb->ip_summed = CHECKSUM_NONE; @@ -382,11 +428,19 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) sg_init_table(sg, nfrags); skb_to_sgvec(skb, sg, sizeof(*esph) + crypto_aead_ivsize(aead), elen); - sg_init_one(asg, esph, sizeof(*esph)); + + if ((x->props.flags & XFRM_STATE_ESN)) { + sg_init_table(asg, 3); + sg_set_buf(asg, &esph->spi, sizeof(__be32)); + *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; + sg_set_buf(asg + 1, seqhi, seqhilen); + sg_set_buf(asg + 2, &esph->seq_no, sizeof(__be32)); + } else + sg_init_one(asg, esph, sizeof(*esph)); aead_request_set_callback(req, 0, esp_input_done, skb); aead_request_set_crypt(req, sg, sg, elen, iv); - aead_request_set_assoc(req, asg, sizeof(*esph)); + aead_request_set_assoc(req, asg, assoclen); err = crypto_aead_decrypt(req); if (err == -EINPROGRESS) @@ -500,10 +554,20 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; err = -ENAMETOOLONG; - if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "authenc(%s,%s)", - x->aalg ? x->aalg->alg_name : "digest_null", - x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) - goto error; + + if ((x->props.flags & XFRM_STATE_ESN)) { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authencesn(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } else { + if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, + "authenc(%s,%s)", + x->aalg ? x->aalg->alg_name : "digest_null", + x->ealg->alg_name) >= CRYPTO_MAX_ALG_NAME) + goto error; + } aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 1d2cdd43a878..a373a259253c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -51,11 +51,11 @@ static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; - local_table = fib_hash_table(RT_TABLE_LOCAL); + local_table = fib_trie_table(RT_TABLE_LOCAL); if (local_table == NULL) return -ENOMEM; - main_table = fib_hash_table(RT_TABLE_MAIN); + main_table = fib_trie_table(RT_TABLE_MAIN); if (main_table == NULL) goto fail; @@ -82,7 +82,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) if (tb) return tb; - tb = fib_hash_table(id); + tb = fib_trie_table(id); if (!tb) return NULL; h = id & (FIB_TABLE_HASHSZ - 1); @@ -114,21 +114,6 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } #endif /* CONFIG_IP_MULTIPLE_TABLES */ -void fib_select_default(struct net *net, - const struct flowi *flp, struct fib_result *res) -{ - struct fib_table *tb; - int table = RT_TABLE_MAIN; -#ifdef CONFIG_IP_MULTIPLE_TABLES - if (res->r == NULL || res->r->action != FR_ACT_TO_TBL) - return; - table = res->r->table; -#endif - tb = fib_get_table(net, table); - if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - fib_table_select_default(tb, flp, res); -} - static void fib_flush(struct net *net) { int flushed = 0; @@ -147,46 +132,6 @@ static void fib_flush(struct net *net) rt_cache_flush(net, -1); } -/** - * __ip_dev_find - find the first device with a given source address. - * @net: the net namespace - * @addr: the source address - * @devref: if true, take a reference on the found device - * - * If a caller uses devref=false, it should be protected by RCU, or RTNL - */ -struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) -{ - struct flowi fl = { - .fl4_dst = addr, - }; - struct fib_result res = { 0 }; - struct net_device *dev = NULL; - struct fib_table *local_table; - -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - - rcu_read_lock(); - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (!local_table || - fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { - rcu_read_unlock(); - return NULL; - } - if (res.type != RTN_LOCAL) - goto out; - dev = FIB_RES_DEV(res); - - if (dev && devref) - dev_hold(dev); -out: - rcu_read_unlock(); - return dev; -} -EXPORT_SYMBOL(__ip_dev_find); - /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. @@ -195,7 +140,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - struct flowi fl = { .fl4_dst = addr }; + struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned ret = RTN_BROADCAST; struct fib_table *local_table; @@ -213,7 +158,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, if (local_table) { ret = RTN_UNICAST; rcu_read_lock(); - if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { + if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; } @@ -248,19 +193,21 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, u32 *itag, u32 mark) { struct in_device *in_dev; - struct flowi fl = { - .fl4_dst = src, - .fl4_src = dst, - .fl4_tos = tos, - .mark = mark, - .iif = oif - }; + struct flowi4 fl4; struct fib_result res; int no_addr, rpf, accept_local; bool dev_match; int ret; struct net *net; + fl4.flowi4_oif = 0; + fl4.flowi4_iif = oif; + fl4.flowi4_mark = mark; + fl4.daddr = src; + fl4.saddr = dst; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + no_addr = rpf = accept_local = 0; in_dev = __in_dev_get_rcu(dev); if (in_dev) { @@ -268,14 +215,14 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, rpf = IN_DEV_RPFILTER(in_dev); accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); if (mark && !IN_DEV_SRC_VMARK(in_dev)) - fl.mark = 0; + fl4.flowi4_mark = 0; } if (in_dev == NULL) goto e_inval; net = dev_net(dev); - if (fib_lookup(net, &fl, &res)) + if (fib_lookup(net, &fl4, &res)) goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL || !accept_local) @@ -306,10 +253,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, goto last_resort; if (rpf == 1) goto e_rpf; - fl.oif = dev->ifindex; + fl4.flowi4_oif = dev->ifindex; ret = 0; - if (fib_lookup(net, &fl, &res) == 0) { + if (fib_lookup(net, &fl4, &res) == 0) { if (res.type == RTN_UNICAST) { *spec_dst = FIB_RES_PREFSRC(res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; @@ -849,11 +796,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) { struct fib_result res; - struct flowi fl = { - .mark = frn->fl_mark, - .fl4_dst = frn->fl_addr, - .fl4_tos = frn->fl_tos, - .fl4_scope = frn->fl_scope, + struct flowi4 fl4 = { + .flowi4_mark = frn->fl_mark, + .daddr = frn->fl_addr, + .flowi4_tos = frn->fl_tos, + .flowi4_scope = frn->fl_scope, }; #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -866,7 +813,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) frn->tb_id = tb->tb_id; rcu_read_lock(); - frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF); + frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; @@ -945,10 +892,12 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif + fib_update_nh_saddrs(dev); rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: fib_del_ifaddr(ifa); + fib_update_nh_saddrs(dev); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. * Disable IP. @@ -1101,5 +1050,5 @@ void __init ip_fib_init(void) register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - fib_hash_init(); + fib_trie_init(); } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c deleted file mode 100644 index b3acb0417b21..000000000000 --- a/net/ipv4/fib_hash.c +++ /dev/null @@ -1,1133 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IPv4 FIB: lookup engine and maintenance routines. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/errno.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/inetdevice.h> -#include <linux/netdevice.h> -#include <linux/if_arp.h> -#include <linux/proc_fs.h> -#include <linux/skbuff.h> -#include <linux/netlink.h> -#include <linux/init.h> -#include <linux/slab.h> - -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/tcp.h> -#include <net/sock.h> -#include <net/ip_fib.h> - -#include "fib_lookup.h" - -static struct kmem_cache *fn_hash_kmem __read_mostly; -static struct kmem_cache *fn_alias_kmem __read_mostly; - -struct fib_node { - struct hlist_node fn_hash; - struct list_head fn_alias; - __be32 fn_key; - struct fib_alias fn_embedded_alias; -}; - -#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) - -struct fn_zone { - struct fn_zone __rcu *fz_next; /* Next not empty zone */ - struct hlist_head __rcu *fz_hash; /* Hash table pointer */ - seqlock_t fz_lock; - u32 fz_hashmask; /* (fz_divisor - 1) */ - - u8 fz_order; /* Zone order (0..32) */ - u8 fz_revorder; /* 32 - fz_order */ - __be32 fz_mask; /* inet_make_mask(order) */ -#define FZ_MASK(fz) ((fz)->fz_mask) - - struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE]; - - int fz_nent; /* Number of entries */ - int fz_divisor; /* Hash size (mask+1) */ -}; - -struct fn_hash { - struct fn_zone *fn_zones[33]; - struct fn_zone __rcu *fn_zone_list; -}; - -static inline u32 fn_hash(__be32 key, struct fn_zone *fz) -{ - u32 h = ntohl(key) >> fz->fz_revorder; - h ^= (h>>20); - h ^= (h>>10); - h ^= (h>>5); - h &= fz->fz_hashmask; - return h; -} - -static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) -{ - return dst & FZ_MASK(fz); -} - -static unsigned int fib_hash_genid; - -#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) - -static struct hlist_head *fz_hash_alloc(int divisor) -{ - unsigned long size = divisor * sizeof(struct hlist_head); - - if (size <= PAGE_SIZE) - return kzalloc(size, GFP_KERNEL); - - return (struct hlist_head *) - __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); -} - -/* The fib hash lock must be held when this is called. */ -static inline void fn_rebuild_zone(struct fn_zone *fz, - struct hlist_head *old_ht, - int old_divisor) -{ - int i; - - for (i = 0; i < old_divisor; i++) { - struct hlist_node *node, *n; - struct fib_node *f; - - hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { - struct hlist_head *new_head; - - hlist_del_rcu(&f->fn_hash); - - new_head = rcu_dereference_protected(fz->fz_hash, 1) + - fn_hash(f->fn_key, fz); - hlist_add_head_rcu(&f->fn_hash, new_head); - } - } -} - -static void fz_hash_free(struct hlist_head *hash, int divisor) -{ - unsigned long size = divisor * sizeof(struct hlist_head); - - if (size <= PAGE_SIZE) - kfree(hash); - else - free_pages((unsigned long)hash, get_order(size)); -} - -static void fn_rehash_zone(struct fn_zone *fz) -{ - struct hlist_head *ht, *old_ht; - int old_divisor, new_divisor; - u32 new_hashmask; - - new_divisor = old_divisor = fz->fz_divisor; - - switch (old_divisor) { - case EMBEDDED_HASH_SIZE: - new_divisor *= EMBEDDED_HASH_SIZE; - break; - case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE: - new_divisor *= (EMBEDDED_HASH_SIZE/2); - break; - default: - if ((old_divisor << 1) > FZ_MAX_DIVISOR) { - printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); - return; - } - new_divisor = (old_divisor << 1); - break; - } - - new_hashmask = (new_divisor - 1); - -#if RT_CACHE_DEBUG >= 2 - printk(KERN_DEBUG "fn_rehash_zone: hash for zone %d grows from %d\n", - fz->fz_order, old_divisor); -#endif - - ht = fz_hash_alloc(new_divisor); - - if (ht) { - struct fn_zone nfz; - - memcpy(&nfz, fz, sizeof(nfz)); - - write_seqlock_bh(&fz->fz_lock); - old_ht = rcu_dereference_protected(fz->fz_hash, 1); - RCU_INIT_POINTER(nfz.fz_hash, ht); - nfz.fz_hashmask = new_hashmask; - nfz.fz_divisor = new_divisor; - fn_rebuild_zone(&nfz, old_ht, old_divisor); - fib_hash_genid++; - rcu_assign_pointer(fz->fz_hash, ht); - fz->fz_hashmask = new_hashmask; - fz->fz_divisor = new_divisor; - write_sequnlock_bh(&fz->fz_lock); - - if (old_ht != fz->fz_embedded_hash) { - synchronize_rcu(); - fz_hash_free(old_ht, old_divisor); - } - } -} - -static void fn_free_node_rcu(struct rcu_head *head) -{ - struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu); - - kmem_cache_free(fn_hash_kmem, f); -} - -static inline void fn_free_node(struct fib_node *f) -{ - call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu); -} - -static void fn_free_alias_rcu(struct rcu_head *head) -{ - struct fib_alias *fa = container_of(head, struct fib_alias, rcu); - - kmem_cache_free(fn_alias_kmem, fa); -} - -static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) -{ - fib_release_info(fa->fa_info); - if (fa == &f->fn_embedded_alias) - fa->fa_info = NULL; - else - call_rcu(&fa->rcu, fn_free_alias_rcu); -} - -static struct fn_zone * -fn_new_zone(struct fn_hash *table, int z) -{ - int i; - struct fn_zone *fz = kzalloc(sizeof(struct fn_zone), GFP_KERNEL); - if (!fz) - return NULL; - - seqlock_init(&fz->fz_lock); - fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; - fz->fz_hashmask = fz->fz_divisor - 1; - RCU_INIT_POINTER(fz->fz_hash, fz->fz_embedded_hash); - fz->fz_order = z; - fz->fz_revorder = 32 - z; - fz->fz_mask = inet_make_mask(z); - - /* Find the first not empty zone with more specific mask */ - for (i = z + 1; i <= 32; i++) - if (table->fn_zones[i]) - break; - if (i > 32) { - /* No more specific masks, we are the first. */ - rcu_assign_pointer(fz->fz_next, - rtnl_dereference(table->fn_zone_list)); - rcu_assign_pointer(table->fn_zone_list, fz); - } else { - rcu_assign_pointer(fz->fz_next, - rtnl_dereference(table->fn_zones[i]->fz_next)); - rcu_assign_pointer(table->fn_zones[i]->fz_next, fz); - } - table->fn_zones[z] = fz; - fib_hash_genid++; - return fz; -} - -int fib_table_lookup(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res, - int fib_flags) -{ - int err; - struct fn_zone *fz; - struct fn_hash *t = (struct fn_hash *)tb->tb_data; - - rcu_read_lock(); - for (fz = rcu_dereference(t->fn_zone_list); - fz != NULL; - fz = rcu_dereference(fz->fz_next)) { - struct hlist_head *head; - struct hlist_node *node; - struct fib_node *f; - __be32 k; - unsigned int seq; - - do { - seq = read_seqbegin(&fz->fz_lock); - k = fz_key(flp->fl4_dst, fz); - - head = rcu_dereference(fz->fz_hash) + fn_hash(k, fz); - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - if (f->fn_key != k) - continue; - - err = fib_semantic_match(&f->fn_alias, - flp, res, - fz->fz_order, fib_flags); - if (err <= 0) - goto out; - } - } while (read_seqretry(&fz->fz_lock, seq)); - } - err = 1; -out: - rcu_read_unlock(); - return err; -} - -void fib_table_select_default(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res) -{ - int order, last_idx; - struct hlist_node *node; - struct fib_node *f; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fn_hash *t = (struct fn_hash *)tb->tb_data; - struct fn_zone *fz = t->fn_zones[0]; - struct hlist_head *head; - - if (fz == NULL) - return; - - last_idx = -1; - last_resort = NULL; - order = -1; - - rcu_read_lock(); - head = rcu_dereference(fz->fz_hash); - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { - struct fib_info *next_fi = fa->fa_info; - - if (fa->fa_scope != res->scope || - fa->fa_type != RTN_UNICAST) - continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; - - fib_alias_accessed(fa); - - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - fi = next_fi; - order++; - } - } - - if (order <= 0 || fi == NULL) { - tb->tb_default = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - - if (last_idx >= 0) - fib_result_assign(res, last_resort); - tb->tb_default = last_idx; -out: - rcu_read_unlock(); -} - -/* Insert node F to FZ. */ -static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(f->fn_key, fz); - - hlist_add_head_rcu(&f->fn_hash, head); -} - -/* Return the node in FZ matching KEY. */ -static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + fn_hash(key, fz); - struct hlist_node *node; - struct fib_node *f; - - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - if (f->fn_key == key) - return f; - } - - return NULL; -} - - -static struct fib_alias *fib_fast_alloc(struct fib_node *f) -{ - struct fib_alias *fa = &f->fn_embedded_alias; - - if (fa->fa_info != NULL) - fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - return fa; -} - -/* Caller must hold RTNL. */ -int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fib_node *new_f = NULL; - struct fib_node *f; - struct fib_alias *fa, *new_fa; - struct fn_zone *fz; - struct fib_info *fi; - u8 tos = cfg->fc_tos; - __be32 key; - int err; - - if (cfg->fc_dst_len > 32) - return -EINVAL; - - fz = table->fn_zones[cfg->fc_dst_len]; - if (!fz && !(fz = fn_new_zone(table, cfg->fc_dst_len))) - return -ENOBUFS; - - key = 0; - if (cfg->fc_dst) { - if (cfg->fc_dst & ~FZ_MASK(fz)) - return -EINVAL; - key = fz_key(cfg->fc_dst, fz); - } - - fi = fib_create_info(cfg); - if (IS_ERR(fi)) - return PTR_ERR(fi); - - if (fz->fz_nent > (fz->fz_divisor<<1) && - fz->fz_divisor < FZ_MAX_DIVISOR && - (cfg->fc_dst_len == 32 || - (1 << cfg->fc_dst_len) > fz->fz_divisor)) - fn_rehash_zone(fz); - - f = fib_find_node(fz, key); - - if (!f) - fa = NULL; - else - fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority); - - /* Now fa, if non-NULL, points to the first fib alias - * with the same keys [prefix,tos,priority], if such key already - * exists or to the node before which we will insert new one. - * - * If fa is NULL, we will need to allocate a new one and - * insert to the head of f. - * - * If f is NULL, no fib node matched the destination key - * and we need to allocate a new one of those as well. - */ - - if (fa && fa->fa_tos == tos && - fa->fa_info->fib_priority == fi->fib_priority) { - struct fib_alias *fa_first, *fa_match; - - err = -EEXIST; - if (cfg->fc_nlflags & NLM_F_EXCL) - goto out; - - /* We have 2 goals: - * 1. Find exact match for type, scope, fib_info to avoid - * duplicate routes - * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it - */ - fa_match = NULL; - fa_first = fa; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { - if (fa->fa_tos != tos) - break; - if (fa->fa_info->fib_priority != fi->fib_priority) - break; - if (fa->fa_type == cfg->fc_type && - fa->fa_scope == cfg->fc_scope && - fa->fa_info == fi) { - fa_match = fa; - break; - } - } - - if (cfg->fc_nlflags & NLM_F_REPLACE) { - u8 state; - - fa = fa_first; - if (fa_match) { - if (fa == fa_match) - err = 0; - goto out; - } - err = -ENOBUFS; - new_fa = fib_fast_alloc(f); - if (new_fa == NULL) - goto out; - - new_fa->fa_tos = fa->fa_tos; - new_fa->fa_info = fi; - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - state = fa->fa_state; - new_fa->fa_state = state & ~FA_S_ACCESSED; - fib_hash_genid++; - list_replace_rcu(&fa->fa_list, &new_fa->fa_list); - - fn_free_alias(fa, f); - if (state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, - tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); - return 0; - } - - /* Error if we find a perfect match which - * uses the same scope, type, and nexthop - * information. - */ - if (fa_match) - goto out; - - if (!(cfg->fc_nlflags & NLM_F_APPEND)) - fa = fa_first; - } - - err = -ENOENT; - if (!(cfg->fc_nlflags & NLM_F_CREATE)) - goto out; - - err = -ENOBUFS; - - if (!f) { - new_f = kmem_cache_zalloc(fn_hash_kmem, GFP_KERNEL); - if (new_f == NULL) - goto out; - - INIT_HLIST_NODE(&new_f->fn_hash); - INIT_LIST_HEAD(&new_f->fn_alias); - new_f->fn_key = key; - f = new_f; - } - - new_fa = fib_fast_alloc(f); - if (new_fa == NULL) - goto out; - - new_fa->fa_info = fi; - new_fa->fa_tos = tos; - new_fa->fa_type = cfg->fc_type; - new_fa->fa_scope = cfg->fc_scope; - new_fa->fa_state = 0; - - /* - * Insert new entry to the list. - */ - - if (new_f) - fib_insert_node(fz, new_f); - list_add_tail_rcu(&new_fa->fa_list, - (fa ? &fa->fa_list : &f->fn_alias)); - fib_hash_genid++; - - if (new_f) - fz->fz_nent++; - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - - rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, - &cfg->fc_nlinfo, 0); - return 0; - -out: - if (new_f) - kmem_cache_free(fn_hash_kmem, new_f); - fib_release_info(fi); - return err; -} - -int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) -{ - struct fn_hash *table = (struct fn_hash *)tb->tb_data; - struct fib_node *f; - struct fib_alias *fa, *fa_to_delete; - struct fn_zone *fz; - __be32 key; - - if (cfg->fc_dst_len > 32) - return -EINVAL; - - if ((fz = table->fn_zones[cfg->fc_dst_len]) == NULL) - return -ESRCH; - - key = 0; - if (cfg->fc_dst) { - if (cfg->fc_dst & ~FZ_MASK(fz)) - return -EINVAL; - key = fz_key(cfg->fc_dst, fz); - } - - f = fib_find_node(fz, key); - - if (!f) - fa = NULL; - else - fa = fib_find_alias(&f->fn_alias, cfg->fc_tos, 0); - if (!fa) - return -ESRCH; - - fa_to_delete = NULL; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { - struct fib_info *fi = fa->fa_info; - - if (fa->fa_tos != cfg->fc_tos) - break; - - if ((!cfg->fc_type || - fa->fa_type == cfg->fc_type) && - (cfg->fc_scope == RT_SCOPE_NOWHERE || - fa->fa_scope == cfg->fc_scope) && - (!cfg->fc_protocol || - fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi) == 0) { - fa_to_delete = fa; - break; - } - } - - if (fa_to_delete) { - int kill_fn; - - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, key, fa, cfg->fc_dst_len, - tb->tb_id, &cfg->fc_nlinfo, 0); - - kill_fn = 0; - list_del_rcu(&fa->fa_list); - if (list_empty(&f->fn_alias)) { - hlist_del_rcu(&f->fn_hash); - kill_fn = 1; - } - fib_hash_genid++; - - if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - fn_free_alias(fa, f); - if (kill_fn) { - fn_free_node(f); - fz->fz_nent--; - } - - return 0; - } - return -ESRCH; -} - -static int fn_flush_list(struct fn_zone *fz, int idx) -{ - struct hlist_head *head = rtnl_dereference(fz->fz_hash) + idx; - struct hlist_node *node, *n; - struct fib_node *f; - int found = 0; - - hlist_for_each_entry_safe(f, node, n, head, fn_hash) { - struct fib_alias *fa, *fa_node; - int kill_f; - - kill_f = 0; - list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { - struct fib_info *fi = fa->fa_info; - - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - list_del_rcu(&fa->fa_list); - if (list_empty(&f->fn_alias)) { - hlist_del_rcu(&f->fn_hash); - kill_f = 1; - } - fib_hash_genid++; - - fn_free_alias(fa, f); - found++; - } - } - if (kill_f) { - fn_free_node(f); - fz->fz_nent--; - } - } - return found; -} - -/* caller must hold RTNL. */ -int fib_table_flush(struct fib_table *tb) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fn_zone *fz; - int found = 0; - - for (fz = rtnl_dereference(table->fn_zone_list); - fz != NULL; - fz = rtnl_dereference(fz->fz_next)) { - int i; - - for (i = fz->fz_divisor - 1; i >= 0; i--) - found += fn_flush_list(fz, i); - } - return found; -} - -void fib_free_table(struct fib_table *tb) -{ - struct fn_hash *table = (struct fn_hash *) tb->tb_data; - struct fn_zone *fz, *next; - - next = table->fn_zone_list; - while (next != NULL) { - fz = next; - next = fz->fz_next; - - if (fz->fz_hash != fz->fz_embedded_hash) - fz_hash_free(fz->fz_hash, fz->fz_divisor); - - kfree(fz); - } - - kfree(tb); -} - -static inline int -fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, - struct fib_table *tb, - struct fn_zone *fz, - struct hlist_head *head) -{ - struct hlist_node *node; - struct fib_node *f; - int i, s_i; - - s_i = cb->args[4]; - i = 0; - hlist_for_each_entry_rcu(f, node, head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { - if (i < s_i) - goto next; - - if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, - fa->fa_type, - fa->fa_scope, - f->fn_key, - fz->fz_order, - fa->fa_tos, - fa->fa_info, - NLM_F_MULTI) < 0) { - cb->args[4] = i; - return -1; - } -next: - i++; - } - } - cb->args[4] = i; - return skb->len; -} - -static inline int -fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, - struct fib_table *tb, - struct fn_zone *fz) -{ - int h, s_h; - struct hlist_head *head = rcu_dereference(fz->fz_hash); - - if (head == NULL) - return skb->len; - s_h = cb->args[3]; - for (h = s_h; h < fz->fz_divisor; h++) { - if (hlist_empty(head + h)) - continue; - if (fn_hash_dump_bucket(skb, cb, tb, fz, head + h) < 0) { - cb->args[3] = h; - return -1; - } - memset(&cb->args[4], 0, - sizeof(cb->args) - 4*sizeof(cb->args[0])); - } - cb->args[3] = h; - return skb->len; -} - -int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) -{ - int m = 0, s_m; - struct fn_zone *fz; - struct fn_hash *table = (struct fn_hash *)tb->tb_data; - - s_m = cb->args[2]; - rcu_read_lock(); - for (fz = rcu_dereference(table->fn_zone_list); - fz != NULL; - fz = rcu_dereference(fz->fz_next), m++) { - if (m < s_m) - continue; - if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { - cb->args[2] = m; - rcu_read_unlock(); - return -1; - } - memset(&cb->args[3], 0, - sizeof(cb->args) - 3*sizeof(cb->args[0])); - } - rcu_read_unlock(); - cb->args[2] = m; - return skb->len; -} - -void __init fib_hash_init(void) -{ - fn_hash_kmem = kmem_cache_create("ip_fib_hash", sizeof(struct fib_node), - 0, SLAB_PANIC, NULL); - - fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); - -} - -struct fib_table *fib_hash_table(u32 id) -{ - struct fib_table *tb; - - tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), - GFP_KERNEL); - if (tb == NULL) - return NULL; - - tb->tb_id = id; - tb->tb_default = -1; - - memset(tb->tb_data, 0, sizeof(struct fn_hash)); - return tb; -} - -/* ------------------------------------------------------------------------ */ -#ifdef CONFIG_PROC_FS - -struct fib_iter_state { - struct seq_net_private p; - struct fn_zone *zone; - int bucket; - struct hlist_head *hash_head; - struct fib_node *fn; - struct fib_alias *fa; - loff_t pos; - unsigned int genid; - int valid; -}; - -static struct fib_alias *fib_get_first(struct seq_file *seq) -{ - struct fib_iter_state *iter = seq->private; - struct fib_table *main_table; - struct fn_hash *table; - - main_table = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); - table = (struct fn_hash *)main_table->tb_data; - - iter->bucket = 0; - iter->hash_head = NULL; - iter->fn = NULL; - iter->fa = NULL; - iter->pos = 0; - iter->genid = fib_hash_genid; - iter->valid = 1; - - for (iter->zone = rcu_dereference(table->fn_zone_list); - iter->zone != NULL; - iter->zone = rcu_dereference(iter->zone->fz_next)) { - int maxslot; - - if (!iter->zone->fz_nent) - continue; - - iter->hash_head = rcu_dereference(iter->zone->fz_hash); - maxslot = iter->zone->fz_divisor; - - for (iter->bucket = 0; iter->bucket < maxslot; - ++iter->bucket, ++iter->hash_head) { - struct hlist_node *node; - struct fib_node *fn; - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - struct fib_alias *fa; - - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } - } -out: - return iter->fa; -} - -static struct fib_alias *fib_get_next(struct seq_file *seq) -{ - struct fib_iter_state *iter = seq->private; - struct fib_node *fn; - struct fib_alias *fa; - - /* Advance FA, if any. */ - fn = iter->fn; - fa = iter->fa; - if (fa) { - BUG_ON(!fn); - list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) { - iter->fa = fa; - goto out; - } - } - - fa = iter->fa = NULL; - - /* Advance FN. */ - if (fn) { - struct hlist_node *node = &fn->fn_hash; - hlist_for_each_entry_continue(fn, node, fn_hash) { - iter->fn = fn; - - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fa = fa; - goto out; - } - } - } - - fn = iter->fn = NULL; - - /* Advance hash chain. */ - if (!iter->zone) - goto out; - - for (;;) { - struct hlist_node *node; - int maxslot; - - maxslot = iter->zone->fz_divisor; - - while (++iter->bucket < maxslot) { - iter->hash_head++; - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } - - iter->zone = rcu_dereference(iter->zone->fz_next); - - if (!iter->zone) - goto out; - - iter->bucket = 0; - iter->hash_head = rcu_dereference(iter->zone->fz_hash); - - hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { - list_for_each_entry(fa, &fn->fn_alias, fa_list) { - iter->fn = fn; - iter->fa = fa; - goto out; - } - } - } -out: - iter->pos++; - return fa; -} - -static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) -{ - struct fib_iter_state *iter = seq->private; - struct fib_alias *fa; - - if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) { - fa = iter->fa; - pos -= iter->pos; - } else - fa = fib_get_first(seq); - - if (fa) - while (pos && (fa = fib_get_next(seq))) - --pos; - return pos ? NULL : fa; -} - -static void *fib_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) -{ - void *v = NULL; - - rcu_read_lock(); - if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) - v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - return v; -} - -static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq); -} - -static void fib_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) -{ - rcu_read_unlock(); -} - -static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) -{ - static const unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, - [8] = RTF_REJECT, - }; - unsigned flags = type2flags[type]; - - if (fi && fi->fib_nh->nh_gw) - flags |= RTF_GATEWAY; - if (mask == htonl(0xFFFFFFFF)) - flags |= RTF_HOST; - flags |= RTF_UP; - return flags; -} - -/* - * This outputs /proc/net/route. - * - * It always works in backward compatibility mode. - * The format of the file is not supposed to be changed. - */ -static int fib_seq_show(struct seq_file *seq, void *v) -{ - struct fib_iter_state *iter; - int len; - __be32 prefix, mask; - unsigned flags; - struct fib_node *f; - struct fib_alias *fa; - struct fib_info *fi; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " - "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" - "\tWindow\tIRTT"); - goto out; - } - - iter = seq->private; - f = iter->fn; - fa = iter->fa; - fi = fa->fa_info; - prefix = f->fn_key; - mask = FZ_MASK(iter->zone); - flags = fib_flag_trans(fa->fa_type, mask, fi); - if (fi) - seq_printf(seq, - "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", - fi->fib_dev ? fi->fib_dev->name : "*", prefix, - fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, - mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), - fi->fib_window, - fi->fib_rtt >> 3, &len); - else - seq_printf(seq, - "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u%n", - prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0, &len); - - seq_printf(seq, "%*s\n", 127 - len, ""); -out: - return 0; -} - -static const struct seq_operations fib_seq_ops = { - .start = fib_seq_start, - .next = fib_seq_next, - .stop = fib_seq_stop, - .show = fib_seq_show, -}; - -static int fib_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &fib_seq_ops, - sizeof(struct fib_iter_state)); -} - -static const struct file_operations fib_seq_fops = { - .owner = THIS_MODULE, - .open = fib_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - -int __net_init fib_proc_init(struct net *net) -{ - if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_seq_fops)) - return -ENOMEM; - return 0; -} - -void __net_exit fib_proc_exit(struct net *net) -{ - proc_net_remove(net, "route"); -} -#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index c079cc0ec651..4ec323875a02 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -25,9 +25,6 @@ static inline void fib_alias_accessed(struct fib_alias *fa) } /* Exported by fib_semantics.c */ -extern int fib_semantic_match(struct list_head *head, - const struct flowi *flp, - struct fib_result *res, int prefixlen, int fib_flags); extern void fib_release_info(struct fib_info *); extern struct fib_info *fib_create_info(struct fib_config *cfg); extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); @@ -51,4 +48,11 @@ static inline void fib_result_assign(struct fib_result *res, res->fi = fi; } +struct fib_prop { + int error; + u8 scope; +}; + +extern const struct fib_prop fib_props[RTN_MAX + 1]; + #endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 7981a24f5c7b..a53bb1b5b118 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -41,19 +41,19 @@ struct fib4_rule { __be32 srcmask; __be32 dst; __be32 dstmask; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID u32 tclassid; #endif }; -#ifdef CONFIG_NET_CLS_ROUTE -u32 fib_rules_tclass(struct fib_result *res) +#ifdef CONFIG_IP_ROUTE_CLASSID +u32 fib_rules_tclass(const struct fib_result *res) { return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; } #endif -int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) +int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, @@ -61,7 +61,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) }; int err; - err = fib_rules_lookup(net->ipv4.rules_ops, flp, 0, &arg); + err = fib_rules_lookup(net->ipv4.rules_ops, flowi4_to_flowi(flp), 0, &arg); res->r = arg.rule; return err; @@ -95,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, if (!tbl) goto errout; - err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags); + err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags); if (err > 0) err = -EAGAIN; errout: @@ -106,14 +106,15 @@ errout: static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags) { struct fib4_rule *r = (struct fib4_rule *) rule; - __be32 daddr = fl->fl4_dst; - __be32 saddr = fl->fl4_src; + struct flowi4 *fl4 = &fl->u.ip4; + __be32 daddr = fl4->daddr; + __be32 saddr = fl4->saddr; if (((saddr ^ r->src) & r->srcmask) || ((daddr ^ r->dst) & r->dstmask)) return 0; - if (r->tos && (r->tos != fl->fl4_tos)) + if (r->tos && (r->tos != fl4->flowi4_tos)) return 0; return 1; @@ -165,7 +166,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (frh->dst_len) rule4->dst = nla_get_be32(tb[FRA_DST]); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); #endif @@ -195,7 +196,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->tos && (rule4->tos != frh->tos)) return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; #endif @@ -224,7 +225,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule4->src_len) NLA_PUT_BE32(skb, FRA_SRC, rule4->src); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid) NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 12d3dc3df1b7..622ac4c95026 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -49,7 +49,7 @@ static DEFINE_SPINLOCK(fib_info_lock); static struct hlist_head *fib_info_hash; static struct hlist_head *fib_info_laddrhash; -static unsigned int fib_hash_size; +static unsigned int fib_info_hash_size; static unsigned int fib_info_cnt; #define DEVINDEX_HASHBITS 8 @@ -90,11 +90,7 @@ static DEFINE_SPINLOCK(fib_multipath_lock); #define endfor_nexthops(fi) } -static const struct -{ - int error; - u8 scope; -} fib_props[RTN_MAX + 1] = { +const struct fib_prop fib_props[RTN_MAX + 1] = { [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE, @@ -152,6 +148,8 @@ static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + if (fi->fib_metrics != (u32 *) dst_default_metrics) + kfree(fi->fib_metrics); kfree(fi); } @@ -200,7 +198,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight != onh->nh_weight || #endif -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) @@ -221,7 +219,7 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val) static inline unsigned int fib_info_hashfn(const struct fib_info *fi) { - unsigned int mask = (fib_hash_size - 1); + unsigned int mask = (fib_info_hash_size - 1); unsigned int val = fi->fib_nhs; val ^= fi->fib_protocol; @@ -422,7 +420,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla = nla_find(attrs, attrlen, RTA_GATEWAY); nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; #endif @@ -476,7 +474,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla && nla_get_be32(nla) != nh->nh_gw) return 1; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) return 1; @@ -562,16 +560,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } rcu_read_lock(); { - struct flowi fl = { - .fl4_dst = nh->nh_gw, - .fl4_scope = cfg->fc_scope + 1, - .oif = nh->nh_oif, + struct flowi4 fl4 = { + .daddr = nh->nh_gw, + .flowi4_scope = cfg->fc_scope + 1, + .flowi4_oif = nh->nh_oif, }; /* It is not necessary, but requires a bit of thinking */ - if (fl.fl4_scope < RT_SCOPE_LINK) - fl.fl4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl, &res); + if (fl4.flowi4_scope < RT_SCOPE_LINK) + fl4.flowi4_scope = RT_SCOPE_LINK; + err = fib_lookup(net, &fl4, &res); if (err) { rcu_read_unlock(); return err; @@ -613,14 +611,14 @@ out: static inline unsigned int fib_laddr_hashfn(__be32 val) { - unsigned int mask = (fib_hash_size - 1); + unsigned int mask = (fib_info_hash_size - 1); return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; } -static struct hlist_head *fib_hash_alloc(int bytes) +static struct hlist_head *fib_info_hash_alloc(int bytes) { if (bytes <= PAGE_SIZE) return kzalloc(bytes, GFP_KERNEL); @@ -630,7 +628,7 @@ static struct hlist_head *fib_hash_alloc(int bytes) get_order(bytes)); } -static void fib_hash_free(struct hlist_head *hash, int bytes) +static void fib_info_hash_free(struct hlist_head *hash, int bytes) { if (!hash) return; @@ -641,18 +639,18 @@ static void fib_hash_free(struct hlist_head *hash, int bytes) free_pages((unsigned long) hash, get_order(bytes)); } -static void fib_hash_move(struct hlist_head *new_info_hash, - struct hlist_head *new_laddrhash, - unsigned int new_size) +static void fib_info_hash_move(struct hlist_head *new_info_hash, + struct hlist_head *new_laddrhash, + unsigned int new_size) { struct hlist_head *old_info_hash, *old_laddrhash; - unsigned int old_size = fib_hash_size; + unsigned int old_size = fib_info_hash_size; unsigned int i, bytes; spin_lock_bh(&fib_info_lock); old_info_hash = fib_info_hash; old_laddrhash = fib_info_laddrhash; - fib_hash_size = new_size; + fib_info_hash_size = new_size; for (i = 0; i < old_size; i++) { struct hlist_head *head = &fib_info_hash[i]; @@ -693,8 +691,8 @@ static void fib_hash_move(struct hlist_head *new_info_hash, spin_unlock_bh(&fib_info_lock); bytes = old_size * sizeof(struct hlist_head *); - fib_hash_free(old_info_hash, bytes); - fib_hash_free(old_laddrhash, bytes); + fib_info_hash_free(old_info_hash, bytes); + fib_info_hash_free(old_laddrhash, bytes); } struct fib_info *fib_create_info(struct fib_config *cfg) @@ -705,6 +703,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) int nhs = 1; struct net *net = cfg->fc_nlinfo.nl_net; + if (cfg->fc_type > RTN_MAX) + goto err_inval; + /* Fast check to catch the most weird cases */ if (fib_props[cfg->fc_type].scope > cfg->fc_scope) goto err_inval; @@ -718,8 +719,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) #endif err = -ENOBUFS; - if (fib_info_cnt >= fib_hash_size) { - unsigned int new_size = fib_hash_size << 1; + if (fib_info_cnt >= fib_info_hash_size) { + unsigned int new_size = fib_info_hash_size << 1; struct hlist_head *new_info_hash; struct hlist_head *new_laddrhash; unsigned int bytes; @@ -727,21 +728,27 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (!new_size) new_size = 1; bytes = new_size * sizeof(struct hlist_head *); - new_info_hash = fib_hash_alloc(bytes); - new_laddrhash = fib_hash_alloc(bytes); + new_info_hash = fib_info_hash_alloc(bytes); + new_laddrhash = fib_info_hash_alloc(bytes); if (!new_info_hash || !new_laddrhash) { - fib_hash_free(new_info_hash, bytes); - fib_hash_free(new_laddrhash, bytes); + fib_info_hash_free(new_info_hash, bytes); + fib_info_hash_free(new_laddrhash, bytes); } else - fib_hash_move(new_info_hash, new_laddrhash, new_size); + fib_info_hash_move(new_info_hash, new_laddrhash, new_size); - if (!fib_hash_size) + if (!fib_info_hash_size) goto failure; } fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (fi == NULL) goto failure; + if (cfg->fc_mx) { + fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + if (!fi->fib_metrics) + goto failure; + } else + fi->fib_metrics = (u32 *) dst_default_metrics; fib_info_cnt++; fi->fib_net = hold_net(net); @@ -779,7 +786,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) goto err_inval; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) goto err_inval; #endif @@ -792,7 +799,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -804,6 +811,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) goto err_inval; goto link_it; + } else { + switch (cfg->fc_type) { + case RTN_UNICAST: + case RTN_LOCAL: + case RTN_BROADCAST: + case RTN_ANYCAST: + case RTN_MULTICAST: + break; + default: + goto err_inval; + } } if (cfg->fc_scope > RT_SCOPE_HOST) @@ -835,6 +853,13 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; } + change_nexthops(fi) { + nexthop_nh->nh_cfg_scope = cfg->fc_scope; + nexthop_nh->nh_saddr = inet_select_addr(nexthop_nh->nh_dev, + nexthop_nh->nh_gw, + nexthop_nh->nh_cfg_scope); + } endfor_nexthops(fi) + link_it: ofi = fib_find_info(fi); if (ofi) { @@ -880,84 +905,6 @@ failure: return ERR_PTR(err); } -/* Note! fib_semantic_match intentionally uses RCU list functions. */ -int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, int prefixlen, int fib_flags) -{ - struct fib_alias *fa; - int nh_sel = 0; - - list_for_each_entry_rcu(fa, head, fa_list) { - int err; - - if (fa->fa_tos && - fa->fa_tos != flp->fl4_tos) - continue; - - if (fa->fa_scope < flp->fl4_scope) - continue; - - fib_alias_accessed(fa); - - err = fib_props[fa->fa_type].error; - if (err == 0) { - struct fib_info *fi = fa->fa_info; - - if (fi->fib_flags & RTNH_F_DEAD) - continue; - - switch (fa->fa_type) { - case RTN_UNICAST: - case RTN_LOCAL: - case RTN_BROADCAST: - case RTN_ANYCAST: - case RTN_MULTICAST: - for_nexthops(fi) { - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (!flp->oif || flp->oif == nh->nh_oif) - break; - } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (nhsel < fi->fib_nhs) { - nh_sel = nhsel; - goto out_fill_res; - } -#else - if (nhsel < 1) - goto out_fill_res; -#endif - endfor_nexthops(fi); - continue; - - default: - pr_warning("fib_semantic_match bad type %#x\n", - fa->fa_type); - return -EINVAL; - } - } - return err; - } - return 1; - -out_fill_res: - res->prefixlen = prefixlen; - res->nh_sel = nh_sel; - res->type = fa->fa_type; - res->scope = fa->fa_scope; - res->fi = fa->fa_info; - if (!(fib_flags & FIB_LOOKUP_NOREF)) - atomic_inc(&res->fi->fib_clntref); - return 0; -} - -/* Find appropriate source address to this destination */ - -__be32 __fib_res_prefsrc(struct fib_result *res) -{ - return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); -} - int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, unsigned int flags) @@ -1002,7 +949,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (fi->fib_nh->nh_oif) NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); #endif @@ -1027,7 +974,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (nh->nh_gw) NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); #endif @@ -1125,6 +1072,80 @@ int fib_sync_down_dev(struct net_device *dev, int force) return ret; } +/* Must be invoked inside of an RCU protected region. */ +void fib_select_default(struct fib_result *res) +{ + struct fib_info *fi = NULL, *last_resort = NULL; + struct list_head *fa_head = res->fa_head; + struct fib_table *tb = res->table; + int order = -1, last_idx = -1; + struct fib_alias *fa; + + list_for_each_entry_rcu(fa, fa_head, fa_list) { + struct fib_info *next_fi = fa->fa_info; + + if (fa->fa_scope != res->scope || + fa->fa_type != RTN_UNICAST) + continue; + + if (next_fi->fib_priority > res->fi->fib_priority) + break; + if (!next_fi->fib_nh[0].nh_gw || + next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + continue; + + fib_alias_accessed(fa); + + if (fi == NULL) { + if (next_fi != res->fi) + break; + } else if (!fib_detect_death(fi, order, &last_resort, + &last_idx, tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + fi = next_fi; + order++; + } + + if (order <= 0 || fi == NULL) { + tb->tb_default = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, + tb->tb_default)) { + fib_result_assign(res, fi); + tb->tb_default = order; + goto out; + } + + if (last_idx >= 0) + fib_result_assign(res, last_resort); + tb->tb_default = last_idx; +out: + return; +} + +void fib_update_nh_saddrs(struct net_device *dev) +{ + struct hlist_head *head; + struct hlist_node *node; + struct fib_nh *nh; + unsigned int hash; + + hash = fib_devindex_hashfn(dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_for_each_entry(nh, node, head, nh_hash) { + if (nh->nh_dev != dev) + continue; + nh->nh_saddr = inet_select_addr(nh->nh_dev, + nh->nh_gw, + nh->nh_cfg_scope); + } +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH /* @@ -1189,7 +1210,7 @@ int fib_sync_up(struct net_device *dev) * The algorithm is suboptimal, but it provides really * fair weighted route distribution. */ -void fib_select_multipath(const struct flowi *flp, struct fib_result *res) +void fib_select_multipath(struct fib_result *res) { struct fib_info *fi = res->fi; int w; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 0f280348e0fd..3d28a35c2e1a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -95,7 +95,7 @@ typedef unsigned int t_key; #define IS_TNODE(n) (!(n->parent & T_LEAF)) #define IS_LEAF(n) (n->parent & T_LEAF) -struct node { +struct rt_trie_node { unsigned long parent; t_key key; }; @@ -126,7 +126,7 @@ struct tnode { struct work_struct work; struct tnode *tnode_free; }; - struct node *child[0]; + struct rt_trie_node *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -151,16 +151,16 @@ struct trie_stat { }; struct trie { - struct node *trie; + struct rt_trie_node *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif }; -static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n); +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull); -static struct node *resize(struct trie *t, struct tnode *tn); +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ @@ -177,12 +177,12 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; -static inline struct tnode *node_parent(struct node *node) +static inline struct tnode *node_parent(struct rt_trie_node *node) { return (struct tnode *)(node->parent & ~NODE_TYPE_MASK); } -static inline struct tnode *node_parent_rcu(struct node *node) +static inline struct tnode *node_parent_rcu(struct rt_trie_node *node) { struct tnode *ret = node_parent(node); @@ -192,22 +192,22 @@ static inline struct tnode *node_parent_rcu(struct node *node) /* Same as rcu_assign_pointer * but that macro() assumes that value is a pointer. */ -static inline void node_set_parent(struct node *node, struct tnode *ptr) +static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) { smp_wmb(); node->parent = (unsigned long)ptr | NODE_TYPE(node); } -static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i) +static inline struct rt_trie_node *tnode_get_child(struct tnode *tn, unsigned int i) { BUG_ON(i >= 1U << tn->bits); return tn->child[i]; } -static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) +static inline struct rt_trie_node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { - struct node *ret = tnode_get_child(tn, i); + struct rt_trie_node *ret = tnode_get_child(tn, i); return rcu_dereference_rtnl(ret); } @@ -217,12 +217,12 @@ static inline int tnode_child_length(const struct tnode *tn) return 1 << tn->bits; } -static inline t_key mask_pfx(t_key k, unsigned short l) +static inline t_key mask_pfx(t_key k, unsigned int l) { return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); } -static inline t_key tkey_extract_bits(t_key a, int offset, int bits) +static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) { if (offset < KEYLENGTH) return ((t_key)(a << offset)) >> (KEYLENGTH - bits); @@ -378,7 +378,7 @@ static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); size_t size = sizeof(struct tnode) + - (sizeof(struct node *) << tn->bits); + (sizeof(struct rt_trie_node *) << tn->bits); if (size <= PAGE_SIZE) kfree(tn); @@ -402,7 +402,7 @@ static void tnode_free_safe(struct tnode *tn) tn->tnode_free = tnode_free_head; tnode_free_head = tn; tnode_free_size += sizeof(struct tnode) + - (sizeof(struct node *) << tn->bits); + (sizeof(struct rt_trie_node *) << tn->bits); } static void tnode_free_flush(void) @@ -443,7 +443,7 @@ static struct leaf_info *leaf_info_new(int plen) static struct tnode *tnode_new(t_key key, int pos, int bits) { - size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits); + size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); struct tnode *tn = tnode_alloc(sz); if (tn) { @@ -456,7 +456,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) } pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct node) << bits); + sizeof(struct rt_trie_node) << bits); return tn; } @@ -465,7 +465,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(const struct tnode *tn, const struct node *n) +static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n) { if (n == NULL || IS_LEAF(n)) return 0; @@ -474,7 +474,7 @@ static inline int tnode_full(const struct tnode *tn, const struct node *n) } static inline void put_child(struct trie *t, struct tnode *tn, int i, - struct node *n) + struct rt_trie_node *n) { tnode_put_child_reorg(tn, i, n, -1); } @@ -484,10 +484,10 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, * Update the value of full_children and empty_children. */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, +static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, int wasfull) { - struct node *chi = tn->child[i]; + struct rt_trie_node *chi = tn->child[i]; int isfull; BUG_ON(i >= 1<<tn->bits); @@ -515,7 +515,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, } #define MAX_WORK 10 -static struct node *resize(struct trie *t, struct tnode *tn) +static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) { int i; struct tnode *old_tn; @@ -605,7 +605,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Keep root node larger */ - if (!node_parent((struct node *)tn)) { + if (!node_parent((struct rt_trie_node *)tn)) { inflate_threshold_use = inflate_threshold_root; halve_threshold_use = halve_threshold_root; } else { @@ -635,7 +635,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Return if at least one inflate is run */ if (max_work != MAX_WORK) - return (struct node *) tn; + return (struct rt_trie_node *) tn; /* * Halve as long as the number of empty children in this @@ -663,7 +663,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (tn->empty_children == tnode_child_length(tn) - 1) { one_child: for (i = 0; i < tnode_child_length(tn); i++) { - struct node *n; + struct rt_trie_node *n; n = tn->child[i]; if (!n) @@ -676,7 +676,7 @@ one_child: return n; } } - return (struct node *) tn; + return (struct rt_trie_node *) tn; } static struct tnode *inflate(struct trie *t, struct tnode *tn) @@ -723,14 +723,14 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) goto nomem; } - put_child(t, tn, 2*i, (struct node *) left); - put_child(t, tn, 2*i+1, (struct node *) right); + put_child(t, tn, 2*i, (struct rt_trie_node *) left); + put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); } } for (i = 0; i < olen; i++) { struct tnode *inode; - struct node *node = tnode_get_child(oldtnode, i); + struct rt_trie_node *node = tnode_get_child(oldtnode, i); struct tnode *left, *right; int size, j; @@ -825,7 +825,7 @@ nomem: static struct tnode *halve(struct trie *t, struct tnode *tn) { struct tnode *oldtnode = tn; - struct node *left, *right; + struct rt_trie_node *left, *right; int i; int olen = tnode_child_length(tn); @@ -856,7 +856,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (!newn) goto nomem; - put_child(t, tn, i/2, (struct node *)newn); + put_child(t, tn, i/2, (struct rt_trie_node *)newn); } } @@ -958,7 +958,7 @@ fib_find_node(struct trie *t, u32 key) { int pos; struct tnode *tn; - struct node *n; + struct rt_trie_node *n; pos = 0; n = rcu_dereference_rtnl(t->trie); @@ -993,17 +993,17 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) key = tn->key; - while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) { + while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize(t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex, - (struct node *)tn, wasfull); + (struct rt_trie_node *)tn, wasfull); - tp = node_parent((struct node *) tn); + tp = node_parent((struct rt_trie_node *) tn); if (!tp) - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); if (!tp) @@ -1015,7 +1015,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) if (IS_TNODE(tn)) tn = (struct tnode *)resize(t, (struct tnode *)tn); - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tnode_free_flush(); } @@ -1025,7 +1025,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) { int pos, newpos; struct tnode *tp = NULL, *tn = NULL; - struct node *n; + struct rt_trie_node *n; struct leaf *l; int missbit; struct list_head *fa_head = NULL; @@ -1111,10 +1111,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) if (t->trie && n == NULL) { /* Case 2: n is NULL, and will just insert a new leaf */ - node_set_parent((struct node *)l, tp); + node_set_parent((struct rt_trie_node *)l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)l); + put_child(t, (struct tnode *)tp, cindex, (struct rt_trie_node *)l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1141,18 +1141,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) return NULL; } - node_set_parent((struct node *)tn, tp); + node_set_parent((struct rt_trie_node *)tn, tp); missbit = tkey_extract_bits(key, newpos, 1); - put_child(t, tn, missbit, (struct node *)l); + put_child(t, tn, missbit, (struct rt_trie_node *)l); put_child(t, tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, - (struct node *)tn); + (struct rt_trie_node *)tn); } else { - rcu_assign_pointer(t->trie, (struct node *)tn); + rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); tp = tn; } } @@ -1340,8 +1340,8 @@ err: } /* should be called with rcu_read_lock */ -static int check_leaf(struct trie *t, struct leaf *l, - t_key key, const struct flowi *flp, +static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, + t_key key, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct leaf_info *li; @@ -1349,40 +1349,75 @@ static int check_leaf(struct trie *t, struct leaf *l, struct hlist_node *node; hlist_for_each_entry_rcu(li, node, hhead, hlist) { - int err; + struct fib_alias *fa; int plen = li->plen; __be32 mask = inet_make_mask(plen); if (l->key != (key & ntohl(mask))) continue; - err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + struct fib_info *fi = fa->fa_info; + int nhsel, err; + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fa->fa_scope < flp->flowi4_scope) + continue; + fib_alias_accessed(fa); + err = fib_props[fa->fa_type].error; + if (err) { #ifdef CONFIG_IP_FIB_TRIE_STATS - if (err <= 0) - t->stats.semantic_match_passed++; - else - t->stats.semantic_match_miss++; + t->stats.semantic_match_miss++; +#endif + return 1; + } + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + + if (nh->nh_flags & RTNH_F_DEAD) + continue; + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + continue; + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_passed++; +#endif + res->prefixlen = plen; + res->nh_sel = nhsel; + res->type = fa->fa_type; + res->scope = fa->fa_scope; + res->fi = fi; + res->table = tb; + res->fa_head = &li->falh; + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&res->fi->fib_clntref); + return 0; + } + } + +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats.semantic_match_miss++; #endif - if (err <= 0) - return err; } return 1; } -int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, +int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; int ret; - struct node *n; + struct rt_trie_node *n; struct tnode *pn; - int pos, bits; - t_key key = ntohl(flp->fl4_dst); - int chopped_off; + unsigned int pos, bits; + t_key key = ntohl(flp->daddr); + unsigned int chopped_off; t_key cindex = 0; - int current_prefix_length = KEYLENGTH; + unsigned int current_prefix_length = KEYLENGTH; struct tnode *cn; t_key pref_mismatch; @@ -1398,7 +1433,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, /* Just a leaf? */ if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); goto found; } @@ -1423,7 +1458,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, } if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); if (ret > 0) goto backtrace; goto found; @@ -1541,7 +1576,7 @@ backtrace: if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - struct tnode *parent = node_parent_rcu((struct node *) pn); + struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn); if (!parent) goto failed; @@ -1568,7 +1603,7 @@ found: */ static void trie_leaf_remove(struct trie *t, struct leaf *l) { - struct tnode *tp = node_parent((struct node *) l); + struct tnode *tp = node_parent((struct rt_trie_node *) l); pr_debug("entering trie_leaf_remove(%p)\n", l); @@ -1706,7 +1741,7 @@ static int trie_flush_leaf(struct leaf *l) * Scan for the next right leaf starting at node p->child[idx] * Since we have back pointer, no recursion necessary. */ -static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) +static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) { do { t_key idx; @@ -1732,7 +1767,7 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) } /* Node empty, walk back up to parent */ - c = (struct node *) p; + c = (struct rt_trie_node *) p; } while ((p = node_parent_rcu(c)) != NULL); return NULL; /* Root of trie */ @@ -1753,7 +1788,7 @@ static struct leaf *trie_firstleaf(struct trie *t) static struct leaf *trie_nextleaf(struct leaf *l) { - struct node *c = (struct node *) l; + struct rt_trie_node *c = (struct rt_trie_node *) l; struct tnode *p = node_parent_rcu(c); if (!p) @@ -1802,80 +1837,6 @@ void fib_free_table(struct fib_table *tb) kfree(tb); } -void fib_table_select_default(struct fib_table *tb, - const struct flowi *flp, - struct fib_result *res) -{ - struct trie *t = (struct trie *) tb->tb_data; - int order, last_idx; - struct fib_info *fi = NULL; - struct fib_info *last_resort; - struct fib_alias *fa = NULL; - struct list_head *fa_head; - struct leaf *l; - - last_idx = -1; - last_resort = NULL; - order = -1; - - rcu_read_lock(); - - l = fib_find_node(t, 0); - if (!l) - goto out; - - fa_head = get_fa_head(l, 0); - if (!fa_head) - goto out; - - if (list_empty(fa_head)) - goto out; - - list_for_each_entry_rcu(fa, fa_head, fa_list) { - struct fib_info *next_fi = fa->fa_info; - - if (fa->fa_scope != res->scope || - fa->fa_type != RTN_UNICAST) - continue; - - if (next_fi->fib_priority > res->fi->fib_priority) - break; - if (!next_fi->fib_nh[0].nh_gw || - next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; - - fib_alias_accessed(fa); - - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, - &last_idx, tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - fi = next_fi; - order++; - } - if (order <= 0 || fi == NULL) { - tb->tb_default = -1; - goto out; - } - - if (!fib_detect_death(fi, order, &last_resort, &last_idx, - tb->tb_default)) { - fib_result_assign(res, fi); - tb->tb_default = order; - goto out; - } - if (last_idx >= 0) - fib_result_assign(res, last_resort); - tb->tb_default = last_idx; -out: - rcu_read_unlock(); -} - static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) @@ -1990,7 +1951,7 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, return skb->len; } -void __init fib_hash_init(void) +void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), @@ -2003,8 +1964,7 @@ void __init fib_hash_init(void) } -/* Fix more generic FIB names for init later */ -struct fib_table *fib_hash_table(u32 id) +struct fib_table *fib_trie_table(u32 id) { struct fib_table *tb; struct trie *t; @@ -2036,7 +1996,7 @@ struct fib_trie_iter { unsigned int depth; }; -static struct node *fib_trie_get_next(struct fib_trie_iter *iter) +static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) { struct tnode *tn = iter->tnode; unsigned int cindex = iter->index; @@ -2050,7 +2010,7 @@ static struct node *fib_trie_get_next(struct fib_trie_iter *iter) iter->tnode, iter->index, iter->depth); rescan: while (cindex < (1<<tn->bits)) { - struct node *n = tnode_get_child_rcu(tn, cindex); + struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); if (n) { if (IS_LEAF(n)) { @@ -2069,7 +2029,7 @@ rescan: } /* Current node exhausted, pop back up */ - p = node_parent_rcu((struct node *)tn); + p = node_parent_rcu((struct rt_trie_node *)tn); if (p) { cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; tn = p; @@ -2081,10 +2041,10 @@ rescan: return NULL; } -static struct node *fib_trie_get_first(struct fib_trie_iter *iter, +static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct node *n; + struct rt_trie_node *n; if (!t) return NULL; @@ -2108,7 +2068,7 @@ static struct node *fib_trie_get_first(struct fib_trie_iter *iter, static void trie_collect_stats(struct trie *t, struct trie_stat *s) { - struct node *n; + struct rt_trie_node *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); @@ -2181,7 +2141,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); - bytes += sizeof(struct node *) * pointers; + bytes += sizeof(struct rt_trie_node *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } @@ -2262,7 +2222,7 @@ static const struct file_operations fib_triestat_fops = { .release = single_release_net, }; -static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2275,7 +2235,7 @@ static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) struct fib_table *tb; hlist_for_each_entry_rcu(tb, node, head, tb_hlist) { - struct node *n; + struct rt_trie_node *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); @@ -2304,7 +2264,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; - struct node *n; + struct rt_trie_node *n; ++*pos; /* next node in same table */ @@ -2390,7 +2350,7 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t) static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; - struct node *n = v; + struct rt_trie_node *n = v; if (!node_parent_rcu(n)) fib_table_print(seq, iter->tb); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4aa1b7f01ea0..a91dc1611081 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -233,48 +233,11 @@ static inline void icmp_xmit_unlock(struct sock *sk) * Send an ICMP frame. */ -/* - * Check transmit rate limitation for given message. - * The rate information is held in the destination cache now. - * This function is generic and could be used for other purposes - * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. - * - * Note that the same dst_entry fields are modified by functions in - * route.c too, but these work for packet destinations while xrlim_allow - * works for icmp destinations. This means the rate limiting information - * for one "ip object" is shared - and these ICMPs are twice limited: - * by source and by destination. - * - * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate - * SHOULD allow setting of rate limits - * - * Shared between ICMPv4 and ICMPv6. - */ -#define XRLIM_BURST_FACTOR 6 -int xrlim_allow(struct dst_entry *dst, int timeout) -{ - unsigned long now, token = dst->rate_tokens; - int rc = 0; - - now = jiffies; - token += now - dst->rate_last; - dst->rate_last = now; - if (token > XRLIM_BURST_FACTOR * timeout) - token = XRLIM_BURST_FACTOR * timeout; - if (token >= timeout) { - token -= timeout; - rc = 1; - } - dst->rate_tokens = token; - return rc; -} -EXPORT_SYMBOL(xrlim_allow); - -static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, +static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, int type, int code) { struct dst_entry *dst = &rt->dst; - int rc = 1; + bool rc = true; if (type > NR_ICMP_TYPES) goto out; @@ -288,8 +251,12 @@ static inline int icmpv4_xrlim_allow(struct net *net, struct rtable *rt, goto out; /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) - rc = xrlim_allow(dst, net->ipv4.sysctl_icmp_ratelimit); + if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { + if (!rt->peer) + rt_bind_peer(rt, 1); + rc = inet_peer_xrlim_allow(rt->peer, + net->ipv4.sysctl_icmp_ratelimit); + } out: return rc; } @@ -386,12 +353,15 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) daddr = icmp_param->replyopts.faddr; } { - struct flowi fl = { .fl4_dst= daddr, - .fl4_src = rt->rt_spec_dst, - .fl4_tos = RT_TOS(ip_hdr(skb)->tos), - .proto = IPPROTO_ICMP }; - security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(net, &rt, &fl)) + struct flowi4 fl4 = { + .daddr = daddr, + .saddr = rt->rt_spec_dst, + .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .flowi4_proto = IPPROTO_ICMP, + }; + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) goto out_unlock; } if (icmpv4_xrlim_allow(net, rt, icmp_param->data.icmph.type, @@ -402,6 +372,97 @@ out_unlock: icmp_xmit_unlock(sk); } +static struct rtable *icmp_route_lookup(struct net *net, struct sk_buff *skb_in, + struct iphdr *iph, + __be32 saddr, u8 tos, + int type, int code, + struct icmp_bxm *param) +{ + struct flowi4 fl4 = { + .daddr = (param->replyopts.srr ? + param->replyopts.faddr : iph->saddr), + .saddr = saddr, + .flowi4_tos = RT_TOS(tos), + .flowi4_proto = IPPROTO_ICMP, + .fl4_icmp_type = type, + .fl4_icmp_code = code, + }; + struct rtable *rt, *rt2; + int err; + + security_skb_classify_flow(skb_in, flowi4_to_flowi(&fl4)); + rt = __ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return rt; + + /* No need to clone since we're just using its address. */ + rt2 = rt; + + if (!fl4.saddr) + fl4.saddr = rt->rt_src; + + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(&fl4), NULL, 0); + if (!IS_ERR(rt)) { + if (rt != rt2) + return rt; + } else if (PTR_ERR(rt) == -EPERM) { + rt = NULL; + } else + return rt; + + err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4), AF_INET); + if (err) + goto relookup_failed; + + if (inet_addr_type(net, fl4.saddr) == RTN_LOCAL) { + rt2 = __ip_route_output_key(net, &fl4); + if (IS_ERR(rt2)) + err = PTR_ERR(rt2); + } else { + struct flowi4 fl4_2 = {}; + unsigned long orefdst; + + fl4_2.daddr = fl4.saddr; + rt2 = ip_route_output_key(net, &fl4_2); + if (IS_ERR(rt2)) { + err = PTR_ERR(rt2); + goto relookup_failed; + } + /* Ugh! */ + orefdst = skb_in->_skb_refdst; /* save old refdst */ + err = ip_route_input(skb_in, fl4.daddr, fl4.saddr, + RT_TOS(tos), rt2->dst.dev); + + dst_release(&rt2->dst); + rt2 = skb_rtable(skb_in); + skb_in->_skb_refdst = orefdst; /* restore old refdst */ + } + + if (err) + goto relookup_failed; + + rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, + flowi4_to_flowi(&fl4), NULL, + XFRM_LOOKUP_ICMP); + if (!IS_ERR(rt2)) { + dst_release(&rt->dst); + rt = rt2; + } else if (PTR_ERR(rt2) == -EPERM) { + if (rt) + dst_release(&rt->dst); + return rt2; + } else { + err = PTR_ERR(rt2); + goto relookup_failed; + } + return rt; + +relookup_failed: + if (rt) + return rt; + return ERR_PTR(err); +} /* * Send an ICMP message in response to a situation @@ -507,7 +568,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) rcu_read_lock(); if (rt_is_input_route(rt) && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index_rcu(net, rt->fl.iif); + dev = dev_get_by_index_rcu(net, rt->rt_iif); if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); @@ -539,86 +600,11 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) ipc.opt = &icmp_param.replyopts; ipc.tx_flags = 0; - { - struct flowi fl = { - .fl4_dst = icmp_param.replyopts.srr ? - icmp_param.replyopts.faddr : iph->saddr, - .fl4_src = saddr, - .fl4_tos = RT_TOS(tos), - .proto = IPPROTO_ICMP, - .fl_icmp_type = type, - .fl_icmp_code = code, - }; - int err; - struct rtable *rt2; - - security_skb_classify_flow(skb_in, &fl); - if (__ip_route_output_key(net, &rt, &fl)) - goto out_unlock; - - /* No need to clone since we're just using its address. */ - rt2 = rt; - - if (!fl.nl_u.ip4_u.saddr) - fl.nl_u.ip4_u.saddr = rt->rt_src; - - err = xfrm_lookup(net, (struct dst_entry **)&rt, &fl, NULL, 0); - switch (err) { - case 0: - if (rt != rt2) - goto route_done; - break; - case -EPERM: - rt = NULL; - break; - default: - goto out_unlock; - } - - if (xfrm_decode_session_reverse(skb_in, &fl, AF_INET)) - goto relookup_failed; - - if (inet_addr_type(net, fl.fl4_src) == RTN_LOCAL) - err = __ip_route_output_key(net, &rt2, &fl); - else { - struct flowi fl2 = {}; - unsigned long orefdst; - - fl2.fl4_dst = fl.fl4_src; - if (ip_route_output_key(net, &rt2, &fl2)) - goto relookup_failed; - - /* Ugh! */ - orefdst = skb_in->_skb_refdst; /* save old refdst */ - err = ip_route_input(skb_in, fl.fl4_dst, fl.fl4_src, - RT_TOS(tos), rt2->dst.dev); - - dst_release(&rt2->dst); - rt2 = skb_rtable(skb_in); - skb_in->_skb_refdst = orefdst; /* restore old refdst */ - } - - if (err) - goto relookup_failed; - - err = xfrm_lookup(net, (struct dst_entry **)&rt2, &fl, NULL, - XFRM_LOOKUP_ICMP); - switch (err) { - case 0: - dst_release(&rt->dst); - rt = rt2; - break; - case -EPERM: - goto ende; - default: -relookup_failed: - if (!rt) - goto out_unlock; - break; - } - } + rt = icmp_route_lookup(net, skb_in, iph, saddr, tos, + type, code, &icmp_param); + if (IS_ERR(rt)) + goto out_unlock; -route_done: if (!icmpv4_xrlim_allow(net, rt, type, code)) goto ende; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e0e77e297de3..1fd3d9ce8398 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -321,14 +321,12 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) } igmp_skb_size(skb) = size; - { - struct flowi fl = { .oif = dev->ifindex, - .fl4_dst = IGMPV3_ALL_MCR, - .proto = IPPROTO_IGMP }; - if (ip_route_output_key(net, &rt, &fl)) { - kfree_skb(skb); - return NULL; - } + rt = ip_route_output_ports(net, NULL, IGMPV3_ALL_MCR, 0, + 0, 0, + IPPROTO_IGMP, 0, dev->ifindex); + if (IS_ERR(rt)) { + kfree_skb(skb); + return NULL; } if (rt->rt_src == 0) { kfree_skb(skb); @@ -666,13 +664,12 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, else dst = group; - { - struct flowi fl = { .oif = dev->ifindex, - .fl4_dst = dst, - .proto = IPPROTO_IGMP }; - if (ip_route_output_key(net, &rt, &fl)) - return -1; - } + rt = ip_route_output_ports(net, NULL, dst, 0, + 0, 0, + IPPROTO_IGMP, 0, dev->ifindex); + if (IS_ERR(rt)) + return -1; + if (rt->rt_src == 0) { ip_rt_put(rt); return -1; @@ -1439,8 +1436,6 @@ void ip_mc_destroy_dev(struct in_device *in_dev) /* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { - struct flowi fl = { .fl4_dst = imr->imr_multiaddr.s_addr }; - struct rtable *rt; struct net_device *dev = NULL; struct in_device *idev = NULL; @@ -1454,9 +1449,14 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) return NULL; } - if (!dev && !ip_route_output_key(net, &rt, &fl)) { - dev = rt->dst.dev; - ip_rt_put(rt); + if (!dev) { + struct rtable *rt = ip_route_output(net, + imr->imr_multiaddr.s_addr, + 0, 0, 0); + if (!IS_ERR(rt)) { + dev = rt->dst.dev; + ip_rt_put(rt); + } } if (dev) { imr->imr_ifindex = dev->ifindex; @@ -2329,13 +2329,13 @@ void ip_mc_drop_socket(struct sock *sk) rtnl_unlock(); } -int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) +/* called with rcu_read_lock() */ +int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 proto) { struct ip_mc_list *im; struct ip_sf_list *psf; int rv = 0; - rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { if (im->multiaddr == mc_addr) break; @@ -2357,7 +2357,6 @@ int ip_check_mc(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u16 p } else rv = 1; /* unspecified source; tentatively allow */ } - rcu_read_unlock(); return rv; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 97e5fb765265..6c0b7f4a3d7d 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -356,20 +356,23 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options *opt = inet_rsk(req)->opt; - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .mark = sk->sk_mark, - .fl4_dst = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .fl4_src = ireq->loc_addr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet_sk(sk)->inet_sport, - .fl_ip_dport = ireq->rmt_port }; + struct flowi4 fl4 = { + .flowi4_oif = sk->sk_bound_dev_if, + .flowi4_mark = sk->sk_mark, + .daddr = ((opt && opt->srr) ? + opt->faddr : ireq->rmt_addr), + .saddr = ireq->loc_addr, + .flowi4_tos = RT_CONN_FLAGS(sk), + .flowi4_proto = sk->sk_protocol, + .flowi4_flags = inet_sk_flowi_flags(sk), + .fl4_sport = inet_sk(sk)->inet_sport, + .fl4_dport = ireq->rmt_port, + }; struct net *net = sock_net(sk); - security_req_classify_flow(req, &fl); - if (ip_route_output_flow(net, &rt, &fl, sk, 0)) + security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(net, &fl4, sk); + if (IS_ERR(rt)) goto no_route; if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) goto route_err; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c5af909cf701..3c8dfa16614d 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -505,7 +505,9 @@ restart: } rcu_read_unlock(); + local_bh_disable(); inet_twsk_deschedule(tw, twdr); + local_bh_enable(); inet_twsk_put(tw); goto restart_rcu; } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d9bc85751c74..dd1b20eca1a2 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -81,19 +81,19 @@ static const struct inet_peer peer_fake_node = { struct inet_peer_base { struct inet_peer __rcu *root; - spinlock_t lock; + seqlock_t lock; int total; }; static struct inet_peer_base v4_peers = { .root = peer_avl_empty_rcu, - .lock = __SPIN_LOCK_UNLOCKED(v4_peers.lock), + .lock = __SEQLOCK_UNLOCKED(v4_peers.lock), .total = 0, }; static struct inet_peer_base v6_peers = { .root = peer_avl_empty_rcu, - .lock = __SPIN_LOCK_UNLOCKED(v6_peers.lock), + .lock = __SEQLOCK_UNLOCKED(v6_peers.lock), .total = 0, }; @@ -167,9 +167,9 @@ static int addr_compare(const struct inetpeer_addr *a, int i, n = (a->family == AF_INET ? 1 : 4); for (i = 0; i < n; i++) { - if (a->a6[i] == b->a6[i]) + if (a->addr.a6[i] == b->addr.a6[i]) continue; - if (a->a6[i] < b->a6[i]) + if (a->addr.a6[i] < b->addr.a6[i]) return -1; return 1; } @@ -177,6 +177,9 @@ static int addr_compare(const struct inetpeer_addr *a, return 0; } +#define rcu_deref_locked(X, BASE) \ + rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) + /* * Called with local BH disabled and the pool lock held. */ @@ -187,8 +190,7 @@ static int addr_compare(const struct inetpeer_addr *a, \ stackptr = _stack; \ *stackptr++ = &_base->root; \ - for (u = rcu_dereference_protected(_base->root, \ - lockdep_is_held(&_base->lock)); \ + for (u = rcu_deref_locked(_base->root, _base); \ u != peer_avl_empty; ) { \ int cmp = addr_compare(_daddr, &u->daddr); \ if (cmp == 0) \ @@ -198,23 +200,22 @@ static int addr_compare(const struct inetpeer_addr *a, else \ v = &u->avl_right; \ *stackptr++ = v; \ - u = rcu_dereference_protected(*v, \ - lockdep_is_held(&_base->lock)); \ + u = rcu_deref_locked(*v, _base); \ } \ u; \ }) /* - * Called with rcu_read_lock_bh() + * Called with rcu_read_lock() * Because we hold no lock against a writer, its quite possible we fall * in an endless loop. * But every pointer we follow is guaranteed to be valid thanks to RCU. * We exit from this function if number of links exceeds PEER_MAXDEPTH */ -static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, - struct inet_peer_base *base) +static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, + struct inet_peer_base *base) { - struct inet_peer *u = rcu_dereference_bh(base->root); + struct inet_peer *u = rcu_dereference(base->root); int count = 0; while (u != peer_avl_empty) { @@ -230,9 +231,9 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, return u; } if (cmp == -1) - u = rcu_dereference_bh(u->avl_left); + u = rcu_dereference(u->avl_left); else - u = rcu_dereference_bh(u->avl_right); + u = rcu_dereference(u->avl_right); if (unlikely(++count == PEER_MAXDEPTH)) break; } @@ -246,13 +247,11 @@ static struct inet_peer *lookup_rcu_bh(const struct inetpeer_addr *daddr, struct inet_peer __rcu **v; \ *stackptr++ = &start->avl_left; \ v = &start->avl_left; \ - for (u = rcu_dereference_protected(*v, \ - lockdep_is_held(&base->lock)); \ + for (u = rcu_deref_locked(*v, base); \ u->avl_right != peer_avl_empty_rcu; ) { \ v = &u->avl_right; \ *stackptr++ = v; \ - u = rcu_dereference_protected(*v, \ - lockdep_is_held(&base->lock)); \ + u = rcu_deref_locked(*v, base); \ } \ u; \ }) @@ -271,21 +270,16 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], while (stackend > stack) { nodep = *--stackend; - node = rcu_dereference_protected(*nodep, - lockdep_is_held(&base->lock)); - l = rcu_dereference_protected(node->avl_left, - lockdep_is_held(&base->lock)); - r = rcu_dereference_protected(node->avl_right, - lockdep_is_held(&base->lock)); + node = rcu_deref_locked(*nodep, base); + l = rcu_deref_locked(node->avl_left, base); + r = rcu_deref_locked(node->avl_right, base); lh = node_height(l); rh = node_height(r); if (lh > rh + 1) { /* l: RH+2 */ struct inet_peer *ll, *lr, *lrl, *lrr; int lrh; - ll = rcu_dereference_protected(l->avl_left, - lockdep_is_held(&base->lock)); - lr = rcu_dereference_protected(l->avl_right, - lockdep_is_held(&base->lock)); + ll = rcu_deref_locked(l->avl_left, base); + lr = rcu_deref_locked(l->avl_right, base); lrh = node_height(lr); if (lrh <= node_height(ll)) { /* ll: RH+1 */ RCU_INIT_POINTER(node->avl_left, lr); /* lr: RH or RH+1 */ @@ -296,10 +290,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], l->avl_height = node->avl_height + 1; RCU_INIT_POINTER(*nodep, l); } else { /* ll: RH, lr: RH+1 */ - lrl = rcu_dereference_protected(lr->avl_left, - lockdep_is_held(&base->lock)); /* lrl: RH or RH-1 */ - lrr = rcu_dereference_protected(lr->avl_right, - lockdep_is_held(&base->lock)); /* lrr: RH or RH-1 */ + lrl = rcu_deref_locked(lr->avl_left, base);/* lrl: RH or RH-1 */ + lrr = rcu_deref_locked(lr->avl_right, base);/* lrr: RH or RH-1 */ RCU_INIT_POINTER(node->avl_left, lrr); /* lrr: RH or RH-1 */ RCU_INIT_POINTER(node->avl_right, r); /* r: RH */ node->avl_height = rh + 1; /* node: RH+1 */ @@ -314,10 +306,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], } else if (rh > lh + 1) { /* r: LH+2 */ struct inet_peer *rr, *rl, *rlr, *rll; int rlh; - rr = rcu_dereference_protected(r->avl_right, - lockdep_is_held(&base->lock)); - rl = rcu_dereference_protected(r->avl_left, - lockdep_is_held(&base->lock)); + rr = rcu_deref_locked(r->avl_right, base); + rl = rcu_deref_locked(r->avl_left, base); rlh = node_height(rl); if (rlh <= node_height(rr)) { /* rr: LH+1 */ RCU_INIT_POINTER(node->avl_right, rl); /* rl: LH or LH+1 */ @@ -328,10 +318,8 @@ static void peer_avl_rebalance(struct inet_peer __rcu **stack[], r->avl_height = node->avl_height + 1; RCU_INIT_POINTER(*nodep, r); } else { /* rr: RH, rl: RH+1 */ - rlr = rcu_dereference_protected(rl->avl_right, - lockdep_is_held(&base->lock)); /* rlr: LH or LH-1 */ - rll = rcu_dereference_protected(rl->avl_left, - lockdep_is_held(&base->lock)); /* rll: LH or LH-1 */ + rlr = rcu_deref_locked(rl->avl_right, base);/* rlr: LH or LH-1 */ + rll = rcu_deref_locked(rl->avl_left, base);/* rll: LH or LH-1 */ RCU_INIT_POINTER(node->avl_right, rll); /* rll: LH or LH-1 */ RCU_INIT_POINTER(node->avl_left, l); /* l: LH */ node->avl_height = lh + 1; /* node: LH+1 */ @@ -372,7 +360,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) do_free = 0; - spin_lock_bh(&base->lock); + write_seqlock_bh(&base->lock); /* Check the reference counter. It was artificially incremented by 1 * in cleanup() function to prevent sudden disappearing. If we can * atomically (because of lockless readers) take this last reference, @@ -392,8 +380,7 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) /* look for a node to insert instead of p */ struct inet_peer *t; t = lookup_rightempty(p, base); - BUG_ON(rcu_dereference_protected(*stackptr[-1], - lockdep_is_held(&base->lock)) != t); + BUG_ON(rcu_deref_locked(*stackptr[-1], base) != t); **--stackptr = t->avl_left; /* t is removed, t->daddr > x->daddr for any * x in p->avl_left subtree. @@ -409,10 +396,10 @@ static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base) base->total--; do_free = 1; } - spin_unlock_bh(&base->lock); + write_sequnlock_bh(&base->lock); if (do_free) - call_rcu_bh(&p->rcu, inetpeer_free_rcu); + call_rcu(&p->rcu, inetpeer_free_rcu); else /* The node is used again. Decrease the reference counter * back. The loop "cleanup -> unlink_from_unused @@ -475,15 +462,19 @@ static int cleanup_once(unsigned long ttl) struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) { struct inet_peer __rcu **stack[PEER_MAXDEPTH], ***stackptr; - struct inet_peer_base *base = family_to_base(AF_INET); + struct inet_peer_base *base = family_to_base(daddr->family); struct inet_peer *p; + unsigned int sequence; + int invalidated; /* Look up for the address quickly, lockless. * Because of a concurrent writer, we might not find an existing entry. */ - rcu_read_lock_bh(); - p = lookup_rcu_bh(daddr, base); - rcu_read_unlock_bh(); + rcu_read_lock(); + sequence = read_seqbegin(&base->lock); + p = lookup_rcu(daddr, base); + invalidated = read_seqretry(&base->lock, sequence); + rcu_read_unlock(); if (p) { /* The existing node has been found. @@ -493,14 +484,18 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) return p; } + /* If no writer did a change during our lookup, we can return early. */ + if (!create && !invalidated) + return NULL; + /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ - spin_lock_bh(&base->lock); + write_seqlock_bh(&base->lock); p = lookup(daddr, stack, base); if (p != peer_avl_empty) { atomic_inc(&p->refcnt); - spin_unlock_bh(&base->lock); + write_sequnlock_bh(&base->lock); /* Remove the entry from unused list if it was there. */ unlink_from_unused(p); return p; @@ -510,8 +505,14 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) p->daddr = *daddr; atomic_set(&p->refcnt, 1); atomic_set(&p->rid, 0); - atomic_set(&p->ip_id_count, secure_ip_id(daddr->a4)); + atomic_set(&p->ip_id_count, secure_ip_id(daddr->addr.a4)); p->tcp_ts_stamp = 0; + p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; + p->rate_tokens = 0; + p->rate_last = 0; + p->pmtu_expires = 0; + p->pmtu_orig = 0; + memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); INIT_LIST_HEAD(&p->unused); @@ -519,7 +520,7 @@ struct inet_peer *inet_getpeer(struct inetpeer_addr *daddr, int create) link_to_pool(p, base); base->total++; } - spin_unlock_bh(&base->lock); + write_sequnlock_bh(&base->lock); if (base->total >= inet_peer_threshold) /* Remove one less-recently-used entry. */ @@ -579,3 +580,44 @@ void inet_putpeer(struct inet_peer *p) local_bh_enable(); } EXPORT_SYMBOL_GPL(inet_putpeer); + +/* + * Check transmit rate limitation for given message. + * The rate information is held in the inet_peer entries now. + * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. + * + * Note that the same inet_peer fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared - and these ICMPs are twice limited: + * by source and by destination. + * + * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. + */ +#define XRLIM_BURST_FACTOR 6 +bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) +{ + unsigned long now, token; + bool rc = false; + + if (!peer) + return true; + + token = peer->rate_tokens; + now = jiffies; + token += now - peer->rate_last; + peer->rate_last = now; + if (token > XRLIM_BURST_FACTOR * timeout) + token = XRLIM_BURST_FACTOR * timeout; + if (token >= timeout) { + token -= timeout; + rc = true; + } + peer->rate_tokens = token; + return rc; +} +EXPORT_SYMBOL(inet_peer_xrlim_allow); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index eb68a0e34e49..da5941f18c3c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -769,18 +769,12 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph); } - { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = dst, - .fl4_src = tiph->saddr, - .fl4_tos = RT_TOS(tos), - .fl_gre_key = tunnel->parms.o_key - }; - if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - dev->stats.tx_carrier_errors++; - goto tx_error; - } + rt = ip_route_output_gre(dev_net(dev), dst, tiph->saddr, + tunnel->parms.o_key, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error; } tdev = rt->dst.dev; @@ -944,17 +938,13 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = iph->daddr, - .fl4_src = iph->saddr, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_GRE, - .fl_gre_key = tunnel->parms.o_key - }; - struct rtable *rt; - - if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { + struct rtable *rt = ip_route_output_gre(dev_net(dev), + iph->daddr, iph->saddr, + tunnel->parms.o_key, + RT_TOS(iph->tos), + tunnel->parms.link); + + if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } @@ -1206,17 +1196,14 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - struct flowi fl = { - .oif = t->parms.link, - .fl4_dst = t->parms.iph.daddr, - .fl4_src = t->parms.iph.saddr, - .fl4_tos = RT_TOS(t->parms.iph.tos), - .proto = IPPROTO_GRE, - .fl_gre_key = t->parms.o_key - }; - struct rtable *rt; - - if (ip_route_output_key(dev_net(dev), &rt, &fl)) + struct rtable *rt = ip_route_output_gre(dev_net(dev), + t->parms.iph.daddr, + t->parms.iph.saddr, + t->parms.o_key, + RT_TOS(t->parms.iph.tos), + t->parms.link); + + if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; ip_rt_put(rt); @@ -1764,4 +1751,4 @@ module_exit(ipgre_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); -MODULE_ALIAS("gre0"); +MODULE_ALIAS_NETDEV("gre0"); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d859bcc26cb7..d7b2b0987a3b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb) } } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 04c7b3ba6b39..67f241b97649 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -339,25 +339,19 @@ int ip_queue_xmit(struct sk_buff *skb) if(opt && opt->srr) daddr = opt->faddr; - { - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .mark = sk->sk_mark, - .fl4_dst = daddr, - .fl4_src = inet->inet_saddr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet->inet_sport, - .fl_ip_dport = inet->inet_dport }; - - /* If this fails, retransmit mechanism of transport layer will - * keep trying until route appears or the connection times - * itself out. - */ - security_sk_classify_flow(sk, &fl); - if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) - goto no_route; - } + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times + * itself out. + */ + rt = ip_route_output_ports(sock_net(sk), sk, + daddr, inet->inet_saddr, + inet->inet_dport, + inet->inet_sport, + sk->sk_protocol, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if); + if (IS_ERR(rt)) + goto no_route; sk_setup_caps(sk, &rt->dst); } skb_dst_set_noref(skb, &rt->dst); @@ -733,6 +727,7 @@ csum_page(struct page *page, int offset, int copy) } static inline int ip_ufo_append_data(struct sock *sk, + struct sk_buff_head *queue, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int hh_len, int fragheaderlen, @@ -745,7 +740,7 @@ static inline int ip_ufo_append_data(struct sock *sk, * device, so create one single skb packet containing complete * udp datagram */ - if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { + if ((skb = skb_peek_tail(queue)) == NULL) { skb = sock_alloc_send_skb(sk, hh_len + fragheaderlen + transhdrlen + 20, (flags & MSG_DONTWAIT), &err); @@ -767,40 +762,28 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->ip_summed = CHECKSUM_PARTIAL; skb->csum = 0; - sk->sk_sndmsg_off = 0; /* specify the length of each IP datagram fragment */ skb_shinfo(skb)->gso_size = mtu - fragheaderlen; skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - __skb_queue_tail(&sk->sk_write_queue, skb); + __skb_queue_tail(queue, skb); } return skb_append_datato_frags(sk, skb, getfrag, from, (length - transhdrlen)); } -/* - * ip_append_data() and ip_append_page() can make one large IP datagram - * from many pieces of data. Each pieces will be holded on the socket - * until ip_push_pending_frames() is called. Each piece can be a page - * or non-page data. - * - * Not only UDP, other transport protocols - e.g. raw sockets - can use - * this interface potentially. - * - * LATER: length must be adjusted by pad at tail, when it is required. - */ -int ip_append_data(struct sock *sk, - int getfrag(void *from, char *to, int offset, int len, - int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, - struct ipcm_cookie *ipc, struct rtable **rtp, - unsigned int flags) +static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue, + struct inet_cork *cork, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + unsigned int flags) { struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; - struct ip_options *opt = NULL; + struct ip_options *opt = cork->opt; int hh_len; int exthdrlen; int mtu; @@ -809,58 +792,19 @@ int ip_append_data(struct sock *sk, int offset = 0; unsigned int maxfraglen, fragheaderlen; int csummode = CHECKSUM_NONE; - struct rtable *rt; + struct rtable *rt = (struct rtable *)cork->dst; - if (flags&MSG_PROBE) - return 0; - - if (skb_queue_empty(&sk->sk_write_queue)) { - /* - * setup for corking. - */ - opt = ipc->opt; - if (opt) { - if (inet->cork.opt == NULL) { - inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); - if (unlikely(inet->cork.opt == NULL)) - return -ENOBUFS; - } - memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); - inet->cork.flags |= IPCORK_OPT; - inet->cork.addr = ipc->addr; - } - rt = *rtp; - if (unlikely(!rt)) - return -EFAULT; - /* - * We steal reference to this route, caller should not release it - */ - *rtp = NULL; - inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ? - rt->dst.dev->mtu : - dst_mtu(rt->dst.path); - inet->cork.dst = &rt->dst; - inet->cork.length = 0; - sk->sk_sndmsg_page = NULL; - sk->sk_sndmsg_off = 0; - exthdrlen = rt->dst.header_len; - length += exthdrlen; - transhdrlen += exthdrlen; - } else { - rt = (struct rtable *)inet->cork.dst; - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + exthdrlen = transhdrlen ? rt->dst.header_len : 0; + length += exthdrlen; + transhdrlen += exthdrlen; + mtu = cork->fragsize; - transhdrlen = 0; - exthdrlen = 0; - mtu = inet->cork.fragsize; - } hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; - if (inet->cork.length + length > 0xFFFF - fragheaderlen) { + if (cork->length + length > 0xFFFF - fragheaderlen) { ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu-exthdrlen); return -EMSGSIZE; @@ -876,15 +820,15 @@ int ip_append_data(struct sock *sk, !exthdrlen) csummode = CHECKSUM_PARTIAL; - skb = skb_peek_tail(&sk->sk_write_queue); + skb = skb_peek_tail(queue); - inet->cork.length += length; + cork->length += length; if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { - err = ip_ufo_append_data(sk, getfrag, from, length, hh_len, - fragheaderlen, transhdrlen, mtu, - flags); + err = ip_ufo_append_data(sk, queue, getfrag, from, length, + hh_len, fragheaderlen, transhdrlen, + mtu, flags); if (err) goto error; return 0; @@ -961,7 +905,7 @@ alloc_new_skb: else /* only the initial fragment is time stamped */ - ipc->tx_flags = 0; + cork->tx_flags = 0; } if (skb == NULL) goto error; @@ -972,7 +916,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); - skb_shinfo(skb)->tx_flags = ipc->tx_flags; + skb_shinfo(skb)->tx_flags = cork->tx_flags; /* * Find where to start putting bytes. @@ -1009,7 +953,7 @@ alloc_new_skb: /* * Put the packet on the pending queue. */ - __skb_queue_tail(&sk->sk_write_queue, skb); + __skb_queue_tail(queue, skb); continue; } @@ -1029,8 +973,8 @@ alloc_new_skb: } else { int i = skb_shinfo(skb)->nr_frags; skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; - struct page *page = sk->sk_sndmsg_page; - int off = sk->sk_sndmsg_off; + struct page *page = cork->page; + int off = cork->off; unsigned int left; if (page && (left = PAGE_SIZE - off) > 0) { @@ -1042,7 +986,7 @@ alloc_new_skb: goto error; } get_page(page); - skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); + skb_fill_page_desc(skb, i, page, off, 0); frag = &skb_shinfo(skb)->frags[i]; } } else if (i < MAX_SKB_FRAGS) { @@ -1053,8 +997,8 @@ alloc_new_skb: err = -ENOMEM; goto error; } - sk->sk_sndmsg_page = page; - sk->sk_sndmsg_off = 0; + cork->page = page; + cork->off = 0; skb_fill_page_desc(skb, i, page, 0, 0); frag = &skb_shinfo(skb)->frags[i]; @@ -1066,7 +1010,7 @@ alloc_new_skb: err = -EFAULT; goto error; } - sk->sk_sndmsg_off += copy; + cork->off += copy; frag->size += copy; skb->len += copy; skb->data_len += copy; @@ -1080,11 +1024,87 @@ alloc_new_skb: return 0; error: - inet->cork.length -= length; + cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } +static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, + struct ipcm_cookie *ipc, struct rtable **rtp) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_options *opt; + struct rtable *rt; + + /* + * setup for corking. + */ + opt = ipc->opt; + if (opt) { + if (cork->opt == NULL) { + cork->opt = kmalloc(sizeof(struct ip_options) + 40, + sk->sk_allocation); + if (unlikely(cork->opt == NULL)) + return -ENOBUFS; + } + memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen); + cork->flags |= IPCORK_OPT; + cork->addr = ipc->addr; + } + rt = *rtp; + if (unlikely(!rt)) + return -EFAULT; + /* + * We steal reference to this route, caller should not release it + */ + *rtp = NULL; + cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(rt->dst.path); + cork->dst = &rt->dst; + cork->length = 0; + cork->tx_flags = ipc->tx_flags; + cork->page = NULL; + cork->off = 0; + + return 0; +} + +/* + * ip_append_data() and ip_append_page() can make one large IP datagram + * from many pieces of data. Each pieces will be holded on the socket + * until ip_push_pending_frames() is called. Each piece can be a page + * or non-page data. + * + * Not only UDP, other transport protocols - e.g. raw sockets - can use + * this interface potentially. + * + * LATER: length must be adjusted by pad at tail, when it is required. + */ +int ip_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) { + err = ip_setup_cork(sk, &inet->cork, ipc, rtp); + if (err) + return err; + } else { + transhdrlen = 0; + } + + return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag, + from, length, transhdrlen, flags); +} + ssize_t ip_append_page(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -1228,40 +1248,41 @@ error: return err; } -static void ip_cork_release(struct inet_sock *inet) +static void ip_cork_release(struct inet_cork *cork) { - inet->cork.flags &= ~IPCORK_OPT; - kfree(inet->cork.opt); - inet->cork.opt = NULL; - dst_release(inet->cork.dst); - inet->cork.dst = NULL; + cork->flags &= ~IPCORK_OPT; + kfree(cork->opt); + cork->opt = NULL; + dst_release(cork->dst); + cork->dst = NULL; } /* * Combined all pending IP fragments on the socket as one IP datagram * and push them out. */ -int ip_push_pending_frames(struct sock *sk) +struct sk_buff *__ip_make_skb(struct sock *sk, + struct sk_buff_head *queue, + struct inet_cork *cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ip_options *opt = NULL; - struct rtable *rt = (struct rtable *)inet->cork.dst; + struct rtable *rt = (struct rtable *)cork->dst; struct iphdr *iph; __be16 df = 0; __u8 ttl; - int err = 0; - if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + if ((skb = __skb_dequeue(queue)) == NULL) goto out; tail_skb = &(skb_shinfo(skb)->frag_list); /* move skb->data to ip header from ext header */ if (skb->data < skb_network_header(skb)) __skb_pull(skb, skb_network_offset(skb)); - while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + while ((tmp_skb = __skb_dequeue(queue)) != NULL) { __skb_pull(tmp_skb, skb_network_header_len(skb)); *tail_skb = tmp_skb; tail_skb = &(tmp_skb->next); @@ -1287,8 +1308,8 @@ int ip_push_pending_frames(struct sock *sk) ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); - if (inet->cork.flags & IPCORK_OPT) - opt = inet->cork.opt; + if (cork->flags & IPCORK_OPT) + opt = cork->opt; if (rt->rt_type == RTN_MULTICAST) ttl = inet->mc_ttl; @@ -1300,7 +1321,7 @@ int ip_push_pending_frames(struct sock *sk) iph->ihl = 5; if (opt) { iph->ihl += opt->optlen>>2; - ip_options_build(skb, opt, inet->cork.addr, rt, 0); + ip_options_build(skb, opt, cork->addr, rt, 0); } iph->tos = inet->tos; iph->frag_off = df; @@ -1316,44 +1337,95 @@ int ip_push_pending_frames(struct sock *sk) * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount */ - inet->cork.dst = NULL; + cork->dst = NULL; skb_dst_set(skb, &rt->dst); if (iph->protocol == IPPROTO_ICMP) icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); - /* Netfilter gets whole the not fragmented skb. */ + ip_cork_release(cork); +out: + return skb; +} + +int ip_send_skb(struct sk_buff *skb) +{ + struct net *net = sock_net(skb->sk); + int err; + err = ip_local_out(skb); if (err) { if (err > 0) err = net_xmit_errno(err); if (err) - goto error; + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); } -out: - ip_cork_release(inet); return err; +} -error: - IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); - goto out; +int ip_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb; + + skb = ip_finish_skb(sk); + if (!skb) + return 0; + + /* Netfilter gets whole the not fragmented skb. */ + return ip_send_skb(skb); } /* * Throw away all pending data on the socket. */ -void ip_flush_pending_frames(struct sock *sk) +static void __ip_flush_pending_frames(struct sock *sk, + struct sk_buff_head *queue, + struct inet_cork *cork) { struct sk_buff *skb; - while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) + while ((skb = __skb_dequeue_tail(queue)) != NULL) kfree_skb(skb); - ip_cork_release(inet_sk(sk)); + ip_cork_release(cork); +} + +void ip_flush_pending_frames(struct sock *sk) +{ + __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork); } +struct sk_buff *ip_make_skb(struct sock *sk, + int getfrag(void *from, char *to, int offset, + int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable **rtp, + unsigned int flags) +{ + struct inet_cork cork = {}; + struct sk_buff_head queue; + int err; + + if (flags & MSG_PROBE) + return NULL; + + __skb_queue_head_init(&queue); + + err = ip_setup_cork(sk, &cork, ipc, rtp); + if (err) + return ERR_PTR(err); + + err = __ip_append_data(sk, &queue, &cork, getfrag, + from, length, transhdrlen, flags); + if (err) { + __ip_flush_pending_frames(sk, &queue, &cork); + return ERR_PTR(err); + } + + return __ip_make_skb(sk, &queue, &cork); +} /* * Fetch data from kernel space and fill in checksum if needed. @@ -1402,16 +1474,19 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar } { - struct flowi fl = { .oif = arg->bound_dev_if, - .fl4_dst = daddr, - .fl4_src = rt->rt_spec_dst, - .fl4_tos = RT_TOS(ip_hdr(skb)->tos), - .fl_ip_sport = tcp_hdr(skb)->dest, - .fl_ip_dport = tcp_hdr(skb)->source, - .proto = sk->sk_protocol, - .flags = ip_reply_arg_flowi_flags(arg) }; - security_skb_classify_flow(skb, &fl); - if (ip_route_output_key(sock_net(sk), &rt, &fl)) + struct flowi4 fl4 = { + .flowi4_oif = arg->bound_dev_if, + .daddr = daddr, + .saddr = rt->rt_spec_dst, + .flowi4_tos = RT_TOS(ip_hdr(skb)->tos), + .fl4_sport = tcp_hdr(skb)->dest, + .fl4_dport = tcp_hdr(skb)->source, + .flowi4_proto = sk->sk_protocol, + .flowi4_flags = ip_reply_arg_flowi_flags(arg), + }; + security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) return; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 988f52fba54a..bfc17c5914e7 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -460,19 +460,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error_icmp; } - { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = dst, - .fl4_src= tiph->saddr, - .fl4_tos = RT_TOS(tos), - .proto = IPPROTO_IPIP - }; - - if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - dev->stats.tx_carrier_errors++; - goto tx_error_icmp; - } + rt = ip_route_output_ports(dev_net(dev), NULL, + dst, tiph->saddr, + 0, 0, + IPPROTO_IPIP, RT_TOS(tos), + tunnel->parms.link); + if (IS_ERR(rt)) { + dev->stats.tx_carrier_errors++; + goto tx_error_icmp; } tdev = rt->dst.dev; @@ -583,16 +578,14 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) iph = &tunnel->parms.iph; if (iph->daddr) { - struct flowi fl = { - .oif = tunnel->parms.link, - .fl4_dst = iph->daddr, - .fl4_src = iph->saddr, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_IPIP - }; - struct rtable *rt; - - if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { + struct rtable *rt = ip_route_output_ports(dev_net(dev), NULL, + iph->daddr, iph->saddr, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), + tunnel->parms.link); + + if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } @@ -913,4 +906,4 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); MODULE_LICENSE("GPL"); -MODULE_ALIAS("tunl0"); +MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3f3a9afd73e0..1f62eaeb6de4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -60,6 +60,7 @@ #include <linux/notifier.h> #include <linux/if_arp.h> #include <linux/netfilter_ipv4.h> +#include <linux/compat.h> #include <net/ipip.h> #include <net/checksum.h> #include <net/netlink.h> @@ -147,14 +148,15 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) return NULL; } -static int ipmr_fib_lookup(struct net *net, struct flowi *flp, +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { struct ipmr_result res; struct fib_lookup_arg arg = { .result = &res, }; int err; - err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg); + err = fib_rules_lookup(net->ipv4.mr_rules_ops, + flowi4_to_flowi(flp4), 0, &arg); if (err < 0) return err; *mrt = res.mrt; @@ -282,7 +284,7 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id) return net->ipv4.mrt; } -static int ipmr_fib_lookup(struct net *net, struct flowi *flp, +static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { *mrt = net->ipv4.mrt; @@ -434,14 +436,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; - struct flowi fl = { - .oif = dev->ifindex, - .iif = skb->skb_iif, - .mark = skb->mark, + struct flowi4 fl4 = { + .flowi4_oif = dev->ifindex, + .flowi4_iif = skb->skb_iif, + .flowi4_mark = skb->mark, }; int err; - err = ipmr_fib_lookup(net, &fl, &mrt); + err = ipmr_fib_lookup(net, &fl4, &mrt); if (err < 0) { kfree_skb(skb); return err; @@ -1434,6 +1436,81 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) } } +#ifdef CONFIG_COMPAT +struct compat_sioc_sg_req { + struct in_addr src; + struct in_addr grp; + compat_ulong_t pktcnt; + compat_ulong_t bytecnt; + compat_ulong_t wrong_if; +}; + +struct compat_sioc_vif_req { + vifi_t vifi; /* Which iface */ + compat_ulong_t icount; + compat_ulong_t ocount; + compat_ulong_t ibytes; + compat_ulong_t obytes; +}; + +int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + struct compat_sioc_sg_req sr; + struct compat_sioc_vif_req vr; + struct vif_device *vif; + struct mfc_cache *c; + struct net *net = sock_net(sk); + struct mr_table *mrt; + + mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); + if (mrt == NULL) + return -ENOENT; + + switch (cmd) { + case SIOCGETVIFCNT: + if (copy_from_user(&vr, arg, sizeof(vr))) + return -EFAULT; + if (vr.vifi >= mrt->maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif = &mrt->vif_table[vr.vifi]; + if (VIF_EXISTS(mrt, vr.vifi)) { + vr.icount = vif->pkt_in; + vr.ocount = vif->pkt_out; + vr.ibytes = vif->bytes_in; + vr.obytes = vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg, &vr, sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT: + if (copy_from_user(&sr, arg, sizeof(sr))) + return -EFAULT; + + rcu_read_lock(); + c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + rcu_read_unlock(); + + if (copy_to_user(arg, &sr, sizeof(sr))) + return -EFAULT; + return 0; + } + rcu_read_unlock(); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} +#endif + static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -1535,26 +1612,20 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, #endif if (vif->flags & VIFF_TUNNEL) { - struct flowi fl = { - .oif = vif->link, - .fl4_dst = vif->remote, - .fl4_src = vif->local, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_IPIP - }; - - if (ip_route_output_key(net, &rt, &fl)) + rt = ip_route_output_ports(net, NULL, + vif->remote, vif->local, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), vif->link); + if (IS_ERR(rt)) goto out_free; encap = sizeof(struct iphdr); } else { - struct flowi fl = { - .oif = vif->link, - .fl4_dst = iph->daddr, - .fl4_tos = RT_TOS(iph->tos), - .proto = IPPROTO_IPIP - }; - - if (ip_route_output_key(net, &rt, &fl)) + rt = ip_route_output_ports(net, NULL, iph->daddr, 0, + 0, 0, + IPPROTO_IPIP, + RT_TOS(iph->tos), vif->link); + if (IS_ERR(rt)) goto out_free; } @@ -1717,6 +1788,24 @@ dont_forward: return 0; } +static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct rtable *rt) +{ + struct flowi4 fl4 = { + .daddr = rt->rt_key_dst, + .saddr = rt->rt_key_src, + .flowi4_tos = rt->rt_tos, + .flowi4_oif = rt->rt_oif, + .flowi4_iif = rt->rt_iif, + .flowi4_mark = rt->rt_mark, + }; + struct mr_table *mrt; + int err; + + err = ipmr_fib_lookup(net, &fl4, &mrt); + if (err) + return ERR_PTR(err); + return mrt; +} /* * Multicast packets for forwarding arrive here @@ -1729,7 +1818,6 @@ int ip_mr_input(struct sk_buff *skb) struct net *net = dev_net(skb->dev); int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; struct mr_table *mrt; - int err; /* Packet is looped back after forward, it should not be * forwarded second time, but still can be delivered locally. @@ -1737,12 +1825,11 @@ int ip_mr_input(struct sk_buff *skb) if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; - err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); - if (err < 0) { + mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb)); + if (IS_ERR(mrt)) { kfree_skb(skb); - return err; + return PTR_ERR(mrt); } - if (!local) { if (IPCB(skb)->opt.router_alert) { if (ip_call_ra_chain(skb)) @@ -1870,9 +1957,9 @@ int pim_rcv_v1(struct sk_buff *skb) pim = igmp_hdr(skb); - if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) + mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb)); + if (IS_ERR(mrt)) goto drop; - if (!mrt->mroute_do_pim || pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) goto drop; @@ -1902,9 +1989,9 @@ static int pim_rcv(struct sk_buff *skb) csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; - if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0) + mrt = ipmr_rt_fib_lookup(net, skb_rtable(skb)); + if (IS_ERR(mrt)) goto drop; - if (__pim_rcv(mrt, skb, sizeof(*pim))) { drop: kfree_skb(skb); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 994a1f29ebbc..f3c0b549b8e1 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -16,7 +16,7 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) struct net *net = dev_net(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - struct flowi fl = {}; + struct flowi4 fl4 = {}; unsigned long orefdst; unsigned int hh_len; unsigned int type; @@ -31,14 +31,15 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) * packets with foreign saddr to appear on the NF_INET_LOCAL_OUT hook. */ if (addr_type == RTN_LOCAL) { - fl.fl4_dst = iph->daddr; + fl4.daddr = iph->daddr; if (type == RTN_LOCAL) - fl.fl4_src = iph->saddr; - fl.fl4_tos = RT_TOS(iph->tos); - fl.oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; - fl.mark = skb->mark; - fl.flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; - if (ip_route_output_key(net, &rt, &fl) != 0) + fl4.saddr = iph->saddr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) return -1; /* Drop old route. */ @@ -47,8 +48,9 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) } else { /* non-local src, find valid iif to satisfy * rp-filter when calling ip_route_input. */ - fl.fl4_dst = iph->saddr; - if (ip_route_output_key(net, &rt, &fl) != 0) + fl4.daddr = iph->saddr; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) return -1; orefdst = skb->_skb_refdst; @@ -66,10 +68,11 @@ int ip_route_me_harder(struct sk_buff *skb, unsigned addr_type) #ifdef CONFIG_XFRM if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && - xfrm_decode_session(skb, &fl, AF_INET) == 0) { + xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); skb_dst_set(skb, NULL); - if (xfrm_lookup(net, &dst, &fl, skb->sk, 0)) + dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), skb->sk, 0); + if (IS_ERR(dst)) return -1; skb_dst_set(skb, dst); } @@ -102,7 +105,8 @@ int ip_xfrm_me_harder(struct sk_buff *skb) dst = ((struct xfrm_dst *)dst)->route; dst_hold(dst); - if (xfrm_lookup(dev_net(dst->dev), &dst, &fl, skb->sk, 0) < 0) + dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0); + if (IS_ERR(dst)) return -1; skb_dst_drop(skb); @@ -219,7 +223,11 @@ static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, static int nf_ip_route(struct dst_entry **dst, struct flowi *fl) { - return ip_route_output_key(&init_net, (struct rtable **)dst, fl); + struct rtable *rt = ip_route_output_key(&init_net, &fl->u.ip4); + if (IS_ERR(rt)) + return PTR_ERR(rt); + *dst = &rt->dst; + return 0; } static const struct nf_afinfo nf_ip_afinfo = { diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index babd1a2bae5f..1dfc18a03fd4 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -64,16 +64,6 @@ config IP_NF_IPTABLES if IP_NF_IPTABLES # The matches. -config IP_NF_MATCH_ADDRTYPE - tristate '"addrtype" address type match support' - depends on NETFILTER_ADVANCED - help - This option allows you to match what routing thinks of an address, - eg. UNICAST, LOCAL, BROADCAST, ... - - If you want to compile it as a module, say M here and read - <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. - config IP_NF_MATCH_AH tristate '"ah" match support' depends on NETFILTER_ADVANCED @@ -206,8 +196,9 @@ config IP_NF_TARGET_REDIRECT config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" - depends on NF_NAT + depends on NF_CONNTRACK_SNMP && NF_NAT depends on NETFILTER_ADVANCED + default NF_NAT && NF_CONNTRACK_SNMP ---help--- This module implements an Application Layer Gateway (ALG) for diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 19eb59d01037..dca2082ec683 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -48,7 +48,6 @@ obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o # matches -obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e855fffaed95..4b5d457c2d76 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1065,6 +1066,7 @@ static int do_replace(struct net *net, const void __user *user, /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1333,6 +1335,7 @@ static int translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); + xt_compat_init_offsets(NFPROTO_ARP, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, @@ -1486,6 +1489,7 @@ static int compat_do_replace(struct net *net, void __user *user, return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1738,6 +1742,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; try_then_request_module(xt_find_revision(NFPROTO_ARP, rev.name, rev.revision, 1, &ret), diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index b8ddcc480ed9..a5e52a9f0a12 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -60,12 +60,12 @@ static int checkentry(const struct xt_tgchk_param *par) if (mangle->flags & ~ARPT_MANGLE_MASK || !(mangle->flags & ARPT_MANGLE_MASK)) - return false; + return -EINVAL; if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && mangle->target != XT_CONTINUE) - return false; - return true; + return -EINVAL; + return 0; } static struct xt_target arpt_mangle_reg __read_mostly = { diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 652efea013dc..ffcea0d1678e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -387,7 +387,7 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned)(-v) - 1; break; } - if (*stackptr == 0) { + if (*stackptr <= origptr) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " @@ -427,10 +427,10 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - xt_info_rdunlock_bh(); pr_debug("Exiting %s; resetting sp from %u to %u\n", __func__, *stackptr, origptr); *stackptr = origptr; + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; #else @@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1261,6 +1262,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len) /* overflow check */ if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -1664,6 +1666,7 @@ translate_compat_table(struct net *net, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); + xt_compat_init_offsets(AF_INET, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, @@ -1805,6 +1808,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; + tmp.name[sizeof(tmp.name)-1] = 0; newinfo = xt_alloc_table_info(tmp.size); if (!newinfo) @@ -2034,6 +2038,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EFAULT; break; } + rev.name[sizeof(rev.name)-1] = 0; if (cmd == IPT_SO_GET_REVISION_TARGET) target = 1; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 1e26a4897655..d609ac3cb9a4 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) * that the ->target() function isn't called after ->destroy() */ ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) { - pr_info("no conntrack!\n"); - /* FIXME: need to drop invalid ones, since replies - * to outgoing connections of other nodes will be - * marked as INVALID */ + if (ct == NULL) return NF_DROP; - } /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ @@ -669,8 +664,11 @@ static ssize_t clusterip_proc_write(struct file *file, const char __user *input, char buffer[PROC_WRITELEN+1]; unsigned long nodenum; - if (copy_from_user(buffer, input, PROC_WRITELEN)) + if (size > PROC_WRITELEN) + return -EIO; + if (copy_from_user(buffer, input, size)) return -EFAULT; + buffer[size] = 0; if (*buffer == '+') { nodenum = simple_strtoul(buffer+1, NULL, 10); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 72ffc8fda2e9..d76d6c9ed946 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf, } #endif - /* MAC logging for input path only. */ - if (in && !out) + if (in != NULL) dump_mac_header(m, loginfo, skb); dump_packet(m, loginfo, skb, 0); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c deleted file mode 100644 index db8bff0fb86d..000000000000 --- a/net/ipv4/netfilter/ipt_addrtype.c +++ /dev/null @@ -1,134 +0,0 @@ -/* - * iptables module to match inet_addr_type() of an ip. - * - * Copyright (c) 2004 Patrick McHardy <kaber@trash.net> - * (C) 2007 Laszlo Attila Toth <panther@balabit.hu> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/ip.h> -#include <net/route.h> - -#include <linux/netfilter_ipv4/ipt_addrtype.h> -#include <linux/netfilter/x_tables.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); -MODULE_DESCRIPTION("Xtables: address type match for IPv4"); - -static inline bool match_type(struct net *net, const struct net_device *dev, - __be32 addr, u_int16_t mask) -{ - return !!(mask & (1 << inet_dev_addr_type(net, dev, addr))); -} - -static bool -addrtype_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - const struct ipt_addrtype_info *info = par->matchinfo; - const struct iphdr *iph = ip_hdr(skb); - bool ret = true; - - if (info->source) - ret &= match_type(net, NULL, iph->saddr, info->source) ^ - info->invert_source; - if (info->dest) - ret &= match_type(net, NULL, iph->daddr, info->dest) ^ - info->invert_dest; - - return ret; -} - -static bool -addrtype_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) -{ - struct net *net = dev_net(par->in ? par->in : par->out); - const struct ipt_addrtype_info_v1 *info = par->matchinfo; - const struct iphdr *iph = ip_hdr(skb); - const struct net_device *dev = NULL; - bool ret = true; - - if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) - dev = par->in; - else if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) - dev = par->out; - - if (info->source) - ret &= match_type(net, dev, iph->saddr, info->source) ^ - (info->flags & IPT_ADDRTYPE_INVERT_SOURCE); - if (ret && info->dest) - ret &= match_type(net, dev, iph->daddr, info->dest) ^ - !!(info->flags & IPT_ADDRTYPE_INVERT_DEST); - return ret; -} - -static int addrtype_mt_checkentry_v1(const struct xt_mtchk_param *par) -{ - struct ipt_addrtype_info_v1 *info = par->matchinfo; - - if (info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { - pr_info("both incoming and outgoing " - "interface limitation cannot be selected\n"); - return -EINVAL; - } - - if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_LOCAL_IN)) && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_OUT) { - pr_info("output interface limitation " - "not valid in PREROUTING and INPUT\n"); - return -EINVAL; - } - - if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | - (1 << NF_INET_LOCAL_OUT)) && - info->flags & IPT_ADDRTYPE_LIMIT_IFACE_IN) { - pr_info("input interface limitation " - "not valid in POSTROUTING and OUTPUT\n"); - return -EINVAL; - } - - return 0; -} - -static struct xt_match addrtype_mt_reg[] __read_mostly = { - { - .name = "addrtype", - .family = NFPROTO_IPV4, - .match = addrtype_mt_v0, - .matchsize = sizeof(struct ipt_addrtype_info), - .me = THIS_MODULE - }, - { - .name = "addrtype", - .family = NFPROTO_IPV4, - .revision = 1, - .match = addrtype_mt_v1, - .checkentry = addrtype_mt_checkentry_v1, - .matchsize = sizeof(struct ipt_addrtype_info_v1), - .me = THIS_MODULE - } -}; - -static int __init addrtype_mt_init(void) -{ - return xt_register_matches(addrtype_mt_reg, - ARRAY_SIZE(addrtype_mt_reg)); -} - -static void __exit addrtype_mt_exit(void) -{ - xt_unregister_matches(addrtype_mt_reg, ARRAY_SIZE(addrtype_mt_reg)); -} - -module_init(addrtype_mt_init); -module_exit(addrtype_mt_exit); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 294a2a32f293..aef5d1fbe77d 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ - if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 63f60fc5d26a..5585980fce2e 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -20,6 +20,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_acct.h> +#include <linux/rculist_nulls.h> struct ct_iter_state { struct seq_net_private p; @@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < net->ct.htable_size; st->bucket++) { - n = rcu_dereference(net->ct.hash[st->bucket].first); + n = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= net->ct.htable_size) return NULL; } - head = rcu_dereference(net->ct.hash[st->bucket].first); + head = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); } return head; } @@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(net->ct.expect_hash[st->bucket].first); + n = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); if (n) return n; } @@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(net->ct.expect_hash[st->bucket].first); + head = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); } return head; } diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 0f23b3f06df0..703f366fd235 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; + int res; exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); - if (ret == 0) + res = nf_ct_expect_related(exp); + if (res == 0) break; - else if (ret != -EBUSY) { + else if (res != -EBUSY) { port = 0; break; } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index c04787ce1a71..21bcf471b25a 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, manips not an issue. */ if (maniptype == IP_NAT_MANIP_SRC && !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { - if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { + /* try the original tuple first */ + if (in_range(orig_tuple, range)) { + if (!nf_nat_used_tuple(orig_tuple, ct)) { + *tuple = *orig_tuple; + return; + } + } else if (find_appropriate_src(net, zone, orig_tuple, tuple, + range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; @@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct, struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; - int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); /* nat helper or nfctnetlink also setup binding */ nat = nfct_nat(ct); @@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; } - /* Place in source hash if this is the first time. */ - if (have_to_hash) { + if (maniptype == IP_NAT_MANIP_SRC) { unsigned int srchash; srchash = hash_by_src(net, nf_ct_zone(ct), @@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct, /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) - set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_DST_NAT_DONE; else - set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } @@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) int ret = 0; spin_lock_bh(&nf_nat_lock); - if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { + if (rcu_dereference_protected( + nf_nat_protos[proto->protonum], + lockdep_is_held(&nf_nat_lock) + ) != &nf_nat_unknown_protocol) { ret = -EBUSY; goto out; } @@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (nat == NULL || nat->ct == NULL) return; - NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); + NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); spin_lock_bh(&nf_nat_lock); hlist_del_rcu(&nat->bysource); @@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old) struct nf_conn_nat *old_nat = old; struct nf_conn *ct = old_nat->ct; - if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) + if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) return; spin_lock_bh(&nf_nat_lock); - new_nat->ct = ct; hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); spin_unlock_bh(&nf_nat_lock); } @@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net) { /* Leave them the same for the moment. */ net->ipv4.nat_htable_size = net->ct.htable_size; - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); - nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - net->ipv4.nat_htable_size); + nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ee5f419d0a56..8812a02078ab 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -54,6 +54,7 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_snmp.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); @@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void) { int ret = 0; - ret = nf_conntrack_helper_register(&snmp_helper); - if (ret < 0) - return ret; + BUG_ON(nf_nat_snmp_hook != NULL); + rcu_assign_pointer(nf_nat_snmp_hook, help); + ret = nf_conntrack_helper_register(&snmp_trap_helper); if (ret < 0) { nf_conntrack_helper_unregister(&snmp_helper); @@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void) static void __exit nf_nat_snmp_basic_fini(void) { - nf_conntrack_helper_unregister(&snmp_helper); + rcu_assign_pointer(nf_nat_snmp_hook, NULL); nf_conntrack_helper_unregister(&snmp_trap_helper); } diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 95481fee8bdb..7317bdf1d457 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -31,6 +31,7 @@ #ifdef CONFIG_XFRM static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) { + struct flowi4 *fl4 = &fl->u.ip4; const struct nf_conn *ct; const struct nf_conntrack_tuple *t; enum ip_conntrack_info ctinfo; @@ -49,25 +50,25 @@ static void nat_decode_session(struct sk_buff *skb, struct flowi *fl) statusbit = IPS_SRC_NAT; if (ct->status & statusbit) { - fl->fl4_dst = t->dst.u3.ip; + fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) - fl->fl_ip_dport = t->dst.u.tcp.port; + fl4->fl4_dport = t->dst.u.tcp.port; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { - fl->fl4_src = t->src.u3.ip; + fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) - fl->fl_ip_sport = t->src.u.tcp.port; + fl4->fl4_sport = t->src.u.tcp.port; } } #endif diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a3d5ab786e81..e837ffd3edc3 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -76,6 +76,7 @@ #include <linux/seq_file.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> +#include <linux/compat.h> static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), @@ -401,7 +402,7 @@ error: return err; } -static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) { struct iovec *iov; u8 __user *type = NULL; @@ -417,7 +418,7 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) if (!iov) continue; - switch (fl->proto) { + switch (fl4->flowi4_proto) { case IPPROTO_ICMP: /* check if one-byte field is readable or not. */ if (iov->iov_base && iov->iov_len < 1) @@ -432,8 +433,8 @@ static int raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) code = iov->iov_base; if (type && code) { - if (get_user(fl->fl_icmp_type, type) || - get_user(fl->fl_icmp_code, code)) + if (get_user(fl4->fl4_icmp_type, type) || + get_user(fl4->fl4_icmp_code, code)) return -EFAULT; probed = 1; } @@ -547,25 +548,30 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } { - struct flowi fl = { .oif = ipc.oif, - .mark = sk->sk_mark, - .fl4_dst = daddr, - .fl4_src = saddr, - .fl4_tos = tos, - .proto = inet->hdrincl ? IPPROTO_RAW : - sk->sk_protocol, - }; + struct flowi4 fl4 = { + .flowi4_oif = ipc.oif, + .flowi4_mark = sk->sk_mark, + .daddr = daddr, + .saddr = saddr, + .flowi4_tos = tos, + .flowi4_proto = (inet->hdrincl ? + IPPROTO_RAW : + sk->sk_protocol), + .flowi4_flags = FLOWI_FLAG_CAN_SLEEP, + }; if (!inet->hdrincl) { - err = raw_probe_proto_opt(&fl, msg); + err = raw_probe_proto_opt(&fl4, msg); if (err) goto done; } - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto done; + } } - if (err) - goto done; err = -EACCES; if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) @@ -838,6 +844,23 @@ static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) } } +#ifdef CONFIG_COMPAT +static int compat_raw_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: + case SIOCINQ: + return -ENOIOCTLCMD; + default: +#ifdef CONFIG_IP_MROUTE + return ipmr_compat_ioctl(sk, cmd, compat_ptr(arg)); +#else + return -ENOIOCTLCMD; +#endif + } +} +#endif + struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, @@ -860,6 +883,7 @@ struct proto raw_prot = { #ifdef CONFIG_COMPAT .compat_setsockopt = compat_raw_setsockopt, .compat_getsockopt = compat_raw_getsockopt, + .compat_ioctl = compat_raw_ioctl, #endif }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 351dc4e85242..870b5182ddd8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -109,8 +109,8 @@ #include <linux/sysctl.h> #endif -#define RT_FL_TOS(oldflp) \ - ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) +#define RT_FL_TOS(oldflp4) \ + ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) #define IP_MAX_MTU 0xFFF0 @@ -131,9 +131,6 @@ static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; -static struct delayed_work expires_work; -static unsigned long expires_ljiffies; - /* * Interface to generic destination cache. */ @@ -152,6 +149,41 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { } +static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) +{ + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + u32 *p = NULL; + + if (!rt->peer) + rt_bind_peer(rt, 1); + + peer = rt->peer; + if (peer) { + u32 *old_p = __DST_METRICS_PTR(old); + unsigned long prev, new; + + p = peer->metrics; + if (inet_metrics_new(peer)) + memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + + new = (unsigned long) p; + prev = cmpxchg(&dst->_metrics, old, new); + + if (prev != old) { + p = __DST_METRICS_PTR(prev); + if (prev & DST_METRICS_READ_ONLY) + p = NULL; + } else { + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } + } + } + return p; +} + static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), @@ -159,6 +191,7 @@ static struct dst_ops ipv4_dst_ops = { .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .default_mtu = ipv4_default_mtu, + .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, @@ -171,7 +204,7 @@ static struct dst_ops ipv4_dst_ops = { const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, - ECN_OR_COST(FILLER), + ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, @@ -391,7 +424,7 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v) dst_metric(&r->dst, RTAX_WINDOW), (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) + dst_metric(&r->dst, RTAX_RTTVAR)), - r->fl.fl4_tos, + r->rt_tos, r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1, r->dst.hh ? (r->dst.hh->hh_output == dev_queue_xmit) : 0, @@ -514,7 +547,7 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; @@ -567,14 +600,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net) if (!pde) goto err2; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif @@ -588,7 +621,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } @@ -632,7 +665,7 @@ static inline int rt_fast_clean(struct rtable *rth) static inline int rt_valuable(struct rtable *rth) { return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || - rth->dst.expires; + (rth->peer && rth->peer->pmtu_expires); } static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) @@ -643,13 +676,7 @@ static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long t if (atomic_read(&rth->dst.__refcnt)) goto out; - ret = 1; - if (rth->dst.expires && - time_after_eq(jiffies, rth->dst.expires)) - goto out; - age = jiffies - rth->dst.lastuse; - ret = 0; if ((age <= tmo1 && !rt_fast_clean(rth)) || (age <= tmo2 && rt_valuable(rth))) goto out; @@ -684,22 +711,22 @@ static inline bool rt_caching(const struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count; } -static inline bool compare_hash_inputs(const struct flowi *fl1, - const struct flowi *fl2) +static inline bool compare_hash_inputs(const struct rtable *rt1, + const struct rtable *rt2) { - return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | - ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | - (fl1->iif ^ fl2->iif)) == 0); + return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | + ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | + (rt1->rt_iif ^ rt2->rt_iif)) == 0); } -static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) +static inline int compare_keys(struct rtable *rt1, struct rtable *rt2) { - return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) | - ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) | - (fl1->mark ^ fl2->mark) | - (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) | - (fl1->oif ^ fl2->oif) | - (fl1->iif ^ fl2->iif)) == 0; + return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) | + ((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) | + (rt1->rt_mark ^ rt2->rt_mark) | + (rt1->rt_tos ^ rt2->rt_tos) | + (rt1->rt_oif ^ rt2->rt_oif) | + (rt1->rt_iif ^ rt2->rt_iif)) == 0; } static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) @@ -786,104 +813,13 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth) const struct rtable *aux = head; while (aux != rth) { - if (compare_hash_inputs(&aux->fl, &rth->fl)) + if (compare_hash_inputs(aux, rth)) return 0; aux = rcu_dereference_protected(aux->dst.rt_next, 1); } return ONE; } -static void rt_check_expire(void) -{ - static unsigned int rover; - unsigned int i = rover, goal; - struct rtable *rth; - struct rtable __rcu **rthp; - unsigned long samples = 0; - unsigned long sum = 0, sum2 = 0; - unsigned long delta; - u64 mult; - - delta = jiffies - expires_ljiffies; - expires_ljiffies = jiffies; - mult = ((u64)delta) << rt_hash_log; - if (ip_rt_gc_timeout > 1) - do_div(mult, ip_rt_gc_timeout); - goal = (unsigned int)mult; - if (goal > rt_hash_mask) - goal = rt_hash_mask + 1; - for (; goal > 0; goal--) { - unsigned long tmo = ip_rt_gc_timeout; - unsigned long length; - - i = (i + 1) & rt_hash_mask; - rthp = &rt_hash_table[i].chain; - - if (need_resched()) - cond_resched(); - - samples++; - - if (rcu_dereference_raw(*rthp) == NULL) - continue; - length = 0; - spin_lock_bh(rt_hash_lock_addr(i)); - while ((rth = rcu_dereference_protected(*rthp, - lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) { - prefetch(rth->dst.rt_next); - if (rt_is_expired(rth)) { - *rthp = rth->dst.rt_next; - rt_free(rth); - continue; - } - if (rth->dst.expires) { - /* Entry is expired even if it is in use */ - if (time_before_eq(jiffies, rth->dst.expires)) { -nofree: - tmo >>= 1; - rthp = &rth->dst.rt_next; - /* - * We only count entries on - * a chain with equal hash inputs once - * so that entries for different QOS - * levels, and other non-hash input - * attributes don't unfairly skew - * the length computation - */ - length += has_noalias(rt_hash_table[i].chain, rth); - continue; - } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) - goto nofree; - - /* Cleanup aged off entries. */ - *rthp = rth->dst.rt_next; - rt_free(rth); - } - spin_unlock_bh(rt_hash_lock_addr(i)); - sum += length; - sum2 += length*length; - } - if (samples) { - unsigned long avg = sum / samples; - unsigned long sd = int_sqrt(sum2 / samples - avg*avg); - rt_chain_length_max = max_t(unsigned long, - ip_rt_gc_elasticity, - (avg + 4*sd) >> FRACT_BITS); - } - rover = i; -} - -/* - * rt_worker_func() is run in process context. - * we call rt_check_expire() to scan part of the hash table - */ -static void rt_worker_func(struct work_struct *work) -{ - rt_check_expire(); - schedule_delayed_work(&expires_work, ip_rt_gc_interval); -} - /* * Pertubation of rt_genid by a small quantity [1..256] * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() @@ -1078,8 +1014,8 @@ static int slow_chain_length(const struct rtable *head) return length >> FRACT_BITS; } -static int rt_intern_hash(unsigned hash, struct rtable *rt, - struct rtable **rp, struct sk_buff *skb, int ifindex) +static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt, + struct sk_buff *skb, int ifindex) { struct rtable *rth, *cand; struct rtable __rcu **rthp, **candp; @@ -1120,7 +1056,7 @@ restart: printk(KERN_WARNING "Neighbour table failure & not caching routes.\n"); ip_rt_put(rt); - return err; + return ERR_PTR(err); } } @@ -1137,7 +1073,7 @@ restart: rt_free(rth); continue; } - if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) { + if (compare_keys(rth, rt) && compare_netns(rth, rt)) { /* Put it first */ *rthp = rth->dst.rt_next; /* @@ -1157,11 +1093,9 @@ restart: spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); - if (rp) - *rp = rth; - else + if (skb) skb_dst_set(skb, &rth->dst); - return 0; + return rth; } if (!atomic_read(&rth->dst.__refcnt)) { @@ -1202,7 +1136,7 @@ restart: rt_emergency_hash_rebuild(net); spin_unlock_bh(rt_hash_lock_addr(hash)); - hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, + hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, ifindex, rt_genid(net)); goto restart; } @@ -1218,7 +1152,7 @@ restart: if (err != -ENOBUFS) { rt_drop(rt); - return err; + return ERR_PTR(err); } /* Neighbour tables are full and nothing @@ -1239,7 +1173,7 @@ restart: if (net_ratelimit()) printk(KERN_WARNING "ipv4: Neighbour table overflow.\n"); rt_drop(rt); - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); } } @@ -1265,11 +1199,16 @@ restart: spin_unlock_bh(rt_hash_lock_addr(hash)); skip_hashing: - if (rp) - *rp = rt; - else + if (skb) skb_dst_set(skb, &rt->dst); - return 0; + return rt; +} + +static atomic_t __rt_peer_genid = ATOMIC_INIT(0); + +static u32 rt_peer_genid(void) +{ + return atomic_read(&__rt_peer_genid); } void rt_bind_peer(struct rtable *rt, int create) @@ -1280,6 +1219,8 @@ void rt_bind_peer(struct rtable *rt, int create) if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); + else + rt->rt_peer_genid = rt_peer_genid(); } /* @@ -1349,13 +1290,8 @@ static void rt_del(unsigned hash, struct rtable *rt) void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { - int i, k; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rth; - struct rtable __rcu **rthp; - __be32 skeys[2] = { saddr, 0 }; - int ikeys[2] = { dev->ifindex, 0 }; - struct netevent_redirect netevent; + struct inet_peer *peer; struct net *net; if (!in_dev) @@ -1367,9 +1303,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, ipv4_is_zeronet(new_gw)) goto reject_redirect; - if (!rt_caching(net)) - goto reject_redirect; - if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; @@ -1380,91 +1313,13 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - for (i = 0; i < 2; i++) { - for (k = 0; k < 2; k++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rthp = &rt_hash_table[hash].chain; - - while ((rth = rcu_dereference(*rthp)) != NULL) { - struct rtable *rt; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->fl.oif != ikeys[k] || - rt_is_input_route(rth) || - rt_is_expired(rth) || - !net_eq(dev_net(rth->dst.dev), net)) { - rthp = &rth->dst.rt_next; - continue; - } - - if (rth->rt_dst != daddr || - rth->rt_src != saddr || - rth->dst.error || - rth->rt_gateway != old_gw || - rth->dst.dev != dev) - break; - - dst_hold(&rth->dst); - - rt = dst_alloc(&ipv4_dst_ops); - if (rt == NULL) { - ip_rt_put(rth); - return; - } - - /* Copy all the information. */ - *rt = *rth; - rt->dst.__use = 1; - atomic_set(&rt->dst.__refcnt, 1); - rt->dst.child = NULL; - if (rt->dst.dev) - dev_hold(rt->dst.dev); - rt->dst.obsolete = -1; - rt->dst.lastuse = jiffies; - rt->dst.path = &rt->dst; - rt->dst.neighbour = NULL; - rt->dst.hh = NULL; -#ifdef CONFIG_XFRM - rt->dst.xfrm = NULL; -#endif - rt->rt_genid = rt_genid(net); - rt->rt_flags |= RTCF_REDIRECTED; - - /* Gateway is different ... */ - rt->rt_gateway = new_gw; - - /* Redirect received -> path was valid */ - dst_confirm(&rth->dst); - - if (rt->peer) - atomic_inc(&rt->peer->refcnt); - - if (arp_bind_neighbour(&rt->dst) || - !(rt->dst.neighbour->nud_state & - NUD_VALID)) { - if (rt->dst.neighbour) - neigh_event_send(rt->dst.neighbour, NULL); - ip_rt_put(rth); - rt_drop(rt); - goto do_next; - } + peer = inet_getpeer_v4(daddr, 1); + if (peer) { + peer->redirect_learned.a4 = new_gw; - netevent.old = &rth->dst; - netevent.new = &rt->dst; - call_netevent_notifiers(NETEVENT_REDIRECT, - &netevent); + inet_putpeer(peer); - rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif)) - ip_rt_put(rt); - goto do_next; - } - do_next: - ; - } + atomic_inc(&__rt_peer_genid); } return; @@ -1488,18 +1343,24 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - (rt->dst.expires && - time_after_eq(jiffies, rt->dst.expires))) { - unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, - rt->fl.oif, + } else if (rt->rt_flags & RTCF_REDIRECTED) { + unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src, + rt->rt_oif, rt_genid(dev_net(dst->dev))); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", - &rt->rt_dst, rt->fl.fl4_tos); + &rt->rt_dst, rt->rt_tos); #endif rt_del(hash, rt); ret = NULL; + } else if (rt->peer && + rt->peer->pmtu_expires && + time_after_eq(jiffies, rt->peer->pmtu_expires)) { + unsigned long orig = rt->peer->pmtu_expires; + + if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) + dst_metric_set(dst, RTAX_MTU, + rt->peer->pmtu_orig); } } return ret; @@ -1525,6 +1386,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; + struct inet_peer *peer; int log_martians; rcu_read_lock(); @@ -1536,33 +1398,41 @@ void ip_rt_send_redirect(struct sk_buff *skb) log_martians = IN_DEV_LOG_MARTIANS(in_dev); rcu_read_unlock(); + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + if (!peer) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + return; + } + /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ - if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence)) - rt->dst.rate_tokens = 0; + if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) + peer->rate_tokens = 0; /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ - if (rt->dst.rate_tokens >= ip_rt_redirect_number) { - rt->dst.rate_last = jiffies; + if (peer->rate_tokens >= ip_rt_redirect_number) { + peer->rate_last = jiffies; return; } /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (rt->dst.rate_tokens == 0 || + if (peer->rate_tokens == 0 || time_after(jiffies, - (rt->dst.rate_last + - (ip_rt_redirect_load << rt->dst.rate_tokens)))) { + (peer->rate_last + + (ip_rt_redirect_load << peer->rate_tokens)))) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); - rt->dst.rate_last = jiffies; - ++rt->dst.rate_tokens; + peer->rate_last = jiffies; + ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE if (log_martians && - rt->dst.rate_tokens == ip_rt_redirect_number && + peer->rate_tokens == ip_rt_redirect_number && net_ratelimit()) printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", &rt->rt_src, rt->rt_iif, @@ -1574,7 +1444,9 @@ void ip_rt_send_redirect(struct sk_buff *skb) static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); + struct inet_peer *peer; unsigned long now; + bool send; int code; switch (rt->dst.error) { @@ -1594,15 +1466,24 @@ static int ip_error(struct sk_buff *skb) break; } - now = jiffies; - rt->dst.rate_tokens += now - rt->dst.rate_last; - if (rt->dst.rate_tokens > ip_rt_error_burst) - rt->dst.rate_tokens = ip_rt_error_burst; - rt->dst.rate_last = now; - if (rt->dst.rate_tokens >= ip_rt_error_cost) { - rt->dst.rate_tokens -= ip_rt_error_cost; - icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + + send = true; + if (peer) { + now = jiffies; + peer->rate_tokens += now - peer->rate_last; + if (peer->rate_tokens > ip_rt_error_burst) + peer->rate_tokens = ip_rt_error_burst; + peer->rate_last = now; + if (peer->rate_tokens >= ip_rt_error_cost) + peer->rate_tokens -= ip_rt_error_cost; + else + send = false; } + if (send) + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb(skb); return 0; @@ -1630,88 +1511,142 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, unsigned short new_mtu, struct net_device *dev) { - int i, k; unsigned short old_mtu = ntohs(iph->tot_len); - struct rtable *rth; - int ikeys[2] = { dev->ifindex, 0 }; - __be32 skeys[2] = { iph->saddr, 0, }; - __be32 daddr = iph->daddr; unsigned short est_mtu = 0; + struct inet_peer *peer; - for (k = 0; k < 2; k++) { - for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], - rt_genid(net)); - - rcu_read_lock(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->dst.rt_next)) { - unsigned short mtu = new_mtu; - - if (rth->fl.fl4_dst != daddr || - rth->fl.fl4_src != skeys[i] || - rth->rt_dst != daddr || - rth->rt_src != iph->saddr || - rth->fl.oif != ikeys[k] || - rt_is_input_route(rth) || - dst_metric_locked(&rth->dst, RTAX_MTU) || - !net_eq(dev_net(rth->dst.dev), net) || - rt_is_expired(rth)) - continue; + peer = inet_getpeer_v4(iph->daddr, 1); + if (peer) { + unsigned short mtu = new_mtu; + + if (new_mtu < 68 || new_mtu >= old_mtu) { + /* BSD 4.2 derived systems incorrectly adjust + * tot_len by the IP header length, and report + * a zero MTU in the ICMP message. + */ + if (mtu == 0 && + old_mtu >= 68 + (iph->ihl << 2)) + old_mtu -= iph->ihl << 2; + mtu = guess_mtu(old_mtu); + } - if (new_mtu < 68 || new_mtu >= old_mtu) { + if (mtu < ip_rt_min_pmtu) + mtu = ip_rt_min_pmtu; + if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { + unsigned long pmtu_expires; - /* BSD 4.2 compatibility hack :-( */ - if (mtu == 0 && - old_mtu >= dst_mtu(&rth->dst) && - old_mtu >= 68 + (iph->ihl << 2)) - old_mtu -= iph->ihl << 2; + pmtu_expires = jiffies + ip_rt_mtu_expires; + if (!pmtu_expires) + pmtu_expires = 1UL; - mtu = guess_mtu(old_mtu); - } - if (mtu <= dst_mtu(&rth->dst)) { - if (mtu < dst_mtu(&rth->dst)) { - dst_confirm(&rth->dst); - if (mtu < ip_rt_min_pmtu) { - u32 lock = dst_metric(&rth->dst, - RTAX_LOCK); - mtu = ip_rt_min_pmtu; - lock |= (1 << RTAX_MTU); - dst_metric_set(&rth->dst, RTAX_LOCK, - lock); - } - dst_metric_set(&rth->dst, RTAX_MTU, mtu); - dst_set_expires(&rth->dst, - ip_rt_mtu_expires); - } - est_mtu = mtu; - } - } - rcu_read_unlock(); + est_mtu = mtu; + peer->pmtu_learned = mtu; + peer->pmtu_expires = pmtu_expires; } + + inet_putpeer(peer); + + atomic_inc(&__rt_peer_genid); } return est_mtu ? : new_mtu; } +static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer) +{ + unsigned long expires = peer->pmtu_expires; + + if (time_before(jiffies, expires)) { + u32 orig_dst_mtu = dst_mtu(dst); + if (peer->pmtu_learned < orig_dst_mtu) { + if (!peer->pmtu_orig) + peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU); + dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned); + } + } else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires) + dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig); +} + static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) { - if (dst_mtu(dst) > mtu && mtu >= 68 && - !(dst_metric_locked(dst, RTAX_MTU))) { - if (mtu < ip_rt_min_pmtu) { - u32 lock = dst_metric(dst, RTAX_LOCK); + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer; + + dst_confirm(dst); + + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + if (peer) { + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; - dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU)); + if (!peer->pmtu_expires || mtu < peer->pmtu_learned) { + unsigned long pmtu_expires; + + pmtu_expires = jiffies + ip_rt_mtu_expires; + if (!pmtu_expires) + pmtu_expires = 1UL; + + peer->pmtu_learned = mtu; + peer->pmtu_expires = pmtu_expires; + + atomic_inc(&__rt_peer_genid); + rt->rt_peer_genid = rt_peer_genid(); } - dst_metric_set(dst, RTAX_MTU, mtu); - dst_set_expires(dst, ip_rt_mtu_expires); - call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst); + check_peer_pmtu(dst, peer); + + inet_putpeer(peer); + } +} + +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + + dst_confirm(&rt->dst); + + neigh_release(rt->dst.neighbour); + rt->dst.neighbour = NULL; + + rt->rt_gateway = peer->redirect_learned.a4; + if (arp_bind_neighbour(&rt->dst) || + !(rt->dst.neighbour->nud_state & NUD_VALID)) { + if (rt->dst.neighbour) + neigh_event_send(rt->dst.neighbour, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, + rt->dst.neighbour); } + return 0; } static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - if (rt_is_expired((struct rtable *)dst)) + struct rtable *rt = (struct rtable *) dst; + + if (rt_is_expired(rt)) return NULL; + if (rt->rt_peer_genid != rt_peer_genid()) { + struct inet_peer *peer; + + if (!rt->peer) + rt_bind_peer(rt, 0); + + peer = rt->peer; + if (peer && peer->pmtu_expires) + check_peer_pmtu(dst, peer); + + if (peer && peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + if (check_peer_redir(dst, peer)) + return NULL; + } + + rt->rt_peer_genid = rt_peer_genid(); + } return dst; } @@ -1720,6 +1655,10 @@ static void ipv4_dst_destroy(struct dst_entry *dst) struct rtable *rt = (struct rtable *) dst; struct inet_peer *peer = rt->peer; + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } if (peer) { rt->peer = NULL; inet_putpeer(peer); @@ -1734,8 +1673,14 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); rt = skb_rtable(skb); - if (rt) - dst_set_expires(&rt->dst, 0); + if (rt && + rt->peer && + rt->peer->pmtu_expires) { + unsigned long orig = rt->peer->pmtu_expires; + + if (cmpxchg(&rt->peer->pmtu_expires, orig, 0) == orig) + dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig); + } } static int ip_rt_bug(struct sk_buff *skb) @@ -1764,8 +1709,17 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt_is_output_route(rt)) src = rt->rt_src; else { + struct flowi4 fl4 = { + .daddr = rt->rt_key_dst, + .saddr = rt->rt_key_src, + .flowi4_tos = rt->rt_tos, + .flowi4_oif = rt->rt_oif, + .flowi4_iif = rt->rt_iif, + .flowi4_mark = rt->rt_mark, + }; + rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) + if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) src = FIB_RES_PREFSRC(res); else src = inet_select_addr(rt->dst.dev, rt->rt_gateway, @@ -1775,7 +1729,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) memcpy(addr, &src, 4); } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) @@ -1815,17 +1769,54 @@ static unsigned int ipv4_default_mtu(const struct dst_entry *dst) return mtu; } -static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +static void rt_init_metrics(struct rtable *rt, const struct flowi4 *oldflp4, + struct fib_info *fi) +{ + struct inet_peer *peer; + int create = 0; + + /* If a peer entry exists for this destination, we must hook + * it up in order to get at cached metrics. + */ + if (oldflp4 && (oldflp4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS)) + create = 1; + + rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create); + if (peer) { + rt->rt_peer_genid = rt_peer_genid(); + if (inet_metrics_new(peer)) + memcpy(peer->metrics, fi->fib_metrics, + sizeof(u32) * RTAX_MAX); + dst_init_metrics(&rt->dst, peer->metrics, false); + + if (peer->pmtu_expires) + check_peer_pmtu(&rt->dst, peer); + if (peer->redirect_learned.a4 && + peer->redirect_learned.a4 != rt->rt_gateway) { + rt->rt_gateway = peer->redirect_learned.a4; + rt->rt_flags |= RTCF_REDIRECTED; + } + } else { + if (fi->fib_metrics != (u32 *) dst_default_metrics) { + rt->fi = fi; + atomic_inc(&fi->fib_clntref); + } + dst_init_metrics(&rt->dst, fi->fib_metrics, true); + } +} + +static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *oldflp4, + const struct fib_result *res, + struct fib_info *fi, u16 type, u32 itag) { struct dst_entry *dst = &rt->dst; - struct fib_info *fi = res->fi; if (fi) { if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); - dst_import_metrics(dst, fi->fib_metrics); -#ifdef CONFIG_NET_CLS_ROUTE + rt_init_metrics(rt, oldflp4, fi); +#ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } @@ -1835,13 +1826,26 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif set_class_tag(rt, itag); #endif - rt->rt_type = res->type; + rt->rt_type = type; +} + +static struct rtable *rt_dst_alloc(bool nopolicy, bool noxfrm) +{ + struct rtable *rt = dst_alloc(&ipv4_dst_ops, 1); + if (rt) { + rt->dst.obsolete = -1; + + rt->dst.flags = DST_HOST | + (nopolicy ? DST_NOPOLICY : 0) | + (noxfrm ? DST_NOXFRM : 0); + } + return rt; } /* called in rcu_read_lock() section */ @@ -1874,31 +1878,25 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (err < 0) goto e_err; } - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; rth->dst.output = ip_rt_bug; - rth->dst.obsolete = -1; - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; + rth->rt_key_dst = daddr; rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; + rth->rt_tos = tos; + rth->rt_mark = skb->mark; + rth->rt_key_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; + rth->rt_iif = dev->ifindex; rth->dst.dev = init_net.loopback_dev; dev_hold(rth->dst.dev); - rth->fl.oif = 0; + rth->rt_oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_genid = rt_genid(dev_net(dev)); @@ -1916,7 +1914,10 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, RT_CACHE_STAT_INC(in_slow_mc); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); + rth = rt_intern_hash(hash, rth, skb, dev->ifindex); + err = 0; + if (IS_ERR(rth)) + err = PTR_ERR(rth); e_nobufs: return -ENOBUFS; @@ -1959,7 +1960,7 @@ static void ip_handle_martian_source(struct net_device *dev, /* called in rcu_read_lock() section */ static int __mkroute_input(struct sk_buff *skb, - struct fib_result *res, + const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos, struct rtable **result) @@ -2013,39 +2014,31 @@ static int __mkroute_input(struct sk_buff *skb, } } - - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(out_dev, NOXFRM)); if (!rth) { err = -ENOBUFS; goto cleanup; } - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - if (IN_DEV_CONF_GET(out_dev, NOXFRM)) - rth->dst.flags |= DST_NOXFRM; - rth->fl.fl4_dst = daddr; + rth->rt_key_dst = daddr; rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; + rth->rt_tos = tos; + rth->rt_mark = skb->mark; + rth->rt_key_src = saddr; rth->rt_src = saddr; rth->rt_gateway = daddr; - rth->rt_iif = - rth->fl.iif = in_dev->dev->ifindex; + rth->rt_iif = in_dev->dev->ifindex; rth->dst.dev = (out_dev)->dev; dev_hold(rth->dst.dev); - rth->fl.oif = 0; + rth->rt_oif = 0; rth->rt_spec_dst= spec_dst; - rth->dst.obsolete = -1; rth->dst.input = ip_forward; rth->dst.output = ip_output; rth->rt_genid = rt_genid(dev_net(rth->dst.dev)); - rt_set_nexthop(rth, res, itag); + rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag); rth->rt_flags = flags; @@ -2057,7 +2050,7 @@ static int __mkroute_input(struct sk_buff *skb, static int ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, - const struct flowi *fl, + const struct flowi4 *fl4, struct in_device *in_dev, __be32 daddr, __be32 saddr, u32 tos) { @@ -2066,8 +2059,8 @@ static int ip_mkroute_input(struct sk_buff *skb, unsigned hash; #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) - fib_select_multipath(fl, res); + if (res->fi && res->fi->fib_nhs > 1) + fib_select_multipath(res); #endif /* create a routing cache entry */ @@ -2076,9 +2069,12 @@ static int ip_mkroute_input(struct sk_buff *skb, return err; /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl->iif, + hash = rt_hash(daddr, saddr, fl4->flowi4_iif, rt_genid(dev_net(rth->dst.dev))); - return rt_intern_hash(hash, rth, NULL, skb, fl->iif); + rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif); + if (IS_ERR(rth)) + return PTR_ERR(rth); + return 0; } /* @@ -2097,12 +2093,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct flowi fl = { .fl4_dst = daddr, - .fl4_src = saddr, - .fl4_tos = tos, - .fl4_scope = RT_SCOPE_UNIVERSE, - .mark = skb->mark, - .iif = dev->ifindex }; + struct flowi4 fl4; unsigned flags = 0; u32 itag = 0; struct rtable * rth; @@ -2139,7 +2130,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* * Now we are ready to route packet. */ - err = fib_lookup(net, &fl, &res); + fl4.flowi4_oif = 0; + fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_tos = tos; + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.daddr = daddr; + fl4.saddr = saddr; + err = fib_lookup(net, &fl4, &res); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; @@ -2168,7 +2166,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type != RTN_UNICAST) goto martian_destination; - err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); + err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos); out: return err; brd_input: @@ -2190,29 +2188,23 @@ brd_input: RT_CACHE_STAT_INC(in_brd); local_input: - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), false); if (!rth) goto e_nobufs; rth->dst.output= ip_rt_bug; - rth->dst.obsolete = -1; rth->rt_genid = rt_genid(net); - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - rth->fl.fl4_dst = daddr; + rth->rt_key_dst = daddr; rth->rt_dst = daddr; - rth->fl.fl4_tos = tos; - rth->fl.mark = skb->mark; - rth->fl.fl4_src = saddr; + rth->rt_tos = tos; + rth->rt_mark = skb->mark; + rth->rt_key_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif - rth->rt_iif = - rth->fl.iif = dev->ifindex; + rth->rt_iif = dev->ifindex; rth->dst.dev = net->loopback_dev; dev_hold(rth->dst.dev); rth->rt_gateway = daddr; @@ -2225,8 +2217,11 @@ local_input: rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; - hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); + hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net)); + rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif); + err = 0; + if (IS_ERR(rth)) + err = PTR_ERR(rth); goto out; no_route: @@ -2288,12 +2283,12 @@ int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; rth = rcu_dereference(rth->dst.rt_next)) { - if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) | - ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) | - (rth->fl.iif ^ iif) | - rth->fl.oif | - (rth->fl.fl4_tos ^ tos)) == 0 && - rth->fl.mark == skb->mark && + if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) | + ((__force u32)rth->rt_key_src ^ (__force u32)saddr) | + (rth->rt_iif ^ iif) | + rth->rt_oif | + (rth->rt_tos ^ tos)) == 0 && + rth->rt_mark == skb->mark && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { if (noref) { @@ -2326,8 +2321,8 @@ skip_cache: struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev) { - int our = ip_check_mc(in_dev, daddr, saddr, - ip_hdr(skb)->protocol); + int our = ip_check_mc_rcu(in_dev, daddr, saddr, + ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE || @@ -2351,98 +2346,91 @@ skip_cache: EXPORT_SYMBOL(ip_route_input_common); /* called with rcu_read_lock() */ -static int __mkroute_output(struct rtable **result, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) +static struct rtable *__mkroute_output(const struct fib_result *res, + const struct flowi4 *fl4, + const struct flowi4 *oldflp4, + struct net_device *dev_out, + unsigned int flags) { - struct rtable *rth; + struct fib_info *fi = res->fi; + u32 tos = RT_FL_TOS(oldflp4); struct in_device *in_dev; - u32 tos = RT_FL_TOS(oldflp); + u16 type = res->type; + struct rtable *rth; - if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) - return -EINVAL; + if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK)) + return ERR_PTR(-EINVAL); - if (ipv4_is_lbcast(fl->fl4_dst)) - res->type = RTN_BROADCAST; - else if (ipv4_is_multicast(fl->fl4_dst)) - res->type = RTN_MULTICAST; - else if (ipv4_is_zeronet(fl->fl4_dst)) - return -EINVAL; + if (ipv4_is_lbcast(fl4->daddr)) + type = RTN_BROADCAST; + else if (ipv4_is_multicast(fl4->daddr)) + type = RTN_MULTICAST; + else if (ipv4_is_zeronet(fl4->daddr)) + return ERR_PTR(-EINVAL); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) - return -EINVAL; + return ERR_PTR(-EINVAL); - if (res->type == RTN_BROADCAST) { + if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - res->fi = NULL; - } else if (res->type == RTN_MULTICAST) { + fi = NULL; + } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; - if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, - oldflp->proto)) + if (!ip_check_mc_rcu(in_dev, oldflp4->daddr, oldflp4->saddr, + oldflp4->flowi4_proto)) flags &= ~RTCF_LOCAL; /* If multicast route do not exist use * default one, but do not gateway in this case. * Yes, it is hack. */ - if (res->fi && res->prefixlen < 4) - res->fi = NULL; + if (fi && res->prefixlen < 4) + fi = NULL; } - - rth = dst_alloc(&ipv4_dst_ops); + rth = rt_dst_alloc(IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(in_dev, NOXFRM)); if (!rth) - return -ENOBUFS; - - atomic_set(&rth->dst.__refcnt, 1); - rth->dst.flags= DST_HOST; - if (IN_DEV_CONF_GET(in_dev, NOXFRM)) - rth->dst.flags |= DST_NOXFRM; - if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) - rth->dst.flags |= DST_NOPOLICY; - - rth->fl.fl4_dst = oldflp->fl4_dst; - rth->fl.fl4_tos = tos; - rth->fl.fl4_src = oldflp->fl4_src; - rth->fl.oif = oldflp->oif; - rth->fl.mark = oldflp->mark; - rth->rt_dst = fl->fl4_dst; - rth->rt_src = fl->fl4_src; - rth->rt_iif = oldflp->oif ? : dev_out->ifindex; + return ERR_PTR(-ENOBUFS); + + rth->rt_key_dst = oldflp4->daddr; + rth->rt_tos = tos; + rth->rt_key_src = oldflp4->saddr; + rth->rt_oif = oldflp4->flowi4_oif; + rth->rt_mark = oldflp4->flowi4_mark; + rth->rt_dst = fl4->daddr; + rth->rt_src = fl4->saddr; + rth->rt_iif = 0; /* get references to the devices that are to be hold by the routing cache entry */ rth->dst.dev = dev_out; dev_hold(dev_out); - rth->rt_gateway = fl->fl4_dst; - rth->rt_spec_dst= fl->fl4_src; + rth->rt_gateway = fl4->daddr; + rth->rt_spec_dst= fl4->saddr; rth->dst.output=ip_output; - rth->dst.obsolete = -1; rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) { rth->dst.input = ip_local_deliver; - rth->rt_spec_dst = fl->fl4_dst; + rth->rt_spec_dst = fl4->daddr; } if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - rth->rt_spec_dst = fl->fl4_src; + rth->rt_spec_dst = fl4->saddr; if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE - if (res->type == RTN_MULTICAST) { + if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && - !ipv4_is_local_multicast(oldflp->fl4_dst)) { + !ipv4_is_local_multicast(oldflp4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mc_output; } @@ -2450,31 +2438,10 @@ static int __mkroute_output(struct rtable **result, #endif } - rt_set_nexthop(rth, res, 0); + rt_set_nexthop(rth, oldflp4, res, fi, type, 0); rth->rt_flags = flags; - *result = rth; - return 0; -} - -/* called with rcu_read_lock() */ -static int ip_mkroute_output(struct rtable **rp, - struct fib_result *res, - const struct flowi *fl, - const struct flowi *oldflp, - struct net_device *dev_out, - unsigned flags) -{ - struct rtable *rth = NULL; - int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); - unsigned hash; - if (err == 0) { - hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, - rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); - } - - return err; + return rth; } /* @@ -2482,34 +2449,36 @@ static int ip_mkroute_output(struct rtable **rp, * called with rcu_read_lock(); */ -static int ip_route_output_slow(struct net *net, struct rtable **rp, - const struct flowi *oldflp) -{ - u32 tos = RT_FL_TOS(oldflp); - struct flowi fl = { .fl4_dst = oldflp->fl4_dst, - .fl4_src = oldflp->fl4_src, - .fl4_tos = tos & IPTOS_RT_MASK, - .fl4_scope = ((tos & RTO_ONLINK) ? - RT_SCOPE_LINK : RT_SCOPE_UNIVERSE), - .mark = oldflp->mark, - .iif = net->loopback_dev->ifindex, - .oif = oldflp->oif }; +static struct rtable *ip_route_output_slow(struct net *net, + const struct flowi4 *oldflp4) +{ + u32 tos = RT_FL_TOS(oldflp4); + struct flowi4 fl4; struct fib_result res; unsigned int flags = 0; struct net_device *dev_out = NULL; - int err; - + struct rtable *rth; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - if (oldflp->fl4_src) { - err = -EINVAL; - if (ipv4_is_multicast(oldflp->fl4_src) || - ipv4_is_lbcast(oldflp->fl4_src) || - ipv4_is_zeronet(oldflp->fl4_src)) + fl4.flowi4_oif = oldflp4->flowi4_oif; + fl4.flowi4_iif = net->loopback_dev->ifindex; + fl4.flowi4_mark = oldflp4->flowi4_mark; + fl4.daddr = oldflp4->daddr; + fl4.saddr = oldflp4->saddr; + fl4.flowi4_tos = tos & IPTOS_RT_MASK; + fl4.flowi4_scope = ((tos & RTO_ONLINK) ? + RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); + + rcu_read_lock(); + if (oldflp4->saddr) { + rth = ERR_PTR(-EINVAL); + if (ipv4_is_multicast(oldflp4->saddr) || + ipv4_is_lbcast(oldflp4->saddr) || + ipv4_is_zeronet(oldflp4->saddr)) goto out; /* I removed check for oif == dev_out->oif here. @@ -2520,11 +2489,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, of another iface. --ANK */ - if (oldflp->oif == 0 && - (ipv4_is_multicast(oldflp->fl4_dst) || - ipv4_is_lbcast(oldflp->fl4_dst))) { + if (oldflp4->flowi4_oif == 0 && + (ipv4_is_multicast(oldflp4->daddr) || + ipv4_is_lbcast(oldflp4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = __ip_dev_find(net, oldflp->fl4_src, false); + dev_out = __ip_dev_find(net, oldflp4->saddr, false); if (dev_out == NULL) goto out; @@ -2543,60 +2512,60 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, Luckily, this hack is good workaround. */ - fl.oif = dev_out->ifindex; + fl4.flowi4_oif = dev_out->ifindex; goto make_route; } - if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { + if (!(oldflp4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - if (!__ip_dev_find(net, oldflp->fl4_src, false)) + if (!__ip_dev_find(net, oldflp4->saddr, false)) goto out; } } - if (oldflp->oif) { - dev_out = dev_get_by_index_rcu(net, oldflp->oif); - err = -ENODEV; + if (oldflp4->flowi4_oif) { + dev_out = dev_get_by_index_rcu(net, oldflp4->flowi4_oif); + rth = ERR_PTR(-ENODEV); if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { - err = -ENETUNREACH; + rth = ERR_PTR(-ENETUNREACH); goto out; } - if (ipv4_is_local_multicast(oldflp->fl4_dst) || - ipv4_is_lbcast(oldflp->fl4_dst)) { - if (!fl.fl4_src) - fl.fl4_src = inet_select_addr(dev_out, 0, - RT_SCOPE_LINK); + if (ipv4_is_local_multicast(oldflp4->daddr) || + ipv4_is_lbcast(oldflp4->daddr)) { + if (!fl4.saddr) + fl4.saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); goto make_route; } - if (!fl.fl4_src) { - if (ipv4_is_multicast(oldflp->fl4_dst)) - fl.fl4_src = inet_select_addr(dev_out, 0, - fl.fl4_scope); - else if (!oldflp->fl4_dst) - fl.fl4_src = inet_select_addr(dev_out, 0, - RT_SCOPE_HOST); + if (!fl4.saddr) { + if (ipv4_is_multicast(oldflp4->daddr)) + fl4.saddr = inet_select_addr(dev_out, 0, + fl4.flowi4_scope); + else if (!oldflp4->daddr) + fl4.saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_HOST); } } - if (!fl.fl4_dst) { - fl.fl4_dst = fl.fl4_src; - if (!fl.fl4_dst) - fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); + if (!fl4.daddr) { + fl4.daddr = fl4.saddr; + if (!fl4.daddr) + fl4.daddr = fl4.saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; - fl.oif = net->loopback_dev->ifindex; + fl4.flowi4_oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } - if (fib_lookup(net, &fl, &res)) { + if (fib_lookup(net, &fl4, &res)) { res.fi = NULL; - if (oldflp->oif) { + if (oldflp4->flowi4_oif) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -2615,90 +2584,93 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, likely IPv6, but we do not. */ - if (fl.fl4_src == 0) - fl.fl4_src = inet_select_addr(dev_out, 0, - RT_SCOPE_LINK); + if (fl4.saddr == 0) + fl4.saddr = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); res.type = RTN_UNICAST; goto make_route; } - err = -ENETUNREACH; + rth = ERR_PTR(-ENETUNREACH); goto out; } if (res.type == RTN_LOCAL) { - if (!fl.fl4_src) { + if (!fl4.saddr) { if (res.fi->fib_prefsrc) - fl.fl4_src = res.fi->fib_prefsrc; + fl4.saddr = res.fi->fib_prefsrc; else - fl.fl4_src = fl.fl4_dst; + fl4.saddr = fl4.daddr; } dev_out = net->loopback_dev; - fl.oif = dev_out->ifindex; + fl4.flowi4_oif = dev_out->ifindex; res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && fl.oif == 0) - fib_select_multipath(&fl, &res); + if (res.fi->fib_nhs > 1 && fl4.flowi4_oif == 0) + fib_select_multipath(&res); else #endif - if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) - fib_select_default(net, &fl, &res); + if (!res.prefixlen && res.type == RTN_UNICAST && !fl4.flowi4_oif) + fib_select_default(&res); - if (!fl.fl4_src) - fl.fl4_src = FIB_RES_PREFSRC(res); + if (!fl4.saddr) + fl4.saddr = FIB_RES_PREFSRC(res); dev_out = FIB_RES_DEV(res); - fl.oif = dev_out->ifindex; + fl4.flowi4_oif = dev_out->ifindex; make_route: - err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + rth = __mkroute_output(&res, &fl4, oldflp4, dev_out, flags); + if (!IS_ERR(rth)) { + unsigned int hash; -out: return err; + hash = rt_hash(oldflp4->daddr, oldflp4->saddr, oldflp4->flowi4_oif, + rt_genid(dev_net(dev_out))); + rth = rt_intern_hash(hash, rth, NULL, oldflp4->flowi4_oif); + } + +out: + rcu_read_unlock(); + return rth; } -int __ip_route_output_key(struct net *net, struct rtable **rp, - const struct flowi *flp) +struct rtable *__ip_route_output_key(struct net *net, const struct flowi4 *flp4) { - unsigned int hash; - int res; struct rtable *rth; + unsigned int hash; if (!rt_caching(net)) goto slow_output; - hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); + hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net)); rcu_read_lock_bh(); for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; rth = rcu_dereference_bh(rth->dst.rt_next)) { - if (rth->fl.fl4_dst == flp->fl4_dst && - rth->fl.fl4_src == flp->fl4_src && + if (rth->rt_key_dst == flp4->daddr && + rth->rt_key_src == flp4->saddr && rt_is_output_route(rth) && - rth->fl.oif == flp->oif && - rth->fl.mark == flp->mark && - !((rth->fl.fl4_tos ^ flp->fl4_tos) & + rth->rt_oif == flp4->flowi4_oif && + rth->rt_mark == flp4->flowi4_mark && + !((rth->rt_tos ^ flp4->flowi4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->dst.dev), net) && !rt_is_expired(rth)) { dst_use(&rth->dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); - *rp = rth; - return 0; + return rth; } RT_CACHE_STAT_INC(out_hlist_search); } rcu_read_unlock_bh(); slow_output: - rcu_read_lock(); - res = ip_route_output_slow(net, rp, flp); - rcu_read_unlock(); - return res; + return ip_route_output_slow(net, flp4); } EXPORT_SYMBOL_GPL(__ip_route_output_key); @@ -2707,6 +2679,11 @@ static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 coo return NULL; } +static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst) +{ + return 0; +} + static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) { } @@ -2716,20 +2693,19 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .protocol = cpu_to_be16(ETH_P_IP), .destroy = ipv4_dst_destroy, .check = ipv4_blackhole_dst_check, + .default_mtu = ipv4_blackhole_default_mtu, + .default_advmss = ipv4_default_advmss, .update_pmtu = ipv4_rt_blackhole_update_pmtu, }; - -static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) +struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = *rp; - struct rtable *rt = (struct rtable *) - dst_alloc(&ipv4_dst_blackhole_ops); + struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, 1); + struct rtable *ort = (struct rtable *) dst_orig; if (rt) { struct dst_entry *new = &rt->dst; - atomic_set(&new->__refcnt, 1); new->__use = 1; new->input = dst_discard; new->output = dst_discard; @@ -2739,7 +2715,12 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi if (new->dev) dev_hold(new->dev); - rt->fl = ort->fl; + rt->rt_key_dst = ort->rt_key_dst; + rt->rt_key_src = ort->rt_key_src; + rt->rt_tos = ort->rt_tos; + rt->rt_iif = ort->rt_iif; + rt->rt_oif = ort->rt_oif; + rt->rt_mark = ort->rt_mark; rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; @@ -2752,46 +2733,40 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi rt->peer = ort->peer; if (rt->peer) atomic_inc(&rt->peer->refcnt); + rt->fi = ort->fi; + if (rt->fi) + atomic_inc(&rt->fi->fib_clntref); dst_free(new); } - dst_release(&(*rp)->dst); - *rp = rt; - return rt ? 0 : -ENOMEM; + dst_release(dst_orig); + + return rt ? &rt->dst : ERR_PTR(-ENOMEM); } -int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, - struct sock *sk, int flags) +struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, + struct sock *sk) { - int err; + struct rtable *rt = __ip_route_output_key(net, flp4); - if ((err = __ip_route_output_key(net, rp, flp)) != 0) - return err; + if (IS_ERR(rt)) + return rt; - if (flp->proto) { - if (!flp->fl4_src) - flp->fl4_src = (*rp)->rt_src; - if (!flp->fl4_dst) - flp->fl4_dst = (*rp)->rt_dst; - err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, - flags ? XFRM_LOOKUP_WAIT : 0); - if (err == -EREMOTE) - err = ipv4_dst_blackhole(net, rp, flp); - - return err; + if (flp4->flowi4_proto) { + if (!flp4->saddr) + flp4->saddr = rt->rt_src; + if (!flp4->daddr) + flp4->daddr = rt->rt_dst; + rt = (struct rtable *) xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0); } - return 0; + return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); -int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) -{ - return ip_route_output_flow(net, rp, flp, NULL, 0); -} -EXPORT_SYMBOL(ip_route_output_key); - static int rt_fill_info(struct net *net, struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) @@ -2810,7 +2785,7 @@ static int rt_fill_info(struct net *net, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = rt->fl.fl4_tos; + r->rtm_tos = rt->rt_tos; r->rtm_table = RT_TABLE_MAIN; NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); r->rtm_type = rt->rt_type; @@ -2822,19 +2797,19 @@ static int rt_fill_info(struct net *net, NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); - if (rt->fl.fl4_src) { + if (rt->rt_key_src) { r->rtm_src_len = 32; - NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); + NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); } if (rt->dst.dev) NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid) NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif if (rt_is_input_route(rt)) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); - else if (rt->rt_src != rt->fl.fl4_src) + else if (rt->rt_src != rt->rt_key_src) NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); if (rt->rt_dst != rt->rt_gateway) @@ -2843,11 +2818,12 @@ static int rt_fill_info(struct net *net, if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) goto nla_put_failure; - if (rt->fl.mark) - NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark); + if (rt->rt_mark) + NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); error = rt->dst.error; - expires = rt->dst.expires ? rt->dst.expires - jiffies : 0; + expires = (rt->peer && rt->peer->pmtu_expires) ? + rt->peer->pmtu_expires - jiffies : 0; if (rt->peer) { inet_peer_refcheck(rt->peer); id = atomic_read(&rt->peer->ip_id_count) & 0xffff; @@ -2877,7 +2853,7 @@ static int rt_fill_info(struct net *net, } } else #endif - NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); + NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); } if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, @@ -2951,14 +2927,18 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { - struct flowi fl = { - .fl4_dst = dst, - .fl4_src = src, - .fl4_tos = rtm->rtm_tos, - .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, - .mark = mark, + struct flowi4 fl4 = { + .daddr = dst, + .saddr = src, + .flowi4_tos = rtm->rtm_tos, + .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0, + .flowi4_mark = mark, }; - err = ip_route_output_key(net, &rt, &fl); + rt = ip_route_output_key(net, &fl4); + + err = 0; + if (IS_ERR(rt)) + err = PTR_ERR(rt); } if (err) @@ -3249,9 +3229,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = { }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; -#endif /* CONFIG_NET_CLS_ROUTE */ +#endif /* CONFIG_IP_ROUTE_CLASSID */ static __initdata unsigned long rhash_entries; static int __init set_rhash_entries(char *str) @@ -3267,7 +3247,7 @@ int __init ip_rt_init(void) { int rc = 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); @@ -3304,14 +3284,6 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - /* All the timers, started at system startup tend - to synchronize. Perturb it a bit. - */ - INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); - expires_ljiffies = jiffies; - schedule_delayed_work(&expires_work, - net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 47519205a014..8b44c6d2a79b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -345,17 +345,20 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * no easy way to do this. */ { - struct flowi fl = { .mark = sk->sk_mark, - .fl4_dst = ((opt && opt->srr) ? - opt->faddr : ireq->rmt_addr), - .fl4_src = ireq->loc_addr, - .fl4_tos = RT_CONN_FLAGS(sk), - .proto = IPPROTO_TCP, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = th->dest, - .fl_ip_dport = th->source }; - security_req_classify_flow(req, &fl); - if (ip_route_output_key(sock_net(sk), &rt, &fl)) { + struct flowi4 fl4 = { + .flowi4_mark = sk->sk_mark, + .daddr = ((opt && opt->srr) ? + opt->faddr : ireq->rmt_addr), + .saddr = ireq->loc_addr, + .flowi4_tos = RT_CONN_FLAGS(sk), + .flowi4_proto = IPPROTO_TCP, + .flowi4_flags = inet_sk_flowi_flags(sk), + .fl4_sport = th->dest, + .fl4_dport = th->source, + }; + security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) { reqsk_free(req); goto out; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6c11eece262c..b22d45010545 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -505,6 +505,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) else answ = tp->write_seq - tp->snd_una; break; + case SIOCOUTQNSD: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_nxt; + break; default: return -ENOIOCTLCMD; } @@ -873,9 +882,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, flags); lock_sock(sk); - TCP_CHECK_TIMER(sk); res = do_tcp_sendpages(sk, &page, offset, size, flags); - TCP_CHECK_TIMER(sk); release_sock(sk); return res; } @@ -916,7 +923,6 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, long timeo; lock_sock(sk); - TCP_CHECK_TIMER(sk); flags = msg->msg_flags; timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); @@ -1104,7 +1110,6 @@ wait_for_memory: out: if (copied) tcp_push(sk, flags, mss_now, tp->nonagle); - TCP_CHECK_TIMER(sk); release_sock(sk); return copied; @@ -1123,7 +1128,6 @@ do_error: goto out; out_err: err = sk_stream_error(sk, flags, err); - TCP_CHECK_TIMER(sk); release_sock(sk); return err; } @@ -1415,8 +1419,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - TCP_CHECK_TIMER(sk); - err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; @@ -1767,12 +1769,10 @@ skip_copy: /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); - TCP_CHECK_TIMER(sk); release_sock(sk); return copied; out: - TCP_CHECK_TIMER(sk); release_sock(sk); return err; @@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif -struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct tcphdr *th; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 3b53fd1af23f..6187eb4d1dcf 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -209,7 +209,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) } -static struct tcp_congestion_ops bictcp = { +static struct tcp_congestion_ops bictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 71d5f2f29fa6..34340c9c95fa 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -39,7 +39,7 @@ /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 -#define HYSTART_DELAY_MIN (2U<<3) +#define HYSTART_DELAY_MIN (4U<<3) #define HYSTART_DELAY_MAX (16U<<3) #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) @@ -52,6 +52,7 @@ static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; +static int hystart_ack_delta __read_mostly = 2; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; @@ -75,6 +76,8 @@ MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); +module_param(hystart_ack_delta, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); /* BIC TCP Parameters */ struct bictcp { @@ -85,7 +88,7 @@ struct bictcp { u32 last_time; /* time when updated last_cwnd */ u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ - u32 delay_min; /* min delay */ + u32 delay_min; /* min delay (msec << 3) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ @@ -95,7 +98,7 @@ struct bictcp { u8 found; /* the exit point is found? */ u32 round_start; /* beginning of each round */ u32 end_seq; /* end_seq of the round */ - u32 last_jiffies; /* last time when the ACK spacing is close */ + u32 last_ack; /* last time when the ACK spacing is close */ u32 curr_rtt; /* the minimum rtt of current round */ }; @@ -116,12 +119,21 @@ static inline void bictcp_reset(struct bictcp *ca) ca->found = 0; } +static inline u32 bictcp_clock(void) +{ +#if HZ < 1000 + return ktime_to_ms(ktime_get_real()); +#else + return jiffies_to_msecs(jiffies); +#endif +} + static inline void bictcp_hystart_reset(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - ca->round_start = ca->last_jiffies = jiffies; + ca->round_start = ca->last_ack = bictcp_clock(); ca->end_seq = tp->snd_nxt; ca->curr_rtt = 0; ca->sample_cnt = 0; @@ -236,8 +248,8 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) */ /* change the unit from HZ to bictcp_HZ */ - t = ((tcp_time_stamp + (ca->delay_min>>3) - ca->epoch_start) - << BICTCP_HZ) / HZ; + t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) + - ca->epoch_start) << BICTCP_HZ) / HZ; if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; @@ -258,6 +270,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 100 * cwnd; /* very small increment*/ } + /* + * The initial growth of cubic function may be too conservative + * when the available bandwidth is still unknown. + */ + if (ca->loss_cwnd == 0 && ca->cnt > 20) + ca->cnt = 20; /* increase cwnd 5% per RTT */ + /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; @@ -339,12 +358,12 @@ static void hystart_update(struct sock *sk, u32 delay) struct bictcp *ca = inet_csk_ca(sk); if (!(ca->found & hystart_detect)) { - u32 curr_jiffies = jiffies; + u32 now = bictcp_clock(); /* first detection parameter - ack-train detection */ - if (curr_jiffies - ca->last_jiffies <= msecs_to_jiffies(2)) { - ca->last_jiffies = curr_jiffies; - if (curr_jiffies - ca->round_start >= ca->delay_min>>4) + if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { + ca->last_ack = now; + if ((s32)(now - ca->round_start) > ca->delay_min >> 4) ca->found |= HYSTART_ACK_TRAIN; } @@ -391,7 +410,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; - delay = usecs_to_jiffies(rtt_us) << 3; + delay = (rtt_us << 3) / USEC_PER_MSEC; if (delay == 0) delay = 1; @@ -405,7 +424,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) hystart_update(sk, delay); } -static struct tcp_congestion_ops cubictcp = { +static struct tcp_congestion_ops cubictcp __read_mostly = { .init = bictcp_init, .ssthresh = bictcp_recalc_ssthresh, .cong_avoid = bictcp_cong_avoid, @@ -447,6 +466,10 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); + /* hystart needs ms clock resolution */ + if (hystart && HZ < 1000) + cubictcp.flags |= TCP_CONG_RTT_STAMP; + return tcp_register_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 8b6caaf75bb9..30f27f6b3655 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -158,7 +158,7 @@ static u32 hstcp_ssthresh(struct sock *sk) } -static struct tcp_congestion_ops tcp_highspeed = { +static struct tcp_congestion_ops tcp_highspeed __read_mostly = { .init = hstcp_init, .ssthresh = hstcp_ssthresh, .cong_avoid = hstcp_cong_avoid, diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 7c94a4955416..c1a8175361e8 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -284,7 +284,7 @@ static void htcp_state(struct sock *sk, u8 new_state) } } -static struct tcp_congestion_ops htcp = { +static struct tcp_congestion_ops htcp __read_mostly = { .init = htcp_init, .ssthresh = htcp_recalc_ssthresh, .cong_avoid = htcp_cong_avoid, diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 377bc9349371..fe3ecf484b44 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -162,7 +162,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); } -static struct tcp_congestion_ops tcp_hybla = { +static struct tcp_congestion_ops tcp_hybla __read_mostly = { .init = hybla_init, .ssthresh = tcp_reno_ssthresh, .min_cwnd = tcp_reno_min_cwnd, diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 00ca688d8964..813b43a76fec 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -322,7 +322,7 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, } } -static struct tcp_congestion_ops tcp_illinois = { +static struct tcp_congestion_ops tcp_illinois __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_illinois_init, .ssthresh = tcp_illinois_ssthresh, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2549b29b062d..da782e7ab16d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -817,7 +817,7 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) - cwnd = rfc3390_bytes_to_packets(tp->mss_cache); + cwnd = TCP_INIT_CWND; return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } @@ -1222,7 +1222,7 @@ static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && + if (dup_sack && tp->undo_marker && tp->undo_retrans && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1299,7 +1299,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + if (tp->undo_marker && tp->undo_retrans && + after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) state->reord = min(fack_count, state->reord); @@ -3349,7 +3350,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, net_invalid_timestamp())) rtt_us = ktime_us_delta(ktime_get_real(), last_ackt); - else if (ca_seq_rtt > 0) + else if (ca_seq_rtt >= 0) rtt_us = jiffies_to_usecs(ca_seq_rtt); } @@ -4399,7 +4400,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { tp->ucopy.len -= chunk; tp->copied_seq += chunk; - eaten = (chunk == skb->len && !th->fin); + eaten = (chunk == skb->len); tcp_rcv_space_adjust(sk); } local_bh_disable(); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 856f68466d49..f7e6c2c2d2bb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -149,9 +149,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + __be16 orig_sport, orig_dport; struct rtable *rt; __be32 daddr, nexthop; - int tmp; int err; if (addr_len < sizeof(struct sockaddr_in)) @@ -167,14 +167,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) nexthop = inet->opt->faddr; } - tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, - RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, - IPPROTO_TCP, - inet->inet_sport, usin->sin_port, sk, 1); - if (tmp < 0) { - if (tmp == -ENETUNREACH) + orig_sport = inet->inet_sport; + orig_dport = usin->sin_port; + rt = ip_route_connect(nexthop, inet->inet_saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_TCP, + orig_sport, orig_dport, sk, true); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); - return tmp; + return err; } if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { @@ -233,11 +236,14 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; - err = ip_route_newports(&rt, IPPROTO_TCP, - inet->inet_sport, inet->inet_dport, sk); - if (err) + rt = ip_route_newports(rt, IPPROTO_TCP, + orig_sport, orig_dport, + inet->inet_sport, inet->inet_dport, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; goto failure; - + } /* OK, now commit destination to socket. */ sk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(sk, &rt->dst); @@ -1341,7 +1347,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && - peer->daddr.a4 == saddr) { + peer->daddr.addr.a4 == saddr) { inet_peer_refcheck(peer); if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > @@ -1556,12 +1562,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ sock_rps_save_rxhash(sk, skb->rxhash); - TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } - TCP_CHECK_TIMER(sk); return 0; } @@ -1583,13 +1587,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) } else sock_rps_save_rxhash(sk, skb->rxhash); - - TCP_CHECK_TIMER(sk); if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; } - TCP_CHECK_TIMER(sk); return 0; reset: @@ -1994,7 +1995,6 @@ static void *listening_get_next(struct seq_file *seq, void *cur) } req = req->dl_next; } - st->offset = 0; if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries) break; get_req: diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index de870377fbba..656d431c99ad 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -313,7 +313,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) lp->last_drop = tcp_time_stamp; } -static struct tcp_congestion_ops tcp_lp = { +static struct tcp_congestion_ops tcp_lp __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_lp_init, .ssthresh = tcp_reno_ssthresh, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 406f320336e6..dfa5beb0c1c8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2162,7 +2162,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (!tp->retrans_stamp) tp->retrans_stamp = TCP_SKB_CB(skb)->when; - tp->undo_retrans++; + tp->undo_retrans += tcp_skb_pcount(skb); /* snd_nxt is stored to detect loss of retransmitted segment, * see tcp_input.c tcp_sacktag_write_queue(). diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index a76513779e2b..8ce55b8aaec8 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) } -static struct tcp_congestion_ops tcp_scalable = { +static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .cong_avoid = tcp_scalable_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 74a6aa003657..ecd44b0c45f1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -259,7 +259,6 @@ static void tcp_delack_timer(unsigned long data) tcp_send_ack(sk); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); } - TCP_CHECK_TIMER(sk); out: if (tcp_memory_pressure) @@ -481,7 +480,6 @@ static void tcp_write_timer(unsigned long data) tcp_probe_timer(sk); break; } - TCP_CHECK_TIMER(sk); out: sk_mem_reclaim(sk); @@ -589,7 +587,6 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp) - elapsed; } - TCP_CHECK_TIMER(sk); sk_mem_reclaim(sk); resched: diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index c6743eec9b7d..80fa2bfd7ede 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -304,7 +304,7 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(tcp_vegas_get_info); -static struct tcp_congestion_ops tcp_vegas = { +static struct tcp_congestion_ops tcp_vegas __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_vegas_init, .ssthresh = tcp_reno_ssthresh, diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 38bc0b52d745..ac43cd747bce 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -201,7 +201,7 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } -static struct tcp_congestion_ops tcp_veno = { +static struct tcp_congestion_ops tcp_veno __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index a534dda5456e..1b91bf48e277 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -272,7 +272,7 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, } -static struct tcp_congestion_ops tcp_westwood = { +static struct tcp_congestion_ops tcp_westwood __read_mostly = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index a0f240358892..dc7f43179c9a 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -225,7 +225,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { return tp->snd_cwnd - reduction; } -static struct tcp_congestion_ops tcp_yeah = { +static struct tcp_congestion_ops tcp_yeah __read_mostly = { .flags = TCP_CONG_RTT_STAMP, .init = tcp_yeah_init, .ssthresh = tcp_yeah_ssthresh, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8157b17959ee..588f47af5faf 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -663,75 +663,72 @@ void udp_flush_pending_frames(struct sock *sk) EXPORT_SYMBOL(udp_flush_pending_frames); /** - * udp4_hwcsum_outgoing - handle outgoing HW checksumming - * @sk: socket we are sending on + * udp4_hwcsum - handle outgoing HW checksumming * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) + * @src: source IP address + * @dst: destination IP address */ -static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, int len) +static void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { - unsigned int offset; struct udphdr *uh = udp_hdr(skb); + struct sk_buff *frags = skb_shinfo(skb)->frag_list; + int offset = skb_transport_offset(skb); + int len = skb->len - offset; + int hlen = len; __wsum csum = 0; - if (skb_queue_len(&sk->sk_write_queue) == 1) { + if (!frags) { /* * Only one fragment on the socket. */ skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); - uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); + uh->check = ~csum_tcpudp_magic(src, dst, len, + IPPROTO_UDP, 0); } else { /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ - offset = skb_transport_offset(skb); - skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + do { + csum = csum_add(csum, frags->csum); + hlen -= frags->len; + } while ((frags = frags->next)); + csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; - skb_queue_walk(&sk->sk_write_queue, skb) { - csum = csum_add(csum, skb->csum); - } - uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } } -/* - * Push out all pending data as one UDP datagram. Socket is locked. - */ -static int udp_push_pending_frames(struct sock *sk) +static int udp_send_skb(struct sk_buff *skb, __be32 daddr, __be32 dport) { - struct udp_sock *up = udp_sk(sk); + struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); - struct flowi *fl = &inet->cork.fl; - struct sk_buff *skb; struct udphdr *uh; + struct rtable *rt = (struct rtable *)skb_dst(skb); int err = 0; int is_udplite = IS_UDPLITE(sk); + int offset = skb_transport_offset(skb); + int len = skb->len - offset; __wsum csum = 0; - /* Grab the skbuff where UDP header space exists. */ - if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) - goto out; - /* * Create a UDP header */ uh = udp_hdr(skb); - uh->source = fl->fl_ip_sport; - uh->dest = fl->fl_ip_dport; - uh->len = htons(up->len); + uh->source = inet->inet_sport; + uh->dest = dport; + uh->len = htons(len); uh->check = 0; if (is_udplite) /* UDP-Lite */ - csum = udplite_csum_outgoing(sk, skb); + csum = udplite_csum(skb); else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */ @@ -740,20 +737,20 @@ static int udp_push_pending_frames(struct sock *sk) } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ - udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len); + udp4_hwcsum(skb, rt->rt_src, daddr); goto send; - } else /* `normal' UDP */ - csum = udp_csum_outgoing(sk, skb); + } else + csum = udp_csum(skb); /* add protocol-dependent pseudo-header */ - uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, + uh->check = csum_tcpudp_magic(rt->rt_src, daddr, len, sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: - err = ip_push_pending_frames(sk); + err = ip_send_skb(skb); if (err) { if (err == -ENOBUFS && !inet->recverr) { UDP_INC_STATS_USER(sock_net(sk), @@ -763,6 +760,26 @@ send: } else UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_OUTDATAGRAMS, is_udplite); + return err; +} + +/* + * Push out all pending data as one UDP datagram. Socket is locked. + */ +static int udp_push_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct flowi4 *fl4 = &inet->cork.fl.u.ip4; + struct sk_buff *skb; + int err = 0; + + skb = ip_finish_skb(sk); + if (!skb) + goto out; + + err = udp_send_skb(skb, fl4->daddr, fl4->fl4_dport); + out: up->len = 0; up->pending = 0; @@ -774,6 +791,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); + struct flowi4 *fl4; int ulen = len; struct ipcm_cookie ipc; struct rtable *rt = NULL; @@ -785,6 +803,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int err, is_udplite = IS_UDPLITE(sk); int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + struct sk_buff *skb; if (len > 0xFFFF) return -EMSGSIZE; @@ -799,6 +818,8 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.opt = NULL; ipc.tx_flags = 0; + getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; + if (up->pending) { /* * There are pending frames. @@ -888,20 +909,25 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, rt = (struct rtable *)sk_dst_check(sk, 0); if (rt == NULL) { - struct flowi fl = { .oif = ipc.oif, - .mark = sk->sk_mark, - .fl4_dst = faddr, - .fl4_src = saddr, - .fl4_tos = tos, - .proto = sk->sk_protocol, - .flags = inet_sk_flowi_flags(sk), - .fl_ip_sport = inet->inet_sport, - .fl_ip_dport = dport }; + struct flowi4 fl4 = { + .flowi4_oif = ipc.oif, + .flowi4_mark = sk->sk_mark, + .daddr = faddr, + .saddr = saddr, + .flowi4_tos = tos, + .flowi4_proto = sk->sk_protocol, + .flowi4_flags = (inet_sk_flowi_flags(sk) | + FLOWI_FLAG_CAN_SLEEP), + .fl4_sport = inet->inet_sport, + .fl4_dport = dport, + }; struct net *net = sock_net(sk); - security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(net, &rt, &fl, sk, 1); - if (err) { + security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); + rt = ip_route_output_flow(net, &fl4, sk); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); goto out; @@ -923,6 +949,17 @@ back_from_confirm: if (!ipc.addr) daddr = ipc.addr = rt->rt_dst; + /* Lockless fast path for the non-corking case. */ + if (!corkreq) { + skb = ip_make_skb(sk, getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, &rt, + msg->msg_flags); + err = PTR_ERR(skb); + if (skb && !IS_ERR(skb)) + err = udp_send_skb(skb, daddr, dport); + goto out; + } + lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ @@ -936,15 +973,15 @@ back_from_confirm: /* * Now cork the socket to pend data. */ - inet->cork.fl.fl4_dst = daddr; - inet->cork.fl.fl_ip_dport = dport; - inet->cork.fl.fl4_src = saddr; - inet->cork.fl.fl_ip_sport = inet->inet_sport; + fl4 = &inet->cork.fl.u.ip4; + fl4->daddr = daddr; + fl4->saddr = saddr; + fl4->fl4_dport = dport; + fl4->fl4_sport = inet->inet_sport; up->pending = AF_INET; do_append_data: up->len += ulen; - getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; err = ip_append_data(sk, getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); @@ -2199,7 +2236,7 @@ int udp4_ufo_send_check(struct sk_buff *skb) return 0; } -struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) +struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index b057d40addec..13e0e7f659ff 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -19,25 +19,23 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, - xfrm_address_t *saddr, - xfrm_address_t *daddr) + const xfrm_address_t *saddr, + const xfrm_address_t *daddr) { - struct flowi fl = { - .fl4_dst = daddr->a4, - .fl4_tos = tos, + struct flowi4 fl4 = { + .daddr = daddr->a4, + .flowi4_tos = tos, }; - struct dst_entry *dst; struct rtable *rt; - int err; if (saddr) - fl.fl4_src = saddr->a4; + fl4.saddr = saddr->a4; + + rt = __ip_route_output_key(net, &fl4); + if (!IS_ERR(rt)) + return &rt->dst; - err = __ip_route_output_key(net, &rt, &fl); - dst = &rt->dst; - if (err) - dst = ERR_PTR(err); - return dst; + return ERR_CAST(rt); } static int xfrm4_get_saddr(struct net *net, @@ -56,9 +54,9 @@ static int xfrm4_get_saddr(struct net *net, return 0; } -static int xfrm4_get_tos(struct flowi *fl) +static int xfrm4_get_tos(const struct flowi *fl) { - return IPTOS_RT_MASK & fl->fl4_tos; /* Strip ECN bits */ + return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */ } static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, @@ -68,11 +66,17 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, } static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, - struct flowi *fl) + const struct flowi *fl) { struct rtable *rt = (struct rtable *)xdst->route; + const struct flowi4 *fl4 = &fl->u.ip4; - xdst->u.rt.fl = *fl; + rt->rt_key_dst = fl4->daddr; + rt->rt_key_src = fl4->saddr; + rt->rt_tos = fl4->flowi4_tos; + rt->rt_iif = fl4->flowi4_iif; + rt->rt_oif = fl4->flowi4_oif; + rt->rt_mark = fl4->flowi4_mark; xdst->u.dst.dev = dev; dev_hold(dev); @@ -99,9 +103,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) { struct iphdr *iph = ip_hdr(skb); u8 *xprth = skb_network_header(skb) + iph->ihl * 4; + struct flowi4 *fl4 = &fl->u.ip4; - memset(fl, 0, sizeof(struct flowi)); - fl->mark = skb->mark; + memset(fl4, 0, sizeof(struct flowi4)); + fl4->flowi4_mark = skb->mark; if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { switch (iph->protocol) { @@ -114,8 +119,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ports = (__be16 *)xprth; - fl->fl_ip_sport = ports[!!reverse]; - fl->fl_ip_dport = ports[!reverse]; + fl4->fl4_sport = ports[!!reverse]; + fl4->fl4_dport = ports[!reverse]; } break; @@ -123,8 +128,8 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 2 - skb->data)) { u8 *icmp = xprth; - fl->fl_icmp_type = icmp[0]; - fl->fl_icmp_code = icmp[1]; + fl4->fl4_icmp_type = icmp[0]; + fl4->fl4_icmp_code = icmp[1]; } break; @@ -132,7 +137,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 4 - skb->data)) { __be32 *ehdr = (__be32 *)xprth; - fl->fl_ipsec_spi = ehdr[0]; + fl4->fl4_ipsec_spi = ehdr[0]; } break; @@ -140,7 +145,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 8 - skb->data)) { __be32 *ah_hdr = (__be32*)xprth; - fl->fl_ipsec_spi = ah_hdr[1]; + fl4->fl4_ipsec_spi = ah_hdr[1]; } break; @@ -148,7 +153,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ipcomp_hdr = (__be16 *)xprth; - fl->fl_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); + fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); } break; @@ -160,20 +165,20 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) if (greflags[0] & GRE_KEY) { if (greflags[0] & GRE_CSUM) gre_hdr++; - fl->fl_gre_key = gre_hdr[1]; + fl4->fl4_gre_key = gre_hdr[1]; } } break; default: - fl->fl_ipsec_spi = 0; + fl4->fl4_ipsec_spi = 0; break; } } - fl->proto = iph->protocol; - fl->fl4_dst = reverse ? iph->saddr : iph->daddr; - fl->fl4_src = reverse ? iph->daddr : iph->saddr; - fl->fl4_tos = iph->tos; + fl4->flowi4_proto = iph->protocol; + fl4->daddr = reverse ? iph->saddr : iph->daddr; + fl4->saddr = reverse ? iph->daddr : iph->saddr; + fl4->flowi4_tos = iph->tos; } static inline int xfrm4_garbage_collect(struct dst_ops *ops) @@ -196,8 +201,11 @@ static void xfrm4_dst_destroy(struct dst_entry *dst) { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + dst_destroy_metrics_generic(dst); + if (likely(xdst->u.rt.peer)) inet_putpeer(xdst->u.rt.peer); + xfrm_dst_destroy(xdst); } @@ -215,6 +223,7 @@ static struct dst_ops xfrm4_dst_ops = { .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, + .cow_metrics = dst_cow_metrics_generic, .destroy = xfrm4_dst_destroy, .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, @@ -230,6 +239,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .get_tos = xfrm4_get_tos, .init_path = xfrm4_init_path, .fill_dst = xfrm4_fill_dst, + .blackhole_route = ipv4_blackhole_route, }; #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 47947624eccc..1717c64628d1 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -21,24 +21,26 @@ static int xfrm4_init_flags(struct xfrm_state *x) } static void -__xfrm4_init_tempsel(struct xfrm_selector *sel, struct flowi *fl) +__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { - sel->daddr.a4 = fl->fl4_dst; - sel->saddr.a4 = fl->fl4_src; - sel->dport = xfrm_flowi_dport(fl); + const struct flowi4 *fl4 = &fl->u.ip4; + + sel->daddr.a4 = fl4->daddr; + sel->saddr.a4 = fl4->saddr; + sel->dport = xfrm_flowi_dport(fl, &fl4->uli); sel->dport_mask = htons(0xffff); - sel->sport = xfrm_flowi_sport(fl); + sel->sport = xfrm_flowi_sport(fl, &fl4->uli); sel->sport_mask = htons(0xffff); sel->family = AF_INET; sel->prefixlen_d = 32; sel->prefixlen_s = 32; - sel->proto = fl->proto; - sel->ifindex = fl->oif; + sel->proto = fl4->flowi4_proto; + sel->ifindex = fl4->flowi4_oif; } static void -xfrm4_init_temprop(struct xfrm_state *x, struct xfrm_tmpl *tmpl, - xfrm_address_t *daddr, xfrm_address_t *saddr) +xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl, + const xfrm_address_t *daddr, const xfrm_address_t *saddr) { x->id = tmpl->id; if (x->id.daddr.a4 == 0) |