diff options
Diffstat (limited to 'net/ipv4')
51 files changed, 2110 insertions, 1791 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d2e49baaff63..64a9c0fdc4aa 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -716,8 +716,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, } EXPORT_SYMBOL(inet_getname); -int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size) +int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; @@ -728,7 +727,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->sendmsg(iocb, sk, msg, size); + return sk->sk_prot->sendmsg(sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); @@ -750,8 +749,8 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, } EXPORT_SYMBOL(inet_sendpage); -int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, - size_t size, int flags) +int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) { struct sock *sk = sock->sk; int addr_len = 0; @@ -759,7 +758,7 @@ int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, sock_rps_record_flow(sk); - err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; @@ -1675,7 +1674,7 @@ static int __init inet_init(void) struct list_head *r; int rc = -EINVAL; - BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); + sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); rc = proto_register(&tcp_prot, 1); if (rc) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 205e1472aa78..5f5c674e130a 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -122,6 +122,7 @@ * Interface to generic neighbour cache. */ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); +static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -149,18 +150,12 @@ static const struct neigh_ops arp_direct_ops = { .connected_output = neigh_direct_output, }; -static const struct neigh_ops arp_broken_ops = { - .family = AF_INET, - .solicit = arp_solicit, - .error_report = arp_error_report, - .output = neigh_compat_output, - .connected_output = neigh_compat_output, -}; - struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, + .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, + .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .id = "arp_cache", @@ -216,7 +211,12 @@ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { - return arp_hashfn(*(u32 *)pkey, dev, *hash_rnd); + return arp_hashfn(pkey, dev, hash_rnd); +} + +static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) +{ + return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) @@ -260,35 +260,6 @@ static int arp_constructor(struct neighbour *neigh) in old paradigm. */ -#if 1 - /* So... these "amateur" devices are hopeless. - The only thing, that I can say now: - It is very sad that we need to keep ugly obsolete - code to make them happy. - - They should be moved to more reasonable state, now - they use rebuild_header INSTEAD OF hard_start_xmit!!! - Besides that, they are sort of out of date - (a lot of redundant clones/copies, useless in 2.1), - I wonder why people believe that they work. - */ - switch (dev->type) { - default: - break; - case ARPHRD_ROSE: -#if IS_ENABLED(CONFIG_AX25) - case ARPHRD_AX25: -#if IS_ENABLED(CONFIG_NETROM) - case ARPHRD_NETROM: -#endif - neigh->ops = &arp_broken_ops; - neigh->output = neigh->ops->output; - return 0; -#else - break; -#endif - } -#endif if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); @@ -433,71 +404,6 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) return flag; } -/* OBSOLETE FUNCTIONS */ - -/* - * Find an arp mapping in the cache. If not found, post a request. - * - * It is very UGLY routine: it DOES NOT use skb->dst->neighbour, - * even if it exists. It is supposed that skb->dev was mangled - * by a virtual device (eql, shaper). Nobody but broken devices - * is allowed to use this function, it is scheduled to be removed. --ANK - */ - -static int arp_set_predefined(int addr_hint, unsigned char *haddr, - __be32 paddr, struct net_device *dev) -{ - switch (addr_hint) { - case RTN_LOCAL: - pr_debug("arp called for own IP address\n"); - memcpy(haddr, dev->dev_addr, dev->addr_len); - return 1; - case RTN_MULTICAST: - arp_mc_map(paddr, haddr, dev, 1); - return 1; - case RTN_BROADCAST: - memcpy(haddr, dev->broadcast, dev->addr_len); - return 1; - } - return 0; -} - - -int arp_find(unsigned char *haddr, struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - __be32 paddr; - struct neighbour *n; - - if (!skb_dst(skb)) { - pr_debug("arp_find is called with dst==NULL\n"); - kfree_skb(skb); - return 1; - } - - paddr = rt_nexthop(skb_rtable(skb), ip_hdr(skb)->daddr); - if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, - paddr, dev)) - return 0; - - n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); - - if (n) { - n->used = jiffies; - if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) { - neigh_ha_snapshot(haddr, n, dev); - neigh_release(n); - return 0; - } - neigh_release(n); - } else - kfree_skb(skb); - return 1; -} -EXPORT_SYMBOL(arp_find); - -/* END OF OBSOLETE FUNCTIONS */ - /* * Check if we can use proxy ARP for this path */ diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 3a8985c94581..c6473f365ad1 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -107,7 +107,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; -static u32 inet_addr_hash(struct net *net, __be32 addr) +static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = (__force u32) addr ^ net_hash_mix(net); @@ -548,6 +548,26 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, return NULL; } +static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) +{ + struct ip_mreqn mreq = { + .imr_multiaddr.s_addr = ifa->ifa_address, + .imr_ifindex = ifa->ifa_dev->dev->ifindex, + }; + int ret; + + ASSERT_RTNL(); + + lock_sock(sk); + if (join) + ret = ip_mc_join_group(sk, &mreq); + else + ret = ip_mc_leave_group(sk, &mreq); + release_sock(sk); + + return ret; +} + static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); @@ -573,7 +593,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && - ifa->ifa_local != nla_get_be32(tb[IFA_LOCAL])) + ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) continue; if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) @@ -581,9 +601,11 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) if (tb[IFA_ADDRESS] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || - !inet_ifa_match(nla_get_be32(tb[IFA_ADDRESS]), ifa))) + !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa))) continue; + if (ipv4_is_multicast(ifa->ifa_address)) + ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); return 0; } @@ -769,11 +791,11 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_dev = in_dev; - ifa->ifa_local = nla_get_be32(tb[IFA_LOCAL]); - ifa->ifa_address = nla_get_be32(tb[IFA_ADDRESS]); + ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); + ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); if (tb[IFA_BROADCAST]) - ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]); + ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); @@ -838,6 +860,15 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); + if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { + int ret = ip_mc_config(net->ipv4.mc_autojoin_sk, + true, ifa); + + if (ret < 0) { + inet_free_ifa(ifa); + return ret; + } + } return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid); } else { inet_free_ifa(ifa); @@ -1510,11 +1541,11 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, valid = INFINITY_LIFE_TIME; } if ((ifa->ifa_address && - nla_put_be32(skb, IFA_ADDRESS, ifa->ifa_address)) || + nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && - nla_put_be32(skb, IFA_LOCAL, ifa->ifa_local)) || + nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_broadcast && - nla_put_be32(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || + nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) || diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 57be71dd6a9e..718b0a16ea40 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -52,12 +52,12 @@ static int __net_init fib4_rules_init(struct net *net) { struct fib_table *local_table, *main_table; - local_table = fib_trie_table(RT_TABLE_LOCAL); - if (local_table == NULL) + main_table = fib_trie_table(RT_TABLE_MAIN, NULL); + if (main_table == NULL) return -ENOMEM; - main_table = fib_trie_table(RT_TABLE_MAIN); - if (main_table == NULL) + local_table = fib_trie_table(RT_TABLE_LOCAL, main_table); + if (local_table == NULL) goto fail; hlist_add_head_rcu(&local_table->tb_hlist, @@ -67,14 +67,14 @@ static int __net_init fib4_rules_init(struct net *net) return 0; fail: - fib_free_table(local_table); + fib_free_table(main_table); return -ENOMEM; } #else struct fib_table *fib_new_table(struct net *net, u32 id) { - struct fib_table *tb; + struct fib_table *tb, *alias = NULL; unsigned int h; if (id == 0) @@ -83,23 +83,23 @@ struct fib_table *fib_new_table(struct net *net, u32 id) if (tb) return tb; - tb = fib_trie_table(id); + if (id == RT_TABLE_LOCAL) + alias = fib_new_table(net, RT_TABLE_MAIN); + + tb = fib_trie_table(id, alias); if (!tb) return NULL; switch (id) { case RT_TABLE_LOCAL: - net->ipv4.fib_local = tb; + rcu_assign_pointer(net->ipv4.fib_local, tb); break; - case RT_TABLE_MAIN: - net->ipv4.fib_main = tb; + rcu_assign_pointer(net->ipv4.fib_main, tb); break; - case RT_TABLE_DEFAULT: - net->ipv4.fib_default = tb; + rcu_assign_pointer(net->ipv4.fib_default, tb); break; - default: break; } @@ -129,16 +129,62 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } #endif /* CONFIG_IP_MULTIPLE_TABLES */ +static void fib_replace_table(struct net *net, struct fib_table *old, + struct fib_table *new) +{ +#ifdef CONFIG_IP_MULTIPLE_TABLES + switch (new->tb_id) { + case RT_TABLE_LOCAL: + rcu_assign_pointer(net->ipv4.fib_local, new); + break; + case RT_TABLE_MAIN: + rcu_assign_pointer(net->ipv4.fib_main, new); + break; + case RT_TABLE_DEFAULT: + rcu_assign_pointer(net->ipv4.fib_default, new); + break; + default: + break; + } + +#endif + /* replace the old table in the hlist */ + hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist); +} + +int fib_unmerge(struct net *net) +{ + struct fib_table *old, *new; + + /* attempt to fetch local table if it has been allocated */ + old = fib_get_table(net, RT_TABLE_LOCAL); + if (!old) + return 0; + + new = fib_trie_unmerge(old); + if (!new) + return -ENOMEM; + + /* replace merged table with clean table */ + if (new != old) { + fib_replace_table(net, old, new); + fib_free_table(old); + } + + return 0; +} + static void fib_flush(struct net *net) { int flushed = 0; - struct fib_table *tb; - struct hlist_head *head; unsigned int h; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { - head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, head, tb_hlist) + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct hlist_node *tmp; + struct fib_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) flushed += fib_table_flush(tb); } @@ -146,6 +192,19 @@ static void fib_flush(struct net *net) rt_cache_flush(net); } +void fib_flush_external(struct net *net) +{ + struct fib_table *tb; + struct hlist_head *head; + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + head = &net->ipv4.fib_table_hash[h]; + hlist_for_each_entry(tb, head, tb_hlist) + fib_table_flush_external(tb); + } +} + /* * Find address type as if only "dev" was present in the system. If * on_dev is NULL then all interfaces are taken into consideration. @@ -665,10 +724,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[0]; s_e = cb->args[1]; + rcu_read_lock(); + for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv4.fib_table_hash[h]; - hlist_for_each_entry(tb, head, tb_hlist) { + hlist_for_each_entry_rcu(tb, head, tb_hlist) { if (e < s_e) goto next; if (dumped) @@ -682,6 +743,8 @@ next: } } out: + rcu_read_unlock(); + cb->args[1] = e; cb->args[0] = h; @@ -1111,23 +1174,29 @@ static void ip_fib_net_exit(struct net *net) { unsigned int i; + rtnl_lock(); + #ifdef CONFIG_IP_MULTIPLE_TABLES - fib4_rules_exit(net); + RCU_INIT_POINTER(net->ipv4.fib_local, NULL); + RCU_INIT_POINTER(net->ipv4.fib_main, NULL); + RCU_INIT_POINTER(net->ipv4.fib_default, NULL); #endif - rtnl_lock(); for (i = 0; i < FIB_TABLE_HASHSZ; i++) { - struct fib_table *tb; - struct hlist_head *head; + struct hlist_head *head = &net->ipv4.fib_table_hash[i]; struct hlist_node *tmp; + struct fib_table *tb; - head = &net->ipv4.fib_table_hash[i]; hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_del(&tb->tb_hlist); fib_table_flush(tb); fib_free_table(tb); } } + +#ifdef CONFIG_IP_MULTIPLE_TABLES + fib4_rules_exit(net); +#endif rtnl_unlock(); kfree(net->ipv4.fib_table_hash); } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 825981b1049a..c6211ed60b03 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -6,11 +6,13 @@ #include <net/ip_fib.h> struct fib_alias { - struct list_head fa_list; + struct hlist_node fa_list; struct fib_info *fa_info; u8 fa_tos; u8 fa_type; u8 fa_state; + u8 fa_slen; + u32 tb_id; struct rcu_head rcu; }; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index d3db718be51d..8162dd8e86d7 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -174,6 +174,11 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (frh->tos & ~IPTOS_TOS_MASK) goto errout; + /* split local/main if they are not already split */ + err = fib_unmerge(net); + if (err) + goto errout; + if (rule->table == RT_TABLE_UNSPEC) { if (rule->action == FR_ACT_TO_TBL) { struct fib_table *table; @@ -189,10 +194,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, } if (frh->src_len) - rule4->src = nla_get_be32(tb[FRA_SRC]); + rule4->src = nla_get_in_addr(tb[FRA_SRC]); if (frh->dst_len) - rule4->dst = nla_get_be32(tb[FRA_DST]); + rule4->dst = nla_get_in_addr(tb[FRA_DST]); #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) { @@ -209,21 +214,31 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; + fib_flush_external(rule->fr_net); + err = 0; errout: return err; } -static void fib4_rule_delete(struct fib_rule *rule) +static int fib4_rule_delete(struct fib_rule *rule) { struct net *net = rule->fr_net; -#ifdef CONFIG_IP_ROUTE_CLASSID - struct fib4_rule *rule4 = (struct fib4_rule *) rule; + int err; - if (rule4->tclassid) + /* split local/main if they are not already split */ + err = fib_unmerge(net); + if (err) + goto errout; + +#ifdef CONFIG_IP_ROUTE_CLASSID + if (((struct fib4_rule *)rule)->tclassid) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; + fib_flush_external(rule->fr_net); +errout: + return err; } static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, @@ -245,10 +260,10 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; #endif - if (frh->src_len && (rule4->src != nla_get_be32(tb[FRA_SRC]))) + if (frh->src_len && (rule4->src != nla_get_in_addr(tb[FRA_SRC]))) return 0; - if (frh->dst_len && (rule4->dst != nla_get_be32(tb[FRA_DST]))) + if (frh->dst_len && (rule4->dst != nla_get_in_addr(tb[FRA_DST]))) return 0; return 1; @@ -264,9 +279,9 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, frh->tos = rule4->tos; if ((rule4->dst_len && - nla_put_be32(skb, FRA_DST, rule4->dst)) || + nla_put_in_addr(skb, FRA_DST, rule4->dst)) || (rule4->src_len && - nla_put_be32(skb, FRA_SRC, rule4->src))) + nla_put_in_addr(skb, FRA_SRC, rule4->src))) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid && diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1e2090ea663e..eac5aec7772a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -213,7 +213,6 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - release_net(fi->fib_net); if (fi->fib_metrics != (u32 *) dst_default_metrics) kfree(fi->fib_metrics); kfree(fi); @@ -469,7 +468,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; + nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0; #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; @@ -524,7 +523,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla && nla_get_be32(nla) != nh->nh_gw) + if (nla && nla_get_in_addr(nla) != nh->nh_gw) return 1; #ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); @@ -814,7 +813,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } else fi->fib_metrics = (u32 *) dst_default_metrics; - fi->fib_net = hold_net(net); + fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; fi->fib_flags = cfg->fc_flags; @@ -1016,7 +1015,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && - nla_put_be32(skb, RTA_DST, dst)) + nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) @@ -1025,11 +1024,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (fi->fib_prefsrc && - nla_put_be32(skb, RTA_PREFSRC, fi->fib_prefsrc)) + nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc)) goto nla_put_failure; if (fi->fib_nhs == 1) { if (fi->fib_nh->nh_gw && - nla_put_be32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) + nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw)) goto nla_put_failure; if (fi->fib_nh->nh_oif && nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif)) @@ -1059,7 +1058,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, rtnh->rtnh_ifindex = nh->nh_oif; if (nh->nh_gw && - nla_put_be32(skb, RTA_GATEWAY, nh->nh_gw)) + nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw)) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid && @@ -1163,12 +1162,12 @@ int fib_sync_down_dev(struct net_device *dev, int force) void fib_select_default(struct fib_result *res) { struct fib_info *fi = NULL, *last_resort = NULL; - struct list_head *fa_head = res->fa_head; + struct hlist_head *fa_head = res->fa_head; struct fib_table *tb = res->table; int order = -1, last_idx = -1; struct fib_alias *fa; - list_for_each_entry_rcu(fa, fa_head, fa_list) { + hlist_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; if (next_fi->fib_scope != res->scope || diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3daf0224ff2e..2c7c299ee2b9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -79,6 +79,7 @@ #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/switchdev.h> #include "fib_lookup.h" #define MAX_STAT_DEPTH 32 @@ -88,38 +89,35 @@ typedef unsigned int t_key; -#define IS_TNODE(n) ((n)->bits) -#define IS_LEAF(n) (!(n)->bits) +#define IS_TRIE(n) ((n)->pos >= KEYLENGTH) +#define IS_TNODE(n) ((n)->bits) +#define IS_LEAF(n) (!(n)->bits) -#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos) - -struct tnode { +struct key_vector { t_key key; - unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char pos; /* 2log(KEYLENGTH) bits needed */ + unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char slen; - struct tnode __rcu *parent; - struct rcu_head rcu; union { - /* The fields in this struct are valid if bits > 0 (TNODE) */ - struct { - t_key empty_children; /* KEYLENGTH bits needed */ - t_key full_children; /* KEYLENGTH bits needed */ - struct tnode __rcu *child[0]; - }; - /* This list pointer if valid if bits == 0 (LEAF) */ - struct hlist_head list; + /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ + struct hlist_head leaf; + /* This array is valid if (pos | bits) > 0 (TNODE) */ + struct key_vector __rcu *tnode[0]; }; }; -struct leaf_info { - struct hlist_node hlist; - int plen; - u32 mask_plen; /* ntohl(inet_make_mask(plen)) */ - struct list_head falh; +struct tnode { struct rcu_head rcu; + t_key empty_children; /* KEYLENGTH bits needed */ + t_key full_children; /* KEYLENGTH bits needed */ + struct key_vector __rcu *parent; + struct key_vector kv[1]; +#define tn_bits kv[0].bits }; +#define TNODE_SIZE(n) offsetof(struct tnode, kv[0].tnode[n]) +#define LEAF_SIZE TNODE_SIZE(1) + #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; @@ -142,13 +140,13 @@ struct trie_stat { }; struct trie { - struct tnode __rcu *trie; + struct key_vector kv[1]; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; -static void resize(struct trie *t, struct tnode *tn); +static struct key_vector *resize(struct trie *t, struct key_vector *tn); static size_t tnode_free_size; /* @@ -161,41 +159,46 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; +static inline struct tnode *tn_info(struct key_vector *kv) +{ + return container_of(kv, struct tnode, kv[0]); +} + /* caller must hold RTNL */ -#define node_parent(n) rtnl_dereference((n)->parent) +#define node_parent(tn) rtnl_dereference(tn_info(tn)->parent) +#define get_child(tn, i) rtnl_dereference((tn)->tnode[i]) /* caller must hold RCU read lock or RTNL */ -#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent) +#define node_parent_rcu(tn) rcu_dereference_rtnl(tn_info(tn)->parent) +#define get_child_rcu(tn, i) rcu_dereference_rtnl((tn)->tnode[i]) /* wrapper for rcu_assign_pointer */ -static inline void node_set_parent(struct tnode *n, struct tnode *tp) +static inline void node_set_parent(struct key_vector *n, struct key_vector *tp) { if (n) - rcu_assign_pointer(n->parent, tp); + rcu_assign_pointer(tn_info(n)->parent, tp); } -#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p) +#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER(tn_info(n)->parent, p) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ -static inline unsigned long tnode_child_length(const struct tnode *tn) +static inline unsigned long child_length(const struct key_vector *tn) { return (1ul << tn->bits) & ~(1ul); } -/* caller must hold RTNL */ -static inline struct tnode *tnode_get_child(const struct tnode *tn, - unsigned long i) -{ - return rtnl_dereference(tn->child[i]); -} +#define get_cindex(key, kv) (((key) ^ (kv)->key) >> (kv)->pos) -/* caller must hold RCU read lock or RTNL */ -static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, - unsigned long i) +static inline unsigned long get_index(t_key key, struct key_vector *kv) { - return rcu_dereference_rtnl(tn->child[i]); + unsigned long index = key ^ kv->key; + + if ((BITS_PER_LONG <= KEYLENGTH) && (KEYLENGTH == kv->pos)) + return 0; + + return index >> kv->pos; } /* To understand this stuff, an understanding of keys and all their bits is @@ -274,106 +277,104 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) } #define TNODE_KMALLOC_MAX \ - ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *)) + ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *)) +#define TNODE_VMALLOC_MAX \ + ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) static void __node_free_rcu(struct rcu_head *head) { struct tnode *n = container_of(head, struct tnode, rcu); - if (IS_LEAF(n)) + if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); - else if (n->bits <= TNODE_KMALLOC_MAX) + else if (n->tn_bits <= TNODE_KMALLOC_MAX) kfree(n); else vfree(n); } -#define node_free(n) call_rcu(&n->rcu, __node_free_rcu) +#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) -static inline void free_leaf_info(struct leaf_info *leaf) +static struct tnode *tnode_alloc(int bits) { - kfree_rcu(leaf, rcu); -} + size_t size; + + /* verify bits is within bounds */ + if (bits > TNODE_VMALLOC_MAX) + return NULL; + + /* determine size and verify it is non-zero and didn't overflow */ + size = TNODE_SIZE(1ul << bits); -static struct tnode *tnode_alloc(size_t size) -{ if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); else return vzalloc(size); } -static inline void empty_child_inc(struct tnode *n) +static inline void empty_child_inc(struct key_vector *n) { - ++n->empty_children ? : ++n->full_children; + ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children; } -static inline void empty_child_dec(struct tnode *n) +static inline void empty_child_dec(struct key_vector *n) { - n->empty_children-- ? : n->full_children--; + tn_info(n)->empty_children-- ? : tn_info(n)->full_children--; } -static struct tnode *leaf_new(t_key key) +static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) { - struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); - if (l) { - l->parent = NULL; - /* set key and pos to reflect full key value - * any trailing zeros in the key should be ignored - * as the nodes are searched - */ - l->key = key; - l->slen = 0; - l->pos = 0; - /* set bits to 0 indicating we are not a tnode */ - l->bits = 0; + struct tnode *kv = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); + struct key_vector *l = kv->kv; - INIT_HLIST_HEAD(&l->list); - } - return l; -} + if (!kv) + return NULL; -static struct leaf_info *leaf_info_new(int plen) -{ - struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - if (li) { - li->plen = plen; - li->mask_plen = ntohl(inet_make_mask(plen)); - INIT_LIST_HEAD(&li->falh); - } - return li; + /* initialize key vector */ + l->key = key; + l->pos = 0; + l->bits = 0; + l->slen = fa->fa_slen; + + /* link leaf to fib alias */ + INIT_HLIST_HEAD(&l->leaf); + hlist_add_head(&fa->fa_list, &l->leaf); + + return l; } -static struct tnode *tnode_new(t_key key, int pos, int bits) +static struct key_vector *tnode_new(t_key key, int pos, int bits) { - size_t sz = offsetof(struct tnode, child[1ul << bits]); - struct tnode *tn = tnode_alloc(sz); + struct tnode *tnode = tnode_alloc(bits); unsigned int shift = pos + bits; + struct key_vector *tn = tnode->kv; /* verify bits and pos their msb bits clear and values are valid */ BUG_ON(!bits || (shift > KEYLENGTH)); - if (tn) { - tn->parent = NULL; - tn->slen = pos; - tn->pos = pos; - tn->bits = bits; - tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; - if (bits == KEYLENGTH) - tn->full_children = 1; - else - tn->empty_children = 1ul << bits; - } + pr_debug("AT %p s=%zu %zu\n", tnode, TNODE_SIZE(0), + sizeof(struct key_vector *) << bits); + + if (!tnode) + return NULL; + + if (bits == KEYLENGTH) + tnode->full_children = 1; + else + tnode->empty_children = 1ul << bits; + + tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; + tn->pos = pos; + tn->bits = bits; + tn->slen = pos; - pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct tnode *) << bits); return tn; } /* Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(const struct tnode *tn, const struct tnode *n) +static inline int tnode_full(struct key_vector *tn, struct key_vector *n) { return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } @@ -381,12 +382,13 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n) /* Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. */ -static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) +static void put_child(struct key_vector *tn, unsigned long i, + struct key_vector *n) { - struct tnode *chi = tnode_get_child(tn, i); + struct key_vector *chi = get_child(tn, i); int isfull, wasfull; - BUG_ON(i >= tnode_child_length(tn)); + BUG_ON(i >= child_length(tn)); /* update emptyChildren, overflow into fullChildren */ if (n == NULL && chi != NULL) @@ -399,23 +401,23 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) isfull = tnode_full(tn, n); if (wasfull && !isfull) - tn->full_children--; + tn_info(tn)->full_children--; else if (!wasfull && isfull) - tn->full_children++; + tn_info(tn)->full_children++; if (n && (tn->slen < n->slen)) tn->slen = n->slen; - rcu_assign_pointer(tn->child[i], n); + rcu_assign_pointer(tn->tnode[i], n); } -static void update_children(struct tnode *tn) +static void update_children(struct key_vector *tn) { unsigned long i; /* update all of the child parent pointers */ - for (i = tnode_child_length(tn); i;) { - struct tnode *inode = tnode_get_child(tn, --i); + for (i = child_length(tn); i;) { + struct key_vector *inode = get_child(tn, --i); if (!inode) continue; @@ -431,36 +433,37 @@ static void update_children(struct tnode *tn) } } -static inline void put_child_root(struct tnode *tp, struct trie *t, - t_key key, struct tnode *n) +static inline void put_child_root(struct key_vector *tp, t_key key, + struct key_vector *n) { - if (tp) - put_child(tp, get_index(key, tp), n); + if (IS_TRIE(tp)) + rcu_assign_pointer(tp->tnode[0], n); else - rcu_assign_pointer(t->trie, n); + put_child(tp, get_index(key, tp), n); } -static inline void tnode_free_init(struct tnode *tn) +static inline void tnode_free_init(struct key_vector *tn) { - tn->rcu.next = NULL; + tn_info(tn)->rcu.next = NULL; } -static inline void tnode_free_append(struct tnode *tn, struct tnode *n) +static inline void tnode_free_append(struct key_vector *tn, + struct key_vector *n) { - n->rcu.next = tn->rcu.next; - tn->rcu.next = &n->rcu; + tn_info(n)->rcu.next = tn_info(tn)->rcu.next; + tn_info(tn)->rcu.next = &tn_info(n)->rcu; } -static void tnode_free(struct tnode *tn) +static void tnode_free(struct key_vector *tn) { - struct callback_head *head = &tn->rcu; + struct callback_head *head = &tn_info(tn)->rcu; while (head) { head = head->next; - tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]); + tnode_free_size += TNODE_SIZE(1ul << tn->bits); node_free(tn); - tn = container_of(head, struct tnode, rcu); + tn = container_of(head, struct tnode, rcu)->kv; } if (tnode_free_size >= PAGE_SIZE * sync_pages) { @@ -469,14 +472,16 @@ static void tnode_free(struct tnode *tn) } } -static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn) +static struct key_vector *replace(struct trie *t, + struct key_vector *oldtnode, + struct key_vector *tn) { - struct tnode *tp = node_parent(oldtnode); + struct key_vector *tp = node_parent(oldtnode); unsigned long i; /* setup the parent pointer out of and back into this node */ NODE_INIT_PARENT(tn, tp); - put_child_root(tp, t, tn->key, tn); + put_child_root(tp, tn->key, tn); /* update all of the child parent pointers */ update_children(tn); @@ -485,18 +490,21 @@ static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn) tnode_free(oldtnode); /* resize children now that oldtnode is freed */ - for (i = tnode_child_length(tn); i;) { - struct tnode *inode = tnode_get_child(tn, --i); + for (i = child_length(tn); i;) { + struct key_vector *inode = get_child(tn, --i); /* resize child node */ if (tnode_full(tn, inode)) - resize(t, inode); + tn = resize(t, inode); } + + return tp; } -static int inflate(struct trie *t, struct tnode *oldtnode) +static struct key_vector *inflate(struct trie *t, + struct key_vector *oldtnode) { - struct tnode *tn; + struct key_vector *tn; unsigned long i; t_key m; @@ -504,7 +512,7 @@ static int inflate(struct trie *t, struct tnode *oldtnode) tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) - return -ENOMEM; + goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); @@ -514,9 +522,9 @@ static int inflate(struct trie *t, struct tnode *oldtnode) * point to existing tnodes and the links between our allocated * nodes. */ - for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) { - struct tnode *inode = tnode_get_child(oldtnode, --i); - struct tnode *node0, *node1; + for (i = child_length(oldtnode), m = 1u << tn->pos; i;) { + struct key_vector *inode = get_child(oldtnode, --i); + struct key_vector *node0, *node1; unsigned long j, k; /* An empty child */ @@ -534,8 +542,8 @@ static int inflate(struct trie *t, struct tnode *oldtnode) /* An internal node with two children */ if (inode->bits == 1) { - put_child(tn, 2 * i + 1, tnode_get_child(inode, 1)); - put_child(tn, 2 * i, tnode_get_child(inode, 0)); + put_child(tn, 2 * i + 1, get_child(inode, 1)); + put_child(tn, 2 * i, get_child(inode, 0)); continue; } @@ -564,11 +572,11 @@ static int inflate(struct trie *t, struct tnode *oldtnode) tnode_free_append(tn, node0); /* populate child pointers in new nodes */ - for (k = tnode_child_length(inode), j = k / 2; j;) { - put_child(node1, --j, tnode_get_child(inode, --k)); - put_child(node0, j, tnode_get_child(inode, j)); - put_child(node1, --j, tnode_get_child(inode, --k)); - put_child(node0, j, tnode_get_child(inode, j)); + for (k = child_length(inode), j = k / 2; j;) { + put_child(node1, --j, get_child(inode, --k)); + put_child(node0, j, get_child(inode, j)); + put_child(node1, --j, get_child(inode, --k)); + put_child(node0, j, get_child(inode, j)); } /* link new nodes to parent */ @@ -581,25 +589,25 @@ static int inflate(struct trie *t, struct tnode *oldtnode) } /* setup the parent pointers into and out of this node */ - replace(t, oldtnode, tn); - - return 0; + return replace(t, oldtnode, tn); nomem: /* all pointers should be clean so we are done */ tnode_free(tn); - return -ENOMEM; +notnode: + return NULL; } -static int halve(struct trie *t, struct tnode *oldtnode) +static struct key_vector *halve(struct trie *t, + struct key_vector *oldtnode) { - struct tnode *tn; + struct key_vector *tn; unsigned long i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) - return -ENOMEM; + goto notnode; /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); @@ -609,10 +617,10 @@ static int halve(struct trie *t, struct tnode *oldtnode) * point to existing tnodes and the links between our allocated * nodes. */ - for (i = tnode_child_length(oldtnode); i;) { - struct tnode *node1 = tnode_get_child(oldtnode, --i); - struct tnode *node0 = tnode_get_child(oldtnode, --i); - struct tnode *inode; + for (i = child_length(oldtnode); i;) { + struct key_vector *node1 = get_child(oldtnode, --i); + struct key_vector *node0 = get_child(oldtnode, --i); + struct key_vector *inode; /* At least one of the children is empty */ if (!node1 || !node0) { @@ -622,10 +630,8 @@ static int halve(struct trie *t, struct tnode *oldtnode) /* Two nonempty children */ inode = tnode_new(node0->key, oldtnode->pos, 1); - if (!inode) { - tnode_free(tn); - return -ENOMEM; - } + if (!inode) + goto nomem; tnode_free_append(tn, inode); /* initialize pointers out of node */ @@ -638,30 +644,36 @@ static int halve(struct trie *t, struct tnode *oldtnode) } /* setup the parent pointers into and out of this node */ - replace(t, oldtnode, tn); - - return 0; + return replace(t, oldtnode, tn); +nomem: + /* all pointers should be clean so we are done */ + tnode_free(tn); +notnode: + return NULL; } -static void collapse(struct trie *t, struct tnode *oldtnode) +static struct key_vector *collapse(struct trie *t, + struct key_vector *oldtnode) { - struct tnode *n, *tp; + struct key_vector *n, *tp; unsigned long i; /* scan the tnode looking for that one child that might still exist */ - for (n = NULL, i = tnode_child_length(oldtnode); !n && i;) - n = tnode_get_child(oldtnode, --i); + for (n = NULL, i = child_length(oldtnode); !n && i;) + n = get_child(oldtnode, --i); /* compress one level */ tp = node_parent(oldtnode); - put_child_root(tp, t, oldtnode->key, n); + put_child_root(tp, oldtnode->key, n); node_set_parent(n, tp); /* drop dead node */ node_free(oldtnode); + + return tp; } -static unsigned char update_suffix(struct tnode *tn) +static unsigned char update_suffix(struct key_vector *tn) { unsigned char slen = tn->pos; unsigned long stride, i; @@ -671,8 +683,8 @@ static unsigned char update_suffix(struct tnode *tn) * why we start with a stride of 2 since a stride of 1 would * represent the nodes with suffix length equal to tn->pos */ - for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) { - struct tnode *n = tnode_get_child(tn, i); + for (i = 0, stride = 0x2ul ; i < child_length(tn); i += stride) { + struct key_vector *n = get_child(tn, i); if (!n || (n->slen <= slen)) continue; @@ -704,12 +716,12 @@ static unsigned char update_suffix(struct tnode *tn) * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with - * tnode_child_length() and instead of multiplying by 2 (since the + * child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * - * The left-hand side may look a bit weird: tnode_child_length(tn) + * The left-hand side may look a bit weird: child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. @@ -719,10 +731,10 @@ static unsigned char update_suffix(struct tnode *tn) * A clearer way to write this would be: * * to_be_doubled = tn->full_children; - * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - + * not_to_be_doubled = child_length(tn) - tn->empty_children - * tn->full_children; * - * new_child_length = tnode_child_length(tn) * 2; + * new_child_length = child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; @@ -739,57 +751,57 @@ static unsigned char update_suffix(struct tnode *tn) * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: - * 100 * (tnode_child_length(tn) - tn->empty_children + + * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: - * 100 * (tnode_child_length(tn) - tn->empty_children + + * 100 * (child_length(tn) - tn->empty_children + * tn->full_children) >= - * inflate_threshold * tnode_child_length(tn) * 2 + * inflate_threshold * child_length(tn) * 2 * * shorten again: - * 50 * (tn->full_children + tnode_child_length(tn) - + * 50 * (tn->full_children + child_length(tn) - * tn->empty_children) >= inflate_threshold * - * tnode_child_length(tn) + * child_length(tn) * */ -static bool should_inflate(const struct tnode *tp, const struct tnode *tn) +static inline bool should_inflate(struct key_vector *tp, struct key_vector *tn) { - unsigned long used = tnode_child_length(tn); + unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ - threshold *= tp ? inflate_threshold : inflate_threshold_root; - used -= tn->empty_children; - used += tn->full_children; + threshold *= IS_TRIE(tp) ? inflate_threshold_root : inflate_threshold; + used -= tn_info(tn)->empty_children; + used += tn_info(tn)->full_children; /* if bits == KEYLENGTH then pos = 0, and will fail below */ return (used > 1) && tn->pos && ((50 * used) >= threshold); } -static bool should_halve(const struct tnode *tp, const struct tnode *tn) +static inline bool should_halve(struct key_vector *tp, struct key_vector *tn) { - unsigned long used = tnode_child_length(tn); + unsigned long used = child_length(tn); unsigned long threshold = used; /* Keep root node larger */ - threshold *= tp ? halve_threshold : halve_threshold_root; - used -= tn->empty_children; + threshold *= IS_TRIE(tp) ? halve_threshold_root : halve_threshold; + used -= tn_info(tn)->empty_children; /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold); } -static bool should_collapse(const struct tnode *tn) +static inline bool should_collapse(struct key_vector *tn) { - unsigned long used = tnode_child_length(tn); + unsigned long used = child_length(tn); - used -= tn->empty_children; + used -= tn_info(tn)->empty_children; /* account for bits == KEYLENGTH case */ - if ((tn->bits == KEYLENGTH) && tn->full_children) + if ((tn->bits == KEYLENGTH) && tn_info(tn)->full_children) used -= KEY_MAX; /* One child or none, time to drop us from the trie */ @@ -797,10 +809,13 @@ static bool should_collapse(const struct tnode *tn) } #define MAX_WORK 10 -static void resize(struct trie *t, struct tnode *tn) +static struct key_vector *resize(struct trie *t, struct key_vector *tn) { - struct tnode *tp = node_parent(tn); - struct tnode __rcu **cptr; +#ifdef CONFIG_IP_FIB_TRIE_STATS + struct trie_use_stats __percpu *stats = t->stats; +#endif + struct key_vector *tp = node_parent(tn); + unsigned long cindex = get_index(tn->key, tp); int max_work = MAX_WORK; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", @@ -810,183 +825,128 @@ static void resize(struct trie *t, struct tnode *tn) * doing it ourselves. This way we can let RCU fully do its * thing without us interfering */ - cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie; - BUG_ON(tn != rtnl_dereference(*cptr)); + BUG_ON(tn != get_child(tp, cindex)); /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ while (should_inflate(tp, tn) && max_work) { - if (inflate(t, tn)) { + tp = inflate(t, tn); + if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->resize_node_skipped); + this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; - tn = rtnl_dereference(*cptr); + tn = get_child(tp, cindex); } + /* update parent in case inflate failed */ + tp = node_parent(tn); + /* Return if at least one inflate is run */ if (max_work != MAX_WORK) - return; + return tp; /* Halve as long as the number of empty children in this * node is above threshold. */ while (should_halve(tp, tn) && max_work) { - if (halve(t, tn)) { + tp = halve(t, tn); + if (!tp) { #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->resize_node_skipped); + this_cpu_inc(stats->resize_node_skipped); #endif break; } max_work--; - tn = rtnl_dereference(*cptr); + tn = get_child(tp, cindex); } /* Only one child remains */ - if (should_collapse(tn)) { - collapse(t, tn); - return; - } + if (should_collapse(tn)) + return collapse(t, tn); + + /* update parent in case halve failed */ + tp = node_parent(tn); /* Return if at least one deflate was run */ if (max_work != MAX_WORK) - return; + return tp; /* push the suffix length to the parent node */ if (tn->slen > tn->pos) { unsigned char slen = update_suffix(tn); - if (tp && (slen > tp->slen)) + if (slen > tp->slen) tp->slen = slen; } -} - -/* readside must use rcu_read_lock currently dump routines - via get_fa_head and dump */ - -static struct leaf_info *find_leaf_info(struct tnode *l, int plen) -{ - struct hlist_head *head = &l->list; - struct leaf_info *li; - - hlist_for_each_entry_rcu(li, head, hlist) - if (li->plen == plen) - return li; - - return NULL; -} - -static inline struct list_head *get_fa_head(struct tnode *l, int plen) -{ - struct leaf_info *li = find_leaf_info(l, plen); - - if (!li) - return NULL; - return &li->falh; + return tp; } -static void leaf_pull_suffix(struct tnode *l) +static void leaf_pull_suffix(struct key_vector *tp, struct key_vector *l) { - struct tnode *tp = node_parent(l); - - while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) { + while ((tp->slen > tp->pos) && (tp->slen > l->slen)) { if (update_suffix(tp) > l->slen) break; tp = node_parent(tp); } } -static void leaf_push_suffix(struct tnode *l) +static void leaf_push_suffix(struct key_vector *tn, struct key_vector *l) { - struct tnode *tn = node_parent(l); - /* if this is a new leaf then tn will be NULL and we can sort * out parent suffix lengths as a part of trie_rebalance */ - while (tn && (tn->slen < l->slen)) { + while (tn->slen < l->slen) { tn->slen = l->slen; tn = node_parent(tn); } } -static void remove_leaf_info(struct tnode *l, struct leaf_info *old) -{ - /* record the location of the previous list_info entry */ - struct hlist_node **pprev = old->hlist.pprev; - struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next); - - /* remove the leaf info from the list */ - hlist_del_rcu(&old->hlist); - - /* only access li if it is pointing at the last valid hlist_node */ - if (hlist_empty(&l->list) || (*pprev)) - return; - - /* update the trie with the latest suffix length */ - l->slen = KEYLENGTH - li->plen; - leaf_pull_suffix(l); -} - -static void insert_leaf_info(struct tnode *l, struct leaf_info *new) +/* rcu_read_lock needs to be hold by caller from readside */ +static struct key_vector *fib_find_node(struct trie *t, + struct key_vector **tp, u32 key) { - struct hlist_head *head = &l->list; - struct leaf_info *li = NULL, *last = NULL; + struct key_vector *pn, *n = t->kv; + unsigned long index = 0; - if (hlist_empty(head)) { - hlist_add_head_rcu(&new->hlist, head); - } else { - hlist_for_each_entry(li, head, hlist) { - if (new->plen > li->plen) - break; - - last = li; - } - if (last) - hlist_add_behind_rcu(&new->hlist, &last->hlist); - else - hlist_add_before_rcu(&new->hlist, &li->hlist); - } - - /* if we added to the tail node then we need to update slen */ - if (l->slen < (KEYLENGTH - new->plen)) { - l->slen = KEYLENGTH - new->plen; - leaf_push_suffix(l); - } -} + do { + pn = n; + n = get_child_rcu(n, index); -/* rcu_read_lock needs to be hold by caller from readside */ -static struct tnode *fib_find_node(struct trie *t, u32 key) -{ - struct tnode *n = rcu_dereference_rtnl(t->trie); + if (!n) + break; - while (n) { - unsigned long index = get_index(key, n); + index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the bits in the cindex. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. - * if (index & (~0ul << bits)) + * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex + * + * This check is safe even if bits == KEYLENGTH due to the + * fact that we can only allocate a node with 32 bits if a + * long is greater than 32 bits. */ - if (index & (~0ul << n->bits)) - return NULL; - - /* we have found a leaf. Prefixes have already been compared */ - if (IS_LEAF(n)) + if (index >= (1ul << n->bits)) { + n = NULL; break; + } - n = tnode_get_child_rcu(n, index); - } + /* keep searching until we find a perfect match leaf or NULL */ + } while (IS_TNODE(n)); + + *tp = pn; return n; } @@ -994,14 +954,23 @@ static struct tnode *fib_find_node(struct trie *t, u32 key) /* Return the first fib alias matching TOS with * priority less than or equal to PRIO. */ -static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) +static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, + u8 tos, u32 prio, u32 tb_id) { struct fib_alias *fa; if (!fah) return NULL; - list_for_each_entry(fa, fah, fa_list) { + hlist_for_each_entry(fa, fah, fa_list) { + if (fa->fa_slen < slen) + continue; + if (fa->fa_slen != slen) + break; + if (fa->tb_id > tb_id) + continue; + if (fa->tb_id != tb_id) + break; if (fa->fa_tos > tos) continue; if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos) @@ -1011,77 +980,23 @@ static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) return NULL; } -static void trie_rebalance(struct trie *t, struct tnode *tn) +static void trie_rebalance(struct trie *t, struct key_vector *tn) { - struct tnode *tp; - - while ((tp = node_parent(tn)) != NULL) { - resize(t, tn); - tn = tp; - } - - /* Handle last (top) tnode */ - if (IS_TNODE(tn)) - resize(t, tn); + while (!IS_TRIE(tn)) + tn = resize(t, tn); } -/* only used from updater-side */ - -static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) +static int fib_insert_node(struct trie *t, struct key_vector *tp, + struct fib_alias *new, t_key key) { - struct list_head *fa_head = NULL; - struct tnode *l, *n, *tp = NULL; - struct leaf_info *li; - - li = leaf_info_new(plen); - if (!li) - return NULL; - fa_head = &li->falh; + struct key_vector *n, *l; - n = rtnl_dereference(t->trie); - - /* If we point to NULL, stop. Either the tree is empty and we should - * just put a new leaf in if, or we have reached an empty child slot, - * and we should just put our new leaf in that. - * - * If we hit a node with a key that does't match then we should stop - * and create a new tnode to replace that node and insert ourselves - * and the other node into the new tnode. - */ - while (n) { - unsigned long index = get_index(key, n); - - /* This bit of code is a bit tricky but it combines multiple - * checks into a single check. The prefix consists of the - * prefix plus zeros for the "bits" in the prefix. The index - * is the difference between the key and this value. From - * this we can actually derive several pieces of data. - * if !(index >> bits) - * we know the value is child index - * else - * we have a mismatch in skip bits and failed - */ - if (index >> n->bits) - break; - - /* we have found a leaf. Prefixes have already been compared */ - if (IS_LEAF(n)) { - /* Case 1: n is a leaf, and prefixes match*/ - insert_leaf_info(n, li); - return fa_head; - } - - tp = n; - n = tnode_get_child_rcu(n, index); - } - - l = leaf_new(key); - if (!l) { - free_leaf_info(li); - return NULL; - } + l = leaf_new(key, new); + if (!l) + goto noleaf; - insert_leaf_info(l, li); + /* retrieve child from parent node */ + n = get_child(tp, get_index(key, tp)); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. * @@ -1090,21 +1005,18 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) * leaves us in position for handling as case 3 */ if (n) { - struct tnode *tn; + struct key_vector *tn; tn = tnode_new(key, __fls(key ^ n->key), 1); - if (!tn) { - free_leaf_info(li); - node_free(l); - return NULL; - } + if (!tn) + goto notnode; /* initialize routes out of node */ NODE_INIT_PARENT(tn, tp); put_child(tn, get_index(key, tn) ^ 1, n); /* start adding routes into the node */ - put_child_root(tp, t, key, tn); + put_child_root(tp, key, tn); node_set_parent(n, tn); /* parent now has a NULL spot where the leaf can go */ @@ -1112,69 +1024,93 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) } /* Case 3: n is NULL, and will just insert a new leaf */ - if (tp) { - NODE_INIT_PARENT(l, tp); - put_child(tp, get_index(key, tp), l); - trie_rebalance(t, tp); + NODE_INIT_PARENT(l, tp); + put_child_root(tp, key, l); + trie_rebalance(t, tp); + + return 0; +notnode: + node_free(l); +noleaf: + return -ENOMEM; +} + +static int fib_insert_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *new, + struct fib_alias *fa, t_key key) +{ + if (!l) + return fib_insert_node(t, tp, new, key); + + if (fa) { + hlist_add_before_rcu(&new->fa_list, &fa->fa_list); } else { - rcu_assign_pointer(t->trie, l); + struct fib_alias *last; + + hlist_for_each_entry(last, &l->leaf, fa_list) { + if (new->fa_slen < last->fa_slen) + break; + if ((new->fa_slen == last->fa_slen) && + (new->tb_id > last->tb_id)) + break; + fa = last; + } + + if (fa) + hlist_add_behind_rcu(&new->fa_list, &fa->fa_list); + else + hlist_add_head_rcu(&new->fa_list, &l->leaf); } - return fa_head; + /* if we added to the tail node then we need to update slen */ + if (l->slen < new->fa_slen) { + l->slen = new->fa_slen; + leaf_push_suffix(tp, l); + } + + return 0; } -/* - * Caller must hold RTNL. - */ +/* Caller must hold RTNL. */ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { - struct trie *t = (struct trie *) tb->tb_data; + struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; - struct list_head *fa_head = NULL; + struct key_vector *l, *tp; struct fib_info *fi; - int plen = cfg->fc_dst_len; + u8 plen = cfg->fc_dst_len; + u8 slen = KEYLENGTH - plen; u8 tos = cfg->fc_tos; - u32 key, mask; + u32 key; int err; - struct tnode *l; - if (plen > 32) + if (plen > KEYLENGTH) return -EINVAL; key = ntohl(cfg->fc_dst); pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); - mask = ntohl(inet_make_mask(plen)); - - if (key & ~mask) + if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; - key = key & mask; - fi = fib_create_info(cfg); if (IS_ERR(fi)) { err = PTR_ERR(fi); goto err; } - l = fib_find_node(t, key); - fa = NULL; - - if (l) { - fa_head = get_fa_head(l, plen); - fa = fib_find_alias(fa_head, tos, fi->fib_priority); - } + l = fib_find_node(t, &tp, key); + fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority, + tb->tb_id) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,tos,priority], if such key already * exists or to the node before which we will insert new one. * * If fa is NULL, we will need to allocate a new one and - * insert to the head of f. - * - * If f is NULL, no fib node matched the destination key - * and we need to allocate a new one of those as well. + * insert to the tail of the section matching the suffix length + * of the new alias. */ if (fa && fa->fa_tos == tos && @@ -1192,9 +1128,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) */ fa_match = NULL; fa_first = fa; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, fa_head, fa_list) { - if (fa->fa_tos != tos) + hlist_for_each_entry_from(fa, fa_list) { + if ((fa->fa_slen != slen) || + (fa->tb_id != tb->tb_id) || + (fa->fa_tos != tos)) break; if (fa->fa_info->fib_priority != fi->fib_priority) break; @@ -1226,8 +1163,21 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_type = cfg->fc_type; state = fa->fa_state; new_fa->fa_state = state & ~FA_S_ACCESSED; + new_fa->fa_slen = fa->fa_slen; + + err = netdev_switch_fib_ipv4_add(key, plen, fi, + new_fa->fa_tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); + if (err) { + netdev_switch_fib_ipv4_abort(fi); + kmem_cache_free(fn_alias_kmem, new_fa); + goto out; + } + + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); - list_replace_rcu(&fa->fa_list, &new_fa->fa_list); alias_free_mem_rcu(fa); fib_release_info(fi_drop); @@ -1261,30 +1211,35 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; - /* - * Insert new entry to the list. - */ - - if (!fa_head) { - fa_head = fib_insert_node(t, key, plen); - if (unlikely(!fa_head)) { - err = -ENOMEM; - goto out_free_new_fa; - } + new_fa->fa_slen = slen; + new_fa->tb_id = tb->tb_id; + + /* (Optionally) offload fib entry to switch hardware. */ + err = netdev_switch_fib_ipv4_add(key, plen, fi, tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); + if (err) { + netdev_switch_fib_ipv4_abort(fi); + goto out_free_new_fa; } + /* Insert new entry to the list. */ + err = fib_insert_alias(t, tp, l, new_fa, fa, key); + if (err) + goto out_sw_fib_del; + if (!plen) tb->tb_num_default++; - list_add_tail_rcu(&new_fa->fa_list, - (fa ? &fa->fa_list : fa_head)); - rt_cache_flush(cfg->fc_nlinfo.nl_net); - rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, + rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, 0); succeeded: return 0; +out_sw_fib_del: + netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1293,7 +1248,7 @@ err: return err; } -static inline t_key prefix_mismatch(t_key key, struct tnode *n) +static inline t_key prefix_mismatch(t_key key, struct key_vector *n) { t_key prefix = n->key; @@ -1304,16 +1259,20 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n) int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { - struct trie *t = (struct trie *)tb->tb_data; + struct trie *t = (struct trie *) tb->tb_data; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif const t_key key = ntohl(flp->daddr); - struct tnode *n, *pn; - struct leaf_info *li; + struct key_vector *n, *pn; + struct fib_alias *fa; + unsigned long index; t_key cindex; - n = rcu_dereference(t->trie); + pn = t->kv; + cindex = 0; + + n = get_child_rcu(pn, cindex); if (!n) return -EAGAIN; @@ -1321,24 +1280,25 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, this_cpu_inc(stats->gets); #endif - pn = n; - cindex = 0; - /* Step 1: Travel to the longest prefix match in the trie */ for (;;) { - unsigned long index = get_index(key, n); + index = get_cindex(key, n); /* This bit of code is a bit tricky but it combines multiple * checks into a single check. The prefix consists of the * prefix plus zeros for the "bits" in the prefix. The index * is the difference between the key and this value. From * this we can actually derive several pieces of data. - * if (index & (~0ul << bits)) + * if (index >= (1ul << bits)) * we have a mismatch in skip bits and failed * else * we know the value is cindex + * + * This check is safe even if bits == KEYLENGTH due to the + * fact that we can only allocate a node with 32 bits if a + * long is greater than 32 bits. */ - if (index & (~0ul << n->bits)) + if (index >= (1ul << n->bits)) break; /* we have found a leaf. Prefixes have already been compared */ @@ -1353,7 +1313,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, cindex = index; } - n = tnode_get_child_rcu(n, index); + n = get_child_rcu(n, index); if (unlikely(!n)) goto backtrace; } @@ -1361,7 +1321,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, /* Step 2: Sort out leaves and begin backtracing for longest prefix */ for (;;) { /* record the pointer where our next node pointer is stored */ - struct tnode __rcu **cptr = n->child; + struct key_vector __rcu **cptr = n->tnode; /* This test verifies that none of the bits that differ * between the key and the prefix exist in the region of @@ -1393,13 +1353,17 @@ backtrace: while (!cindex) { t_key pkey = pn->key; - pn = node_parent_rcu(pn); - if (unlikely(!pn)) + /* If we don't have a parent then there is + * nothing for us to do as we do not have any + * further nodes to parse. + */ + if (IS_TRIE(pn)) return -EAGAIN; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif /* Get Child's index */ + pn = node_parent_rcu(pn); cindex = get_index(pkey, pn); } @@ -1407,138 +1371,134 @@ backtrace: cindex &= cindex - 1; /* grab pointer for next child node */ - cptr = &pn->child[cindex]; + cptr = &pn->tnode[cindex]; } } found: + /* this line carries forward the xor from earlier in the function */ + index = key ^ n->key; + /* Step 3: Process the leaf, if that fails fall back to backtracing */ - hlist_for_each_entry_rcu(li, &n->list, hlist) { - struct fib_alias *fa; + hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + int nhsel, err; - if ((key ^ n->key) & li->mask_plen) + if ((index >= (1ul << fa->fa_slen)) && + ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH))) continue; - - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - struct fib_info *fi = fa->fa_info; - int nhsel, err; - - if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) - continue; - if (fi->fib_dead) - continue; - if (fa->fa_info->fib_scope < flp->flowi4_scope) - continue; - fib_alias_accessed(fa); - err = fib_props[fa->fa_type].error; - if (unlikely(err < 0)) { + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fi->fib_dead) + continue; + if (fa->fa_info->fib_scope < flp->flowi4_scope) + continue; + fib_alias_accessed(fa); + err = fib_props[fa->fa_type].error; + if (unlikely(err < 0)) { #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->semantic_match_passed); + this_cpu_inc(stats->semantic_match_passed); #endif - return err; - } - if (fi->fib_flags & RTNH_F_DEAD) + return err; + } + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + + if (nh->nh_flags & RTNH_F_DEAD) continue; - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; - - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) - continue; - - if (!(fib_flags & FIB_LOOKUP_NOREF)) - atomic_inc(&fi->fib_clntref); - - res->prefixlen = li->plen; - res->nh_sel = nhsel; - res->type = fa->fa_type; - res->scope = fi->fib_scope; - res->fi = fi; - res->table = tb; - res->fa_head = &li->falh; + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + continue; + + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&fi->fib_clntref); + + res->prefixlen = KEYLENGTH - fa->fa_slen; + res->nh_sel = nhsel; + res->type = fa->fa_type; + res->scope = fi->fib_scope; + res->fi = fi; + res->table = tb; + res->fa_head = &n->leaf; #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->semantic_match_passed); + this_cpu_inc(stats->semantic_match_passed); #endif - return err; - } + return err; } - + } #ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->semantic_match_miss); + this_cpu_inc(stats->semantic_match_miss); #endif - } goto backtrace; } EXPORT_SYMBOL_GPL(fib_table_lookup); -/* - * Remove the leaf and return parent. - */ -static void trie_leaf_remove(struct trie *t, struct tnode *l) +static void fib_remove_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *old) { - struct tnode *tp = node_parent(l); + /* record the location of the previous list_info entry */ + struct hlist_node **pprev = old->fa_list.pprev; + struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next); - pr_debug("entering trie_leaf_remove(%p)\n", l); + /* remove the fib_alias from the list */ + hlist_del_rcu(&old->fa_list); - if (tp) { - put_child(tp, get_index(l->key, tp), NULL); + /* if we emptied the list this leaf will be freed and we can sort + * out parent suffix lengths as a part of trie_rebalance + */ + if (hlist_empty(&l->leaf)) { + put_child_root(tp, l->key, NULL); + node_free(l); trie_rebalance(t, tp); - } else { - RCU_INIT_POINTER(t->trie, NULL); + return; } - node_free(l); + /* only access fa if it is pointing at the last valid hlist_node */ + if (*pprev) + return; + + /* update the trie with the latest suffix length */ + l->slen = fa->fa_slen; + leaf_pull_suffix(tp, l); } -/* - * Caller must hold RTNL. - */ +/* Caller must hold RTNL. */ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; - u32 key, mask; - int plen = cfg->fc_dst_len; - u8 tos = cfg->fc_tos; struct fib_alias *fa, *fa_to_delete; - struct list_head *fa_head; - struct tnode *l; - struct leaf_info *li; + struct key_vector *l, *tp; + u8 plen = cfg->fc_dst_len; + u8 slen = KEYLENGTH - plen; + u8 tos = cfg->fc_tos; + u32 key; - if (plen > 32) + if (plen > KEYLENGTH) return -EINVAL; key = ntohl(cfg->fc_dst); - mask = ntohl(inet_make_mask(plen)); - if (key & ~mask) + if ((plen < KEYLENGTH) && (key << plen)) return -EINVAL; - key = key & mask; - l = fib_find_node(t, key); - + l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; - li = find_leaf_info(l, plen); - - if (!li) - return -ESRCH; - - fa_head = &li->falh; - fa = fib_find_alias(fa_head, tos, 0); - + fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id); if (!fa) return -ESRCH; pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); fa_to_delete = NULL; - fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); - list_for_each_entry_continue(fa, fa_head, fa_list) { + hlist_for_each_entry_from(fa, fa_list) { struct fib_info *fi = fa->fa_info; - if (fa->fa_tos != tos) + if ((fa->fa_slen != slen) || + (fa->tb_id != tb->tb_id) || + (fa->fa_tos != tos)) break; if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && @@ -1557,240 +1517,397 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, - &cfg->fc_nlinfo, 0); + netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, + cfg->fc_type, tb->tb_id); - list_del_rcu(&fa->fa_list); + rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, + &cfg->fc_nlinfo, 0); if (!plen) tb->tb_num_default--; - if (list_empty(fa_head)) { - remove_leaf_info(l, li); - free_leaf_info(li); - } + fib_remove_alias(t, tp, l, fa_to_delete); - if (hlist_empty(&l->list)) - trie_leaf_remove(t, l); - - if (fa->fa_state & FA_S_ACCESSED) + if (fa_to_delete->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); - fib_release_info(fa->fa_info); - alias_free_mem_rcu(fa); + fib_release_info(fa_to_delete->fa_info); + alias_free_mem_rcu(fa_to_delete); return 0; } -static int trie_flush_list(struct list_head *head) +/* Scan for the next leaf starting at the provided key value */ +static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) { - struct fib_alias *fa, *fa_node; - int found = 0; + struct key_vector *pn, *n = *tn; + unsigned long cindex; - list_for_each_entry_safe(fa, fa_node, head, fa_list) { - struct fib_info *fi = fa->fa_info; + /* this loop is meant to try and find the key in the trie */ + do { + /* record parent and next child index */ + pn = n; + cindex = key ? get_index(key, pn) : 0; - if (fi && (fi->fib_flags & RTNH_F_DEAD)) { - list_del_rcu(&fa->fa_list); - fib_release_info(fa->fa_info); - alias_free_mem_rcu(fa); - found++; + if (cindex >> pn->bits) + break; + + /* descend into the next child */ + n = get_child_rcu(pn, cindex++); + if (!n) + break; + + /* guarantee forward progress on the keys */ + if (IS_LEAF(n) && (n->key >= key)) + goto found; + } while (IS_TNODE(n)); + + /* this loop will search for the next leaf with a greater key */ + while (!IS_TRIE(pn)) { + /* if we exhausted the parent node we will need to climb */ + if (cindex >= (1ul << pn->bits)) { + t_key pkey = pn->key; + + pn = node_parent_rcu(pn); + cindex = get_index(pkey, pn) + 1; + continue; } + + /* grab the next available node */ + n = get_child_rcu(pn, cindex++); + if (!n) + continue; + + /* no need to compare keys since we bumped the index */ + if (IS_LEAF(n)) + goto found; + + /* Rescan start scanning in new node */ + pn = n; + cindex = 0; } - return found; + + *tn = pn; + return NULL; /* Root of trie */ +found: + /* if we are at the limit for keys just return NULL for the tnode */ + *tn = pn; + return n; } -static int trie_flush_leaf(struct tnode *l) +static void fib_trie_free(struct fib_table *tb) { - int found = 0; - struct hlist_head *lih = &l->list; + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; struct hlist_node *tmp; - struct leaf_info *li = NULL; - unsigned char plen = KEYLENGTH; + struct fib_alias *fa; + + /* walk trie in reverse order and free everything */ + for (;;) { + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; + + if (IS_TRIE(pn)) + break; + + n = pn; + pn = node_parent(pn); - hlist_for_each_entry_safe(li, tmp, lih, hlist) { - found += trie_flush_list(&li->falh); + /* drop emptied tnode */ + put_child_root(pn, n->key, NULL); + node_free(n); + + cindex = get_index(pkey, pn); - if (list_empty(&li->falh)) { - hlist_del_rcu(&li->hlist); - free_leaf_info(li); continue; } - plen = li->plen; - } + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; - l->slen = KEYLENGTH - plen; + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; - return found; + continue; + } + + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + hlist_del_rcu(&fa->fa_list); + alias_free_mem_rcu(fa); + } + + put_child_root(pn, n->key, NULL); + node_free(n); + } + +#ifdef CONFIG_IP_FIB_TRIE_STATS + free_percpu(t->stats); +#endif + kfree(tb); } -/* - * Scan for the next right leaf starting at node p->child[idx] - * Since we have back pointer, no recursion necessary. - */ -static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c) +struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) { - do { - unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0; + struct trie *ot = (struct trie *)oldtb->tb_data; + struct key_vector *l, *tp = ot->kv; + struct fib_table *local_tb; + struct fib_alias *fa; + struct trie *lt; + t_key key = 0; - while (idx < tnode_child_length(p)) { - c = tnode_get_child_rcu(p, idx++); - if (!c) + if (oldtb->tb_data == oldtb->__data) + return oldtb; + + local_tb = fib_trie_table(RT_TABLE_LOCAL, NULL); + if (!local_tb) + return NULL; + + lt = (struct trie *)local_tb->tb_data; + + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { + struct key_vector *local_l = NULL, *local_tp; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + struct fib_alias *new_fa; + + if (local_tb->tb_id != fa->tb_id) continue; - if (IS_LEAF(c)) - return c; + /* clone fa for new local table */ + new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + if (!new_fa) + goto out; + + memcpy(new_fa, fa, sizeof(*fa)); - /* Rescan start scanning in new node */ - p = c; - idx = 0; + /* insert clone into table */ + if (!local_l) + local_l = fib_find_node(lt, &local_tp, l->key); + + if (fib_insert_alias(lt, local_tp, local_l, new_fa, + NULL, l->key)) + goto out; } - /* Node empty, walk back up to parent */ - c = p; - } while ((p = node_parent_rcu(c)) != NULL); + /* stop loop if key wrapped back to 0 */ + key = l->key + 1; + if (key < l->key) + break; + } - return NULL; /* Root of trie */ + return local_tb; +out: + fib_trie_free(local_tb); + + return NULL; } -static struct tnode *trie_firstleaf(struct trie *t) +/* Caller must hold RTNL */ +void fib_table_flush_external(struct fib_table *tb) { - struct tnode *n = rcu_dereference_rtnl(t->trie); + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct hlist_node *tmp; + struct fib_alias *fa; - if (!n) - return NULL; + /* walk trie in reverse order */ + for (;;) { + unsigned char slen = 0; + struct key_vector *n; - if (IS_LEAF(n)) /* trie is just a leaf */ - return n; + if (!(cindex--)) { + t_key pkey = pn->key; - return leaf_walk_rcu(n, NULL); -} + /* cannot resize the trie vector */ + if (IS_TRIE(pn)) + break; -static struct tnode *trie_nextleaf(struct tnode *l) -{ - struct tnode *p = node_parent_rcu(l); + /* resize completed node */ + pn = resize(t, pn); + cindex = get_index(pkey, pn); - if (!p) - return NULL; /* trie with just one leaf */ + continue; + } - return leaf_walk_rcu(p, l); -} + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; -static struct tnode *trie_leafindex(struct trie *t, int index) -{ - struct tnode *l = trie_firstleaf(t); + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; - while (l && index-- > 0) - l = trie_nextleaf(l); + continue; + } - return l; -} + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + /* if alias was cloned to local then we just + * need to remove the local copy from main + */ + if (tb->tb_id != fa->tb_id) { + hlist_del_rcu(&fa->fa_list); + alias_free_mem_rcu(fa); + continue; + } + /* record local slen */ + slen = fa->fa_slen; -/* - * Caller must hold RTNL. - */ + if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL)) + continue; + + netdev_switch_fib_ipv4_del(n->key, + KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, + fa->fa_type, tb->tb_id); + } + + /* update leaf slen */ + n->slen = slen; + + if (hlist_empty(&n->leaf)) { + put_child_root(pn, n->key, NULL); + node_free(n); + } else { + leaf_pull_suffix(pn, n); + } + } +} + +/* Caller must hold RTNL. */ int fib_table_flush(struct fib_table *tb) { - struct trie *t = (struct trie *) tb->tb_data; - struct tnode *l, *ll = NULL; + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; + struct hlist_node *tmp; + struct fib_alias *fa; int found = 0; - for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { - found += trie_flush_leaf(l); + /* walk trie in reverse order */ + for (;;) { + unsigned char slen = 0; + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; - if (ll) { - if (hlist_empty(&ll->list)) - trie_leaf_remove(t, ll); - else - leaf_pull_suffix(ll); + /* cannot resize the trie vector */ + if (IS_TRIE(pn)) + break; + + /* resize completed node */ + pn = resize(t, pn); + cindex = get_index(pkey, pn); + + continue; } - ll = l; - } + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; - if (ll) { - if (hlist_empty(&ll->list)) - trie_leaf_remove(t, ll); - else - leaf_pull_suffix(ll); + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; + + continue; + } + + hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi || !(fi->fib_flags & RTNH_F_DEAD)) { + slen = fa->fa_slen; + continue; + } + + netdev_switch_fib_ipv4_del(n->key, + KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, + fa->fa_type, tb->tb_id); + hlist_del_rcu(&fa->fa_list); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); + found++; + } + + /* update leaf slen */ + n->slen = slen; + + if (hlist_empty(&n->leaf)) { + put_child_root(pn, n->key, NULL); + node_free(n); + } else { + leaf_pull_suffix(pn, n); + } } pr_debug("trie_flush found=%d\n", found); return found; } -void fib_free_table(struct fib_table *tb) +static void __trie_free_rcu(struct rcu_head *head) { + struct fib_table *tb = container_of(head, struct fib_table, rcu); #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie *t = (struct trie *)tb->tb_data; - free_percpu(t->stats); + if (tb->tb_data == tb->__data) + free_percpu(t->stats); #endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } -static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, - struct fib_table *tb, - struct sk_buff *skb, struct netlink_callback *cb) +void fib_free_table(struct fib_table *tb) { - int i, s_i; + call_rcu(&tb->rcu, __trie_free_rcu); +} + +static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, + struct sk_buff *skb, struct netlink_callback *cb) +{ + __be32 xkey = htonl(l->key); struct fib_alias *fa; - __be32 xkey = htonl(key); + int i, s_i; - s_i = cb->args[5]; + s_i = cb->args[4]; i = 0; /* rcu_read_lock is hold by caller */ - - list_for_each_entry_rcu(fa, fah, fa_list) { + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { if (i < s_i) { i++; continue; } + if (tb->tb_id != fa->tb_id) { + i++; + continue; + } + if (fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, tb->tb_id, fa->fa_type, xkey, - plen, + KEYLENGTH - fa->fa_slen, fa->fa_tos, fa->fa_info, NLM_F_MULTI) < 0) { - cb->args[5] = i; - return -1; - } - i++; - } - cb->args[5] = i; - return skb->len; -} - -static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, - struct sk_buff *skb, struct netlink_callback *cb) -{ - struct leaf_info *li; - int i, s_i; - - s_i = cb->args[4]; - i = 0; - - /* rcu_read_lock is hold by caller */ - hlist_for_each_entry_rcu(li, &l->list, hlist) { - if (i < s_i) { - i++; - continue; - } - - if (i > s_i) - cb->args[5] = 0; - - if (list_empty(&li->falh)) - continue; - - if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) { cb->args[4] = i; return -1; } @@ -1801,44 +1918,38 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, return skb->len; } +/* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { - struct tnode *l; - struct trie *t = (struct trie *) tb->tb_data; - t_key key = cb->args[2]; - int count = cb->args[3]; - - rcu_read_lock(); + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *l, *tp = t->kv; /* Dump starting at last key. * Note: 0.0.0.0/0 (ie default) is first key. */ - if (count == 0) - l = trie_firstleaf(t); - else { - /* Normally, continue from last key, but if that is missing - * fallback to using slow rescan - */ - l = fib_find_node(t, key); - if (!l) - l = trie_leafindex(t, count); - } + int count = cb->args[2]; + t_key key = cb->args[3]; - while (l) { - cb->args[2] = l->key; + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { - cb->args[3] = count; - rcu_read_unlock(); + cb->args[3] = key; + cb->args[2] = count; return -1; } ++count; - l = trie_nextleaf(l); + key = l->key + 1; + memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0])); + + /* stop loop if key wrapped back to 0 */ + if (key < l->key) + break; } - cb->args[3] = count; - rcu_read_unlock(); + + cb->args[3] = key; + cb->args[2] = count; return skb->len; } @@ -1850,28 +1961,34 @@ void __init fib_trie_init(void) 0, SLAB_PANIC, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", - max(sizeof(struct tnode), - sizeof(struct leaf_info)), + LEAF_SIZE, 0, SLAB_PANIC, NULL); } - -struct fib_table *fib_trie_table(u32 id) +struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) { struct fib_table *tb; struct trie *t; + size_t sz = sizeof(*tb); + + if (!alias) + sz += sizeof(struct trie); - tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie), - GFP_KERNEL); + tb = kzalloc(sz, GFP_KERNEL); if (tb == NULL) return NULL; tb->tb_id = id; tb->tb_default = -1; tb->tb_num_default = 0; + tb->tb_data = (alias ? alias->__data : tb->__data); + + if (alias) + return tb; t = (struct trie *) tb->tb_data; - RCU_INIT_POINTER(t->trie, NULL); + t->kv[0].pos = KEYLENGTH; + t->kv[0].slen = KEYLENGTH; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats = alloc_percpu(struct trie_use_stats); if (!t->stats) { @@ -1888,65 +2005,63 @@ struct fib_table *fib_trie_table(u32 id) struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; - struct tnode *tnode; + struct key_vector *tnode; unsigned int index; unsigned int depth; }; -static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter) +static struct key_vector *fib_trie_get_next(struct fib_trie_iter *iter) { unsigned long cindex = iter->index; - struct tnode *tn = iter->tnode; - struct tnode *p; - - /* A single entry routing table */ - if (!tn) - return NULL; + struct key_vector *pn = iter->tnode; + t_key pkey; pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); -rescan: - while (cindex < tnode_child_length(tn)) { - struct tnode *n = tnode_get_child_rcu(tn, cindex); - if (n) { + while (!IS_TRIE(pn)) { + while (cindex < child_length(pn)) { + struct key_vector *n = get_child_rcu(pn, cindex++); + + if (!n) + continue; + if (IS_LEAF(n)) { - iter->tnode = tn; - iter->index = cindex + 1; + iter->tnode = pn; + iter->index = cindex; } else { /* push down one level */ iter->tnode = n; iter->index = 0; ++iter->depth; } + return n; } - ++cindex; - } - - /* Current node exhausted, pop back up */ - p = node_parent_rcu(tn); - if (p) { - cindex = get_index(tn->key, p) + 1; - tn = p; + /* Current node exhausted, pop back up */ + pkey = pn->key; + pn = node_parent_rcu(pn); + cindex = get_index(pkey, pn) + 1; --iter->depth; - goto rescan; } - /* got root? */ + /* record root node so further searches know we are done */ + iter->tnode = pn; + iter->index = 0; + return NULL; } -static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, - struct trie *t) +static struct key_vector *fib_trie_get_first(struct fib_trie_iter *iter, + struct trie *t) { - struct tnode *n; + struct key_vector *n, *pn = t->kv; if (!t) return NULL; - n = rcu_dereference(t->trie); + n = rcu_dereference(pn->tnode[0]); if (!n) return NULL; @@ -1955,7 +2070,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, iter->index = 0; iter->depth = 1; } else { - iter->tnode = NULL; + iter->tnode = pn; iter->index = 0; iter->depth = 0; } @@ -1965,7 +2080,7 @@ static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, static void trie_collect_stats(struct trie *t, struct trie_stat *s) { - struct tnode *n; + struct key_vector *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); @@ -1973,20 +2088,20 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { - struct leaf_info *li; + struct fib_alias *fa; s->leaves++; s->totdepth += iter.depth; if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; - hlist_for_each_entry_rcu(li, &n->list, hlist) + hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) ++s->prefixes; } else { s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; - s->nullpointers += n->empty_children; + s->nullpointers += tn_info(n)->empty_children; } } rcu_read_unlock(); @@ -2009,13 +2124,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); - bytes = sizeof(struct tnode) * stat->leaves; + bytes = LEAF_SIZE * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); - bytes += sizeof(struct leaf_info) * stat->prefixes; + bytes += sizeof(struct fib_alias) * stat->prefixes; seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); - bytes += sizeof(struct tnode) * stat->tnodes; + bytes += TNODE_SIZE(0) * stat->tnodes; max = MAX_STAT_DEPTH; while (max > 0 && stat->nodesizes[max-1] == 0) @@ -2030,7 +2145,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); - bytes += sizeof(struct tnode *) * pointers; + bytes += sizeof(struct key_vector *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } @@ -2084,7 +2199,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "Basic info: size of leaf:" " %Zd bytes, size of tnode: %Zd bytes.\n", - sizeof(struct tnode), sizeof(struct tnode)); + LEAF_SIZE, TNODE_SIZE(0)); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; @@ -2123,7 +2238,7 @@ static const struct file_operations fib_triestat_fops = { .release = single_release_net, }; -static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2135,7 +2250,7 @@ static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos) struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { - struct tnode *n; + struct key_vector *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); @@ -2164,7 +2279,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; - struct tnode *n; + struct key_vector *n; ++*pos; /* next node in same table */ @@ -2250,9 +2365,9 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t) static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; - struct tnode *n = v; + struct key_vector *n = v; - if (!node_parent_rcu(n)) + if (IS_TRIE(node_parent_rcu(n))) fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { @@ -2261,30 +2376,28 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", &prf, KEYLENGTH - n->pos - n->bits, n->bits, - n->full_children, n->empty_children); + tn_info(n)->full_children, + tn_info(n)->empty_children); } else { - struct leaf_info *li; __be32 val = htonl(n->key); + struct fib_alias *fa; seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); - hlist_for_each_entry_rcu(li, &n->list, hlist) { - struct fib_alias *fa; - - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - char buf1[32], buf2[32]; - - seq_indent(seq, iter->depth+1); - seq_printf(seq, " /%d %s %s", li->plen, - rtn_scope(buf1, sizeof(buf1), - fa->fa_info->fib_scope), - rtn_type(buf2, sizeof(buf2), - fa->fa_type)); - if (fa->fa_tos) - seq_printf(seq, " tos=%d", fa->fa_tos); - seq_putc(seq, '\n'); - } + hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { + char buf1[32], buf2[32]; + + seq_indent(seq, iter->depth + 1); + seq_printf(seq, " /%zu %s %s", + KEYLENGTH - fa->fa_slen, + rtn_scope(buf1, sizeof(buf1), + fa->fa_info->fib_scope), + rtn_type(buf2, sizeof(buf2), + fa->fa_type)); + if (fa->fa_tos) + seq_printf(seq, " tos=%d", fa->fa_tos); + seq_putc(seq, '\n'); } } @@ -2314,31 +2427,47 @@ static const struct file_operations fib_trie_fops = { struct fib_route_iter { struct seq_net_private p; - struct trie *main_trie; + struct fib_table *main_tb; + struct key_vector *tnode; loff_t pos; t_key key; }; -static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) +static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, + loff_t pos) { - struct tnode *l = NULL; - struct trie *t = iter->main_trie; + struct fib_table *tb = iter->main_tb; + struct key_vector *l, **tp = &iter->tnode; + struct trie *t; + t_key key; - /* use cache location of last found key */ - if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key))) + /* use cache location of next-to-find key */ + if (iter->pos > 0 && pos >= iter->pos) { pos -= iter->pos; - else { + key = iter->key; + } else { + t = (struct trie *)tb->tb_data; + iter->tnode = t->kv; iter->pos = 0; - l = trie_firstleaf(t); + key = 0; } - while (l && pos-- > 0) { + while ((l = leaf_walk_rcu(tp, key)) != NULL) { + key = l->key + 1; iter->pos++; - l = trie_nextleaf(l); + + if (pos-- <= 0) + break; + + l = NULL; + + /* handle unlikely case of a key wrap */ + if (!key) + break; } if (l) - iter->key = pos; /* remember it */ + iter->key = key; /* remember it */ else iter->pos = 0; /* forget it */ @@ -2350,37 +2479,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct fib_table *tb; + struct trie *t; rcu_read_lock(); + tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); if (!tb) return NULL; - iter->main_trie = (struct trie *) tb->tb_data; - if (*pos == 0) - return SEQ_START_TOKEN; - else - return fib_route_get_idx(iter, *pos - 1); + iter->main_tb = tb; + + if (*pos != 0) + return fib_route_get_idx(iter, *pos); + + t = (struct trie *)tb->tb_data; + iter->tnode = t->kv; + iter->pos = 0; + iter->key = 0; + + return SEQ_START_TOKEN; } static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; - struct tnode *l = v; + struct key_vector *l = NULL; + t_key key = iter->key; ++*pos; - if (v == SEQ_START_TOKEN) { - iter->pos = 0; - l = trie_firstleaf(iter->main_trie); - } else { + + /* only allow key of 0 for start of sequence */ + if ((v == SEQ_START_TOKEN) || key) + l = leaf_walk_rcu(&iter->tnode, key); + + if (l) { + iter->key = l->key + 1; iter->pos++; - l = trie_nextleaf(l); + } else { + iter->pos = 0; } - if (l) - iter->key = l->key; - else - iter->pos = 0; return l; } @@ -2412,8 +2550,11 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info */ static int fib_route_seq_show(struct seq_file *seq, void *v) { - struct tnode *l = v; - struct leaf_info *li; + struct fib_route_iter *iter = seq->private; + struct fib_table *tb = iter->main_tb; + struct fib_alias *fa; + struct key_vector *l = v; + __be32 prefix; if (v == SEQ_START_TOKEN) { seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " @@ -2422,45 +2563,43 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) return 0; } - hlist_for_each_entry_rcu(li, &l->list, hlist) { - struct fib_alias *fa; - __be32 mask, prefix; + prefix = htonl(l->key); - mask = inet_make_mask(li->plen); - prefix = htonl(l->key); + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + const struct fib_info *fi = fa->fa_info; + __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); + unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - const struct fib_info *fi = fa->fa_info; - unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); + if ((fa->fa_type == RTN_BROADCAST) || + (fa->fa_type == RTN_MULTICAST)) + continue; - if (fa->fa_type == RTN_BROADCAST - || fa->fa_type == RTN_MULTICAST) - continue; + if (fa->tb_id != tb->tb_id) + continue; - seq_setwidth(seq, 127); - - if (fi) - seq_printf(seq, - "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", - fi->fib_dev ? fi->fib_dev->name : "*", - prefix, - fi->fib_nh->nh_gw, flags, 0, 0, - fi->fib_priority, - mask, - (fi->fib_advmss ? - fi->fib_advmss + 40 : 0), - fi->fib_window, - fi->fib_rtt >> 3); - else - seq_printf(seq, - "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", - prefix, 0, flags, 0, 0, 0, - mask, 0, 0, 0); - - seq_pad(seq, '\n'); - } + seq_setwidth(seq, 127); + + if (fi) + seq_printf(seq, + "%s\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u", + fi->fib_dev ? fi->fib_dev->name : "*", + prefix, + fi->fib_nh->nh_gw, flags, 0, 0, + fi->fib_priority, + mask, + (fi->fib_advmss ? + fi->fib_advmss + 40 : 0), + fi->fib_window, + fi->fib_rtt >> 3); + else + seq_printf(seq, + "*\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u", + prefix, 0, flags, 0, 0, 0, + mask, 0, 0, 0); + + seq_pad(seq, '\n'); } return 0; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 666cf364df86..ad09213ac5b2 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -97,6 +97,7 @@ #include <net/route.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/inet_common.h> #include <linux/netfilter_ipv4.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> @@ -369,7 +370,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu) pip->saddr = fl4.saddr; pip->protocol = IPPROTO_IGMP; pip->tot_len = 0; /* filled in later */ - ip_select_ident(skb, NULL); + ip_select_ident(net, skb, NULL); ((u8 *)&pip[1])[0] = IPOPT_RA; ((u8 *)&pip[1])[1] = 4; ((u8 *)&pip[1])[2] = 0; @@ -713,7 +714,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, iph->daddr = dst; iph->saddr = fl4.saddr; iph->protocol = IPPROTO_IGMP; - ip_select_ident(skb, NULL); + ip_select_ident(net, skb, NULL); ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; ((u8 *)&iph[1])[2] = 0; @@ -1849,30 +1850,28 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc) pmc->sfcount[MCAST_EXCLUDE] = 1; } - -/* - * Join a multicast group +/* Join a multicast group */ -int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) + +int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) { - int err; __be32 addr = imr->imr_multiaddr.s_addr; - struct ip_mc_socklist *iml = NULL, *i; + struct ip_mc_socklist *iml, *i; struct in_device *in_dev; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); int ifindex; int count = 0; + int err; + + ASSERT_RTNL(); if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); - in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { - iml = NULL; err = -ENODEV; goto done; } @@ -1900,7 +1899,6 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) ip_mc_inc_group(in_dev, addr); err = 0; done: - rtnl_unlock(); return err; } EXPORT_SYMBOL(ip_mc_join_group); @@ -1925,10 +1923,6 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, return err; } -/* - * Ask a socket to leave a group. - */ - int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) { struct inet_sock *inet = inet_sk(sk); @@ -1940,7 +1934,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) u32 ifindex; int ret = -EADDRNOTAVAIL; - rtnl_lock(); + ASSERT_RTNL(); + in_dev = ip_mc_find_dev(net, imr); if (!in_dev) { ret = -ENODEV; @@ -1964,14 +1959,13 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) *imlp = iml->next_rcu; ip_mc_dec_group(in_dev, group); - rtnl_unlock(); + /* decrease mem now to avoid the memleak warning */ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); kfree_rcu(iml, rcu); return 0; } out: - rtnl_unlock(); return ret; } EXPORT_SYMBOL(ip_mc_leave_group); @@ -1993,7 +1987,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); + ASSERT_RTNL(); imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; @@ -2107,9 +2101,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, &mreqs->imr_sourceaddr, 1); done: - rtnl_unlock(); if (leavegroup) - return ip_mc_leave_group(sk, &imr); + err = ip_mc_leave_group(sk, &imr); return err; } @@ -2131,7 +2124,7 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) msf->imsf_fmode != MCAST_EXCLUDE) return -EINVAL; - rtnl_lock(); + ASSERT_RTNL(); imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; @@ -2193,7 +2186,6 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) pmc->sfmode = msf->imsf_fmode; err = 0; done: - rtnl_unlock(); if (leavegroup) err = ip_mc_leave_group(sk, &imr); return err; @@ -2724,6 +2716,7 @@ static const struct file_operations igmp_mcf_seq_fops = { static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; + int err; pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops); if (!pde) @@ -2732,8 +2725,18 @@ static int __net_init igmp_net_init(struct net *net) &igmp_mcf_seq_fops); if (!pde) goto out_mcfilter; + err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET, + SOCK_DGRAM, 0, net); + if (err < 0) { + pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n", + err); + goto out_sock; + } + return 0; +out_sock: + remove_proc_entry("mcfilter", net->proc_net); out_mcfilter: remove_proc_entry("igmp", net->proc_net); out_igmp: @@ -2744,6 +2747,7 @@ static void __net_exit igmp_net_exit(struct net *net) { remove_proc_entry("mcfilter", net->proc_net); remove_proc_entry("igmp", net->proc_net); + inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk); } static struct pernet_operations igmp_net_ops = { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3e44b9b0b78e..79c0c9439fdc 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -23,6 +23,7 @@ #include <net/route.h> #include <net/tcp_states.h> #include <net/xfrm.h> +#include <net/tcp.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -294,8 +295,8 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock_queue *queue = &icsk->icsk_accept_queue; - struct sock *newsk; struct request_sock *req; + struct sock *newsk; int error; lock_sock(sk); @@ -324,9 +325,11 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) newsk = req->sk; sk_acceptq_removed(sk); - if (sk->sk_protocol == IPPROTO_TCP && queue->fastopenq != NULL) { + if (sk->sk_protocol == IPPROTO_TCP && + tcp_rsk(req)->tfo_listener && + queue->fastopenq) { spin_lock_bh(&queue->fastopenq->lock); - if (tcp_rsk(req)->listener) { + if (tcp_rsk(req)->tfo_listener) { /* We are still waiting for the final ACK from 3WHS * so can't free req now. Instead, we set req->sk to * NULL to signify that the child socket is taken @@ -341,7 +344,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) out: release_sock(sk); if (req) - __reqsk_free(req); + reqsk_put(req); return newsk; out_err: newsk = NULL; @@ -400,18 +403,17 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { - struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options_rcu *opt = inet_rsk(req)->opt; - struct net *net = sock_net(sk); - int flags = inet_sk_flowi_flags(sk); + struct net *net = read_pnet(&ireq->ireq_net); + struct ip_options_rcu *opt = ireq->opt; + struct rtable *rt; - flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, - sk->sk_protocol, - flags, + sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + ireq->ir_loc_addr, ireq->ir_rmt_port, + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -433,9 +435,9 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); + struct net *net = read_pnet(&ireq->ireq_net); struct inet_sock *newinet = inet_sk(newsk); struct ip_options_rcu *opt; - struct net *net = sock_net(sk); struct flowi4 *fl4; struct rtable *rt; @@ -443,11 +445,12 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rcu_read_lock(); opt = rcu_dereference(newinet->inet_opt); - flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + ireq->ir_loc_addr, ireq->ir_rmt_port, + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -475,33 +478,37 @@ static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, #if IS_ENABLED(CONFIG_IPV6) #define AF_INET_FAMILY(fam) ((fam) == AF_INET) #else -#define AF_INET_FAMILY(fam) 1 +#define AF_INET_FAMILY(fam) true #endif -struct request_sock *inet_csk_search_req(const struct sock *sk, - struct request_sock ***prevp, - const __be16 rport, const __be32 raddr, +/* Note: this is temporary : + * req sock will no longer be in listener hash table +*/ +struct request_sock *inet_csk_search_req(struct sock *sk, + const __be16 rport, + const __be32 raddr, const __be32 laddr) { - const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; + struct request_sock *req; + u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries); - for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries)]; - (req = *prev) != NULL; - prev = &req->dl_next) { + spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); + for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); if (ireq->ir_rmt_port == rport && ireq->ir_rmt_addr == raddr && ireq->ir_loc_addr == laddr && AF_INET_FAMILY(req->rsk_ops->family)) { + atomic_inc(&req->rsk_refcnt); WARN_ON(req->sk); - *prevp = prev; break; } } + spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); return req; } @@ -557,23 +564,24 @@ int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) } EXPORT_SYMBOL(inet_rtx_syn_ack); -void inet_csk_reqsk_queue_prune(struct sock *parent, - const unsigned long interval, - const unsigned long timeout, - const unsigned long max_rto) +static void reqsk_timer_handler(unsigned long data) { - struct inet_connection_sock *icsk = inet_csk(parent); + struct request_sock *req = (struct request_sock *)data; + struct sock *sk_listener = req->rsk_listener; + struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct listen_sock *lopt = queue->listen_opt; - int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; - int thresh = max_retries; - unsigned long now = jiffies; - struct request_sock **reqp, *req; - int i, budget; + int qlen, expire = 0, resend = 0; + int max_retries, thresh; + u8 defer_accept; - if (lopt == NULL || lopt->qlen == 0) + if (sk_listener->sk_state != TCP_LISTEN || !lopt) { + reqsk_put(req); return; + } + max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + thresh = max_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. * If synack was not acknowledged for 1 second, it means @@ -591,67 +599,65 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ - if (lopt->qlen>>(lopt->max_qlen_log-1)) { - int young = (lopt->qlen_young<<1); + qlen = listen_sock_qlen(lopt); + if (qlen >> (lopt->max_qlen_log - 1)) { + int young = listen_sock_young(lopt) << 1; while (thresh > 2) { - if (lopt->qlen < young) + if (qlen < young) break; thresh--; young <<= 1; } } + defer_accept = READ_ONCE(queue->rskq_defer_accept); + if (defer_accept) + max_retries = defer_accept; + syn_ack_recalc(req, thresh, max_retries, defer_accept, + &expire, &resend); + req->rsk_ops->syn_ack_timeout(req); + if (!expire && + (!resend || + !inet_rtx_syn_ack(sk_listener, req) || + inet_rsk(req)->acked)) { + unsigned long timeo; + + if (req->num_timeout++ == 0) + atomic_inc(&lopt->young_dec); + timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); + mod_timer_pinned(&req->rsk_timer, jiffies + timeo); + return; + } + inet_csk_reqsk_queue_drop(sk_listener, req); + reqsk_put(req); +} - if (queue->rskq_defer_accept) - max_retries = queue->rskq_defer_accept; - - budget = 2 * (lopt->nr_table_entries / (timeout / interval)); - i = lopt->clock_hand; - - do { - reqp=&lopt->syn_table[i]; - while ((req = *reqp) != NULL) { - if (time_after_eq(now, req->expires)) { - int expire = 0, resend = 0; - - syn_ack_recalc(req, thresh, max_retries, - queue->rskq_defer_accept, - &expire, &resend); - req->rsk_ops->syn_ack_timeout(parent, req); - if (!expire && - (!resend || - !inet_rtx_syn_ack(parent, req) || - inet_rsk(req)->acked)) { - unsigned long timeo; - - if (req->num_timeout++ == 0) - lopt->qlen_young--; - timeo = min(timeout << req->num_timeout, - max_rto); - req->expires = now + timeo; - reqp = &req->dl_next; - continue; - } - - /* Drop this request */ - inet_csk_reqsk_queue_unlink(parent, req, reqp); - reqsk_queue_removed(queue, req); - reqsk_free(req); - continue; - } - reqp = &req->dl_next; - } +void reqsk_queue_hash_req(struct request_sock_queue *queue, + u32 hash, struct request_sock *req, + unsigned long timeout) +{ + struct listen_sock *lopt = queue->listen_opt; - i = (i + 1) & (lopt->nr_table_entries - 1); + req->num_retrans = 0; + req->num_timeout = 0; + req->sk = NULL; - } while (--budget > 0); + /* before letting lookups find us, make sure all req fields + * are committed to memory and refcnt initialized. + */ + smp_wmb(); + atomic_set(&req->rsk_refcnt, 2); + setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); + req->rsk_hash = hash; - lopt->clock_hand = i; + spin_lock(&queue->syn_wait_lock); + req->dl_next = lopt->syn_table[hash]; + lopt->syn_table[hash] = req; + spin_unlock(&queue->syn_wait_lock); - if (lopt->qlen) - inet_csk_reset_keepalive_timer(parent, interval); + mod_timer_pinned(&req->rsk_timer, jiffies + timeout); } -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); +EXPORT_SYMBOL(reqsk_queue_hash_req); /** * inet_csk_clone_lock - clone an inet socket, and lock its clone @@ -679,6 +685,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, newsk->sk_write_space = sk_stream_write_space; newsk->sk_mark = inet_rsk(req)->ir_mark; + atomic64_set(&newsk->sk_cookie, + atomic64_read(&inet_rsk(req)->ir_cookie)); newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; @@ -785,8 +793,6 @@ void inet_csk_listen_stop(struct sock *sk) struct request_sock *acc_req; struct request_sock *req; - inet_csk_delete_keepalive_timer(sk); - /* make all the listen_opt local to us */ acc_req = reqsk_queue_yank_acceptq(queue); @@ -816,9 +822,9 @@ void inet_csk_listen_stop(struct sock *sk) percpu_counter_inc(sk->sk_prot->orphan_count); - if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) { + if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(tcp_sk(child)->fastopen_rsk != req); - BUG_ON(sk != tcp_rsk(req)->listener); + BUG_ON(sk != req->rsk_listener); /* Paranoid, to prevent race condition if * an inbound pkt destined for child is @@ -827,7 +833,6 @@ void inet_csk_listen_stop(struct sock *sk) * tcp_v4_destroy_sock(). */ tcp_sk(child)->fastopen_rsk = NULL; - sock_put(sk); } inet_csk_destroy_sock(child); @@ -836,7 +841,7 @@ void inet_csk_listen_stop(struct sock *sk) sock_put(child); sk_acceptq_removed(sk); - __reqsk_free(req); + reqsk_put(req); } if (queue->fastopenq != NULL) { /* Free all the reqs queued in rskq_rst_head. */ @@ -846,7 +851,7 @@ void inet_csk_listen_stop(struct sock *sk) spin_unlock_bh(&queue->fastopenq->lock); while ((req = acc_req) != NULL) { acc_req = req->dl_next; - __reqsk_free(req); + reqsk_put(req); } } WARN_ON(sk->sk_ack_backlog); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 592aff37366b..76322c9867d5 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -38,16 +38,12 @@ static const struct inet_diag_handler **inet_diag_table; struct inet_diag_entry { - __be32 *saddr; - __be32 *daddr; + const __be32 *saddr; + const __be32 *daddr; u16 sport; u16 dport; u16 family; u16 userlocks; -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */ - struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */ -#endif }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -65,12 +61,35 @@ static const struct inet_diag_handler *inet_diag_lock_handler(int proto) return inet_diag_table[proto]; } -static inline void inet_diag_unlock_handler( - const struct inet_diag_handler *handler) +static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) { mutex_unlock(&inet_diag_table_mutex); } +static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) +{ + r->idiag_family = sk->sk_family; + + r->id.idiag_sport = htons(sk->sk_num); + r->id.idiag_dport = sk->sk_dport; + r->id.idiag_if = sk->sk_bound_dev_if; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; + *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; + } else +#endif + { + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + + r->id.idiag_src[0] = sk->sk_rcv_saddr; + r->id.idiag_dst[0] = sk->sk_daddr; + } +} + static size_t inet_sk_attr_size(void) { return nla_total_size(sizeof(struct tcp_info)) @@ -86,21 +105,21 @@ static size_t inet_sk_attr_size(void) } int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, - struct sk_buff *skb, struct inet_diag_req_v2 *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) + struct sk_buff *skb, const struct inet_diag_req_v2 *req, + struct user_namespace *user_ns, + u32 portid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) { const struct inet_sock *inet = inet_sk(sk); + const struct inet_diag_handler *handler; + int ext = req->idiag_ext; struct inet_diag_msg *r; struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; - const struct inet_diag_handler *handler; - int ext = req->idiag_ext; handler = inet_diag_table[req->sdiag_protocol]; - BUG_ON(handler == NULL); + BUG_ON(!handler); nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlmsg_flags); @@ -108,25 +127,13 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, return -EMSGSIZE; r = nlmsg_data(nlh); - BUG_ON(sk->sk_state == TCP_TIME_WAIT); + BUG_ON(!sk_fullsock(sk)); - r->idiag_family = sk->sk_family; + inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; r->idiag_timer = 0; r->idiag_retrans = 0; - r->id.idiag_if = sk->sk_bound_dev_if; - sock_diag_save_cookie(sk, r->id.idiag_cookie); - - r->id.idiag_sport = inet->inet_sport; - r->id.idiag_dport = inet->inet_dport; - - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - - r->id.idiag_src[0] = inet->inet_rcv_saddr; - r->id.idiag_dst[0] = inet->inet_daddr; - if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; @@ -139,10 +146,6 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { - - *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; - if (ext & (1 << (INET_DIAG_TCLASS - 1))) if (nla_put_u8(skb, INET_DIAG_TCLASS, inet6_sk(sk)->tclass) < 0) @@ -169,7 +172,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) goto errout; - if (icsk == NULL) { + if (!icsk) { handler->idiag_get_info(sk, r, NULL); goto out; } @@ -227,23 +230,25 @@ errout: EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_csk_diag_fill(struct sock *sk, - struct sk_buff *skb, struct inet_diag_req_v2 *req, + struct sk_buff *skb, + const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { - return inet_sk_diag_fill(sk, inet_csk(sk), - skb, req, user_ns, portid, seq, nlmsg_flags, unlh); + return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, + user_ns, portid, seq, nlmsg_flags, unlh); } -static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, - struct sk_buff *skb, struct inet_diag_req_v2 *req, +static int inet_twsk_diag_fill(struct sock *sk, + struct sk_buff *skb, u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { - s32 tmo; + struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; + s32 tmo; nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), nlmsg_flags); @@ -257,21 +262,9 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, if (tmo < 0) tmo = 0; - r->idiag_family = tw->tw_family; + inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; - r->id.idiag_if = tw->tw_bound_dev_if; - sock_diag_save_cookie(tw, r->id.idiag_cookie); - - r->id.idiag_sport = tw->tw_sport; - r->id.idiag_dport = tw->tw_dport; - - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - - r->id.idiag_src[0] = tw->tw_rcv_saddr; - r->id.idiag_dst[0] = tw->tw_daddr; - r->idiag_state = tw->tw_substate; r->idiag_timer = 3; r->idiag_expires = jiffies_to_msecs(tmo); @@ -279,61 +272,91 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r->idiag_wqueue = 0; r->idiag_uid = 0; r->idiag_inode = 0; -#if IS_ENABLED(CONFIG_IPV6) - if (tw->tw_family == AF_INET6) { - *(struct in6_addr *)r->id.idiag_src = tw->tw_v6_rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = tw->tw_v6_daddr; - } -#endif + + nlmsg_end(skb, nlh); + return 0; +} + +static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, + u32 portid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), + nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = inet_reqsk(sk)->num_retrans; + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, sk_cookie)); + + tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; + r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0; + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; nlmsg_end(skb, nlh); return 0; } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, - struct inet_diag_req_v2 *r, + const struct inet_diag_req_v2 *r, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq, + return inet_twsk_diag_fill(sk, skb, portid, seq, nlmsg_flags, unlh); + if (sk->sk_state == TCP_NEW_SYN_RECV) + return inet_req_diag_fill(sk, skb, portid, seq, + nlmsg_flags, unlh); + return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq, nlmsg_flags, unlh); } -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb, - const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) +int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, + struct sk_buff *in_skb, + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *req) { - int err; - struct sock *sk; - struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); + struct sk_buff *rep; + struct sock *sk; + int err; err = -EINVAL; - if (req->sdiag_family == AF_INET) { + if (req->sdiag_family == AF_INET) sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); - } #if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) { + else if (req->sdiag_family == AF_INET6) sk = inet6_lookup(net, hashinfo, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if); - } #endif - else { + else goto out_nosk; - } err = -ENOENT; - if (sk == NULL) + if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); @@ -371,7 +394,7 @@ EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; int err; @@ -412,9 +435,8 @@ static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits) return 1; } - static int inet_diag_bc_run(const struct nlattr *_bc, - const struct inet_diag_entry *entry) + const struct inet_diag_entry *entry) { const void *bc = nla_data(_bc); int len = nla_len(_bc); @@ -446,10 +468,10 @@ static int inet_diag_bc_run(const struct nlattr *_bc, break; case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: { - struct inet_diag_hostcond *cond; - __be32 *addr; + const struct inet_diag_hostcond *cond; + const __be32 *addr; - cond = (struct inet_diag_hostcond *)(op + 1); + cond = (const struct inet_diag_hostcond *)(op + 1); if (cond->port != -1 && cond->port != (op->code == INET_DIAG_BC_S_COND ? entry->sport : entry->dport)) { @@ -498,29 +520,36 @@ static int inet_diag_bc_run(const struct nlattr *_bc, return len == 0; } +/* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV) + */ +static void entry_fill_addrs(struct inet_diag_entry *entry, + const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32; + entry->daddr = sk->sk_v6_daddr.s6_addr32; + } else +#endif + { + entry->saddr = &sk->sk_rcv_saddr; + entry->daddr = &sk->sk_daddr; + } +} + int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) { - struct inet_diag_entry entry; struct inet_sock *inet = inet_sk(sk); + struct inet_diag_entry entry; - if (bc == NULL) + if (!bc) return 1; entry.family = sk->sk_family; -#if IS_ENABLED(CONFIG_IPV6) - if (entry.family == AF_INET6) { - - entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32; - entry.daddr = sk->sk_v6_daddr.s6_addr32; - } else -#endif - { - entry.saddr = &inet->inet_rcv_saddr; - entry.daddr = &inet->inet_daddr; - } + entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); - entry.userlocks = sk->sk_userlocks; + entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; return inet_diag_bc_run(bc, &entry); } @@ -547,8 +576,8 @@ static int valid_cc(const void *bc, int len, int cc) static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) { - int addr_len; struct inet_diag_hostcond *cond; + int addr_len; /* Check hostcond space. */ *min_len += sizeof(struct inet_diag_hostcond); @@ -582,8 +611,8 @@ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, } /* Validate a port comparison operator. */ -static inline bool valid_port_comparison(const struct inet_diag_bc_op *op, - int len, int *min_len) +static bool valid_port_comparison(const struct inet_diag_bc_op *op, + int len, int *min_len) { /* Port comparisons put the port in a follow-on inet_diag_bc_op. */ *min_len += sizeof(struct inet_diag_bc_op); @@ -598,10 +627,9 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) int len = bytecode_len; while (len > 0) { - const struct inet_diag_bc_op *op = bc; int min_len = sizeof(struct inet_diag_bc_op); + const struct inet_diag_bc_op *op = bc; -//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); switch (op->code) { case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: @@ -642,7 +670,7 @@ static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) static int inet_csk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, + const struct inet_diag_req_v2 *r, const struct nlattr *bc) { if (!inet_diag_bc_sk(bc, sk)) @@ -654,139 +682,42 @@ static int inet_csk_diag_dump(struct sock *sk, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } -static int inet_twsk_diag_dump(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - struct inet_diag_req_v2 *r, - const struct nlattr *bc) +static void twsk_build_assert(void) { - struct inet_timewait_sock *tw = inet_twsk(sk); - - if (bc != NULL) { - struct inet_diag_entry entry; + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != + offsetof(struct sock, sk_family)); - entry.family = tw->tw_family; -#if IS_ENABLED(CONFIG_IPV6) - if (tw->tw_family == AF_INET6) { - entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32; - entry.daddr = tw->tw_v6_daddr.s6_addr32; - } else -#endif - { - entry.saddr = &tw->tw_rcv_saddr; - entry.daddr = &tw->tw_daddr; - } - entry.sport = tw->tw_num; - entry.dport = ntohs(tw->tw_dport); - entry.userlocks = 0; + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != + offsetof(struct inet_sock, inet_num)); - if (!inet_diag_bc_run(bc, &entry)) - return 0; - } + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != + offsetof(struct inet_sock, inet_dport)); - return inet_twsk_diag_fill(tw, skb, r, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); -} + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != + offsetof(struct inet_sock, inet_rcv_saddr)); -/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses - * from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6. - */ -static inline void inet_diag_req_addrs(const struct sock *sk, - const struct request_sock *req, - struct inet_diag_entry *entry) -{ - struct inet_request_sock *ireq = inet_rsk(req); + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != + offsetof(struct inet_sock, inet_daddr)); #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - if (req->rsk_ops->family == AF_INET6) { - entry->saddr = ireq->ir_v6_loc_addr.s6_addr32; - entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32; - } else if (req->rsk_ops->family == AF_INET) { - ipv6_addr_set_v4mapped(ireq->ir_loc_addr, - &entry->saddr_storage); - ipv6_addr_set_v4mapped(ireq->ir_rmt_addr, - &entry->daddr_storage); - entry->saddr = entry->saddr_storage.s6_addr32; - entry->daddr = entry->daddr_storage.s6_addr32; - } - } else -#endif - { - entry->saddr = &ireq->ir_loc_addr; - entry->daddr = &ireq->ir_rmt_addr; - } -} - -static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, - struct request_sock *req, - struct user_namespace *user_ns, - u32 portid, u32 seq, - const struct nlmsghdr *unlh) -{ - const struct inet_request_sock *ireq = inet_rsk(req); - struct inet_sock *inet = inet_sk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), - NLM_F_MULTI); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - r->idiag_family = sk->sk_family; - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = req->num_retrans; - - r->id.idiag_if = sk->sk_bound_dev_if; - sock_diag_save_cookie(req, r->id.idiag_cookie); + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != + offsetof(struct sock, sk_v6_rcv_saddr)); - tmo = req->expires - jiffies; - if (tmo < 0) - tmo = 0; - - r->id.idiag_sport = inet->inet_sport; - r->id.idiag_dport = ireq->ir_rmt_port; - - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - - r->id.idiag_src[0] = ireq->ir_loc_addr; - r->id.idiag_dst[0] = ireq->ir_rmt_addr; - - r->idiag_expires = jiffies_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); - r->idiag_inode = 0; -#if IS_ENABLED(CONFIG_IPV6) - if (r->idiag_family == AF_INET6) { - struct inet_diag_entry entry; - inet_diag_req_addrs(sk, req, &entry); - memcpy(r->id.idiag_src, entry.saddr, sizeof(struct in6_addr)); - memcpy(r->id.idiag_dst, entry.daddr, sizeof(struct in6_addr)); - } + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != + offsetof(struct sock, sk_v6_daddr)); #endif - - nlmsg_end(skb, nlh); - return 0; } static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, + const struct inet_diag_req_v2 *r, const struct nlattr *bc) { - struct inet_diag_entry entry; struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt; struct inet_sock *inet = inet_sk(sk); - int j, s_j; - int reqnum, s_reqnum; + struct inet_diag_entry entry; + int j, s_j, reqnum, s_reqnum; + struct listen_sock *lopt; int err = 0; s_j = cb->args[3]; @@ -797,13 +728,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.family = sk->sk_family; - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !lopt->qlen) + if (!lopt || !listen_sock_qlen(lopt)) goto out; - if (bc != NULL) { + if (bc) { entry.sport = inet->inet_num; entry.userlocks = sk->sk_userlocks; } @@ -822,17 +753,18 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, continue; if (bc) { - inet_diag_req_addrs(sk, req, &entry); + /* Note: entry.sport and entry.userlocks are already set */ + entry_fill_addrs(&entry, req_to_sk(req)); entry.dport = ntohs(ireq->ir_rmt_port); if (!inet_diag_bc_run(bc, &entry)) continue; } - err = inet_diag_fill_req(skb, sk, req, - sk_user_ns(NETLINK_CB(cb->skb).sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, cb->nlh); + err = inet_req_diag_fill(req_to_sk(req), skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb->nlh); if (err < 0) { cb->args[3] = j + 1; cb->args[4] = reqnum; @@ -844,17 +776,17 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, } out: - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); return err; } void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, - struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc) + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, struct nlattr *bc) { - int i, num; - int s_i, s_num; struct net *net = sock_net(skb->sk); + int i, num, s_i, s_num; s_i = cb->args[1]; s_num = num = cb->args[2]; @@ -864,9 +796,9 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, goto skip_listen_ht; for (i = s_i; i < INET_LHTABLE_SIZE; i++) { - struct sock *sk; - struct hlist_nulls_node *node; struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + struct sock *sk; num = 0; ilb = &hashinfo->listening_hash[i]; @@ -883,7 +815,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, } if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) + sk->sk_family != r->sdiag_family) goto next_listen; if (r->id.idiag_sport != inet->inet_sport && @@ -931,8 +863,8 @@ skip_listen_ht: for (i = s_i; i <= hashinfo->ehash_mask; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; spinlock_t *lock = inet_ehash_lockp(hashinfo, i); - struct sock *sk; struct hlist_nulls_node *node; + struct sock *sk; num = 0; @@ -944,8 +876,7 @@ skip_listen_ht: spin_lock_bh(lock); sk_nulls_for_each(sk, node, &head->chain) { - int res; - int state; + int state, res; if (!net_eq(sock_net(sk), net)) continue; @@ -964,10 +895,16 @@ skip_listen_ht: if (r->id.idiag_dport != sk->sk_dport && r->id.idiag_dport) goto next_normal; - if (sk->sk_state == TCP_TIME_WAIT) - res = inet_twsk_diag_dump(sk, skb, cb, r, bc); - else - res = inet_csk_diag_dump(sk, skb, cb, r, bc); + twsk_build_assert(); + + if (!inet_diag_bc_sk(bc, sk)) + goto next_normal; + + res = sk_diag_fill(sk, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh); if (res < 0) { spin_unlock_bh(lock); goto done; @@ -988,7 +925,8 @@ out: EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r, + struct nlattr *bc) { const struct inet_diag_handler *handler; int err = 0; @@ -1005,8 +943,8 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct nlattr *bc = NULL; int hdrlen = sizeof(struct inet_diag_req_v2); + struct nlattr *bc = NULL; if (nlmsg_attrlen(cb->nlh, hdrlen)) bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE); @@ -1014,7 +952,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc); } -static inline int inet_diag_type2proto(int type) +static int inet_diag_type2proto(int type) { switch (type) { case TCPDIAG_GETSOCK: @@ -1026,12 +964,13 @@ static inline int inet_diag_type2proto(int type) } } -static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) +static int inet_diag_dump_compat(struct sk_buff *skb, + struct netlink_callback *cb) { struct inet_diag_req *rc = nlmsg_data(cb->nlh); + int hdrlen = sizeof(struct inet_diag_req); struct inet_diag_req_v2 req; struct nlattr *bc = NULL; - int hdrlen = sizeof(struct inet_diag_req); req.sdiag_family = AF_UNSPEC; /* compatibility */ req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); @@ -1046,7 +985,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *c } static int inet_diag_get_exact_compat(struct sk_buff *in_skb, - const struct nlmsghdr *nlh) + const struct nlmsghdr *nlh) { struct inet_diag_req *rc = nlmsg_data(nlh); struct inet_diag_req_v2 req; @@ -1075,7 +1014,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) attr = nlmsg_find_attr(nlh, hdrlen, INET_DIAG_REQ_BYTECODE); - if (attr == NULL || + if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op) || inet_diag_bc_audit(nla_data(attr), nla_len(attr))) return -EINVAL; @@ -1102,9 +1041,10 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) if (h->nlmsg_flags & NLM_F_DUMP) { if (nlmsg_attrlen(h, hdrlen)) { struct nlattr *attr; + attr = nlmsg_find_attr(h, hdrlen, INET_DIAG_REQ_BYTECODE); - if (attr == NULL || + if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op) || inet_diag_bc_audit(nla_data(attr), nla_len(attr))) return -EINVAL; @@ -1140,7 +1080,7 @@ int inet_diag_register(const struct inet_diag_handler *h) mutex_lock(&inet_diag_table_mutex); err = -EEXIST; - if (inet_diag_table[type] == NULL) { + if (!inet_diag_table[type]) { inet_diag_table[type] = h; err = 0; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9111a4e22155..0fb841b9d834 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -24,9 +24,9 @@ #include <net/secure_seq.h> #include <net/ip.h> -static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +static u32 inet_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 inet_ehash_secret __read_mostly; @@ -36,17 +36,21 @@ static unsigned int inet_ehashfn(struct net *net, const __be32 laddr, inet_ehash_secret + net_hash_mix(net)); } - -static unsigned int inet_sk_ehashfn(const struct sock *sk) +/* This function handles inet_sock, but also timewait and request sockets + * for IPv4/IPv6. + */ +u32 sk_ehashfn(const struct sock *sk) { - const struct inet_sock *inet = inet_sk(sk); - const __be32 laddr = inet->inet_rcv_saddr; - const __u16 lport = inet->inet_num; - const __be32 faddr = inet->inet_daddr; - const __be16 fport = inet->inet_dport; - struct net *net = sock_net(sk); - - return inet_ehashfn(net, laddr, lport, faddr, fport); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) + return inet6_ehashfn(sock_net(sk), + &sk->sk_v6_rcv_saddr, sk->sk_num, + &sk->sk_v6_daddr, sk->sk_dport); +#endif + return inet_ehashfn(sock_net(sk), + sk->sk_rcv_saddr, sk->sk_num, + sk->sk_daddr, sk->sk_dport); } /* @@ -61,7 +65,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb != NULL) { - write_pnet(&tb->ib_net, hold_net(net)); + write_pnet(&tb->ib_net, net); tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; @@ -79,7 +83,6 @@ void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); - release_net(ib_net(tb)); kmem_cache_free(cachep, tb); } } @@ -263,11 +266,19 @@ void sock_gen_put(struct sock *sk) if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_free(inet_twsk(sk)); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + reqsk_free(inet_reqsk(sk)); else sk_free(sk); } EXPORT_SYMBOL_GPL(sock_gen_put); +void sock_edemux(struct sk_buff *skb) +{ + sock_gen_put(skb->sk); +} +EXPORT_SYMBOL(sock_edemux); + struct sock *__inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, @@ -400,13 +411,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; - spinlock_t *lock; struct inet_ehash_bucket *head; + spinlock_t *lock; int twrefcnt = 0; WARN_ON(!sk_unhashed(sk)); - sk->sk_hash = inet_sk_ehashfn(sk); + sk->sk_hash = sk_ehashfn(sk); head = inet_ehash_bucket(hashinfo, sk->sk_hash); list = &head->chain; lock = inet_ehash_lockp(hashinfo, sk->sk_hash); @@ -423,15 +434,13 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); -static void __inet_hash(struct sock *sk) +int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; - if (sk->sk_state != TCP_LISTEN) { - __inet_hash_nolisten(sk, NULL); - return; - } + if (sk->sk_state != TCP_LISTEN) + return __inet_hash_nolisten(sk, tw); WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -440,13 +449,15 @@ static void __inet_hash(struct sock *sk) __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); spin_unlock(&ilb->lock); + return 0; } +EXPORT_SYMBOL(__inet_hash); void inet_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk); + __inet_hash(sk, NULL); local_bh_enable(); } } @@ -477,8 +488,7 @@ EXPORT_SYMBOL_GPL(inet_unhash); int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u32 port_offset, int (*check_established)(struct inet_timewait_death_row *, - struct sock *, __u16, struct inet_timewait_sock **), - int (*hash)(struct sock *sk, struct inet_timewait_sock *twp)) + struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; const unsigned short snum = inet_sk(sk)->inet_num; @@ -548,7 +558,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - twrefcnt += hash(sk, tw); + twrefcnt += __inet_hash_nolisten(sk, tw); } if (tw) twrefcnt += inet_twsk_bind_unhash(tw, hinfo); @@ -570,7 +580,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - hash(sk, NULL); + __inet_hash_nolisten(sk, NULL); spin_unlock_bh(&head->lock); return 0; } else { @@ -590,7 +600,7 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk), - __inet_check_established, __inet_hash_nolisten); + __inet_check_established); } EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 6d592f8555fb..f38e387448fb 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -98,7 +98,6 @@ void inet_twsk_free(struct inet_timewait_sock *tw) #ifdef SOCK_REFCNT_DEBUG pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); #endif - release_net(twsk_net(tw)); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } @@ -195,7 +194,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_ipv6only = 0; tw->tw_transparent = inet->transparent; tw->tw_prot = sk->sk_prot_creator; - twsk_net_set(tw, hold_net(sock_net(sk))); + atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); + twsk_net_set(tw, sock_net(sk)); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this @@ -487,6 +487,7 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; restart_rcu: + cond_resched(); rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6207275fc749..1060ca0bc23a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -456,6 +456,7 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_do_ioctl = ipgre_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, }; #define GRE_FEATURES (NETIF_F_SG | \ @@ -621,10 +622,10 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); if (data[IFLA_GRE_LOCAL]) - parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]); + parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]); if (data[IFLA_GRE_REMOTE]) - parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]); + parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]); if (data[IFLA_GRE_TTL]) parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); @@ -686,6 +687,7 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, }; static void ipgre_tap_setup(struct net_device *dev) @@ -776,8 +778,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || - nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) || - nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) || + nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || + nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) || nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || nla_put_u8(skb, IFLA_GRE_PMTUDISC, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a7aea2048a0d..8259e777b249 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -148,7 +148,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - ip_select_ident(skb, sk); + ip_select_ident(sock_net(sk), skb, sk); if (opt && opt->opt.optlen) { iph->ihl += opt->opt.optlen>>2; @@ -430,7 +430,8 @@ packet_routed: ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0); } - ip_select_ident_segs(skb, sk, skb_shinfo(skb)->gso_segs ?: 1); + ip_select_ident_segs(sock_net(sk), skb, sk, + skb_shinfo(skb)->gso_segs ?: 1); /* TODO : should we use skb->sk here instead of sk ? */ skb->priority = sk->sk_priority; @@ -636,10 +637,7 @@ slow_path: left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ - /* for bridged IP traffic encapsulated inside f.e. a vlan header, - * we need to make room for the encapsulating header - */ - ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb)); + ll_rs = LL_RESERVED_SPACE(rt->dst.dev); /* * Fragment the datagram. @@ -1382,7 +1380,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); - ip_select_ident(skb, sk); + ip_select_ident(net, skb, sk); if (opt) { iph->ihl += opt->optlen>>2; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5cd99271d3a6..f6a0d54b308a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -536,12 +536,34 @@ out: * Socket option code for IP. This is the end of the line after any * TCP,UDP etc options on an IP socket. */ +static bool setsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IP_ADD_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_BLOCK_SOURCE: + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case IP_MSFILTER: + case IP_UNBLOCK_SOURCE: + case MCAST_BLOCK_SOURCE: + case MCAST_MSFILTER: + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_UNBLOCK_SOURCE: + return true; + } + return false; +} static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); int val = 0, err; + bool needs_rtnl = setsockopt_needs_rtnl(optname); switch (optname) { case IP_PKTINFO: @@ -584,6 +606,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, return ip_mroute_setsockopt(sk, optname, optval, optlen); err = 0; + if (needs_rtnl) + rtnl_lock(); lock_sock(sk); switch (optname) { @@ -1118,10 +1142,14 @@ mc_msf_out: break; } release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return err; e_inval: release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); return -EINVAL; } diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 2cd08280c77b..4bb7252110a6 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -389,7 +389,6 @@ static int ip_tunnel_bind_dev(struct net_device *dev) hlen = tdev->hard_header_len + tdev->needed_headroom; mtu = tdev->mtu; } - dev->iflink = tunnel->parms.link; dev->needed_headroom = t_hlen + hlen; mtu -= (dev->hard_header_len + t_hlen); @@ -980,6 +979,14 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev) } EXPORT_SYMBOL(ip_tunnel_get_link_net); +int ip_tunnel_get_iflink(const struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + + return tunnel->parms.link; +} +EXPORT_SYMBOL(ip_tunnel_get_iflink); + int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 88c386cf7d85..8c4dcc46acd2 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -74,7 +74,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, iph->daddr = dst; iph->saddr = src; iph->ttl = ttl; - __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); + __ip_select_ident(sock_net(sk), iph, skb_shinfo(skb)->gso_segs ?: 1); err = ip_local_out_sk(sk, skb); if (unlikely(net_xmit_eval(err))) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 94efe148181c..c4f93c0d1104 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -341,6 +341,7 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_do_ioctl = vti_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, }; static void vti_tunnel_setup(struct net_device *dev) @@ -361,7 +362,6 @@ static int vti_tunnel_init(struct net_device *dev) dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); dev->mtu = ETH_DATA_LEN; dev->flags = IFF_NOARP; - dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; netif_keep_dst(dev); @@ -456,10 +456,10 @@ static void vti_netlink_parms(struct nlattr *data[], parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); if (data[IFLA_VTI_LOCAL]) - parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]); + parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) - parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]); + parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]); } @@ -505,8 +505,8 @@ static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u32(skb, IFLA_VTI_LINK, p->link); nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key); nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key); - nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr); - nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr); + nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr); + nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr); return 0; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 915d215a7d14..5c81f6e40842 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -272,6 +272,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_do_ioctl = ipip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, }; #define IPIP_FEATURES (NETIF_F_SG | \ @@ -286,7 +287,6 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; - dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; netif_keep_dst(dev); @@ -325,10 +325,10 @@ static void ipip_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) - parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); + parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) - parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); + parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) { parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); @@ -450,8 +450,8 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || - nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || - nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || + nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || + nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 92825443fad6..3ef30cf57f4a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -73,9 +73,7 @@ struct mr_table { struct list_head list; -#ifdef CONFIG_NET_NS - struct net *net; -#endif + possible_net_t net; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; @@ -475,8 +473,14 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +static int reg_vif_get_iflink(const struct net_device *dev) +{ + return 0; +} + static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, + .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) @@ -511,7 +515,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) free_netdev(dev); return NULL; } - dev->iflink = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -803,7 +806,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, v->pkt_out = 0; v->link = dev->ifindex; if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) - v->link = dev->iflink; + v->link = dev_get_iflink(dev); /* And finish update writing critical data */ write_lock_bh(&mrt_lock); @@ -1644,7 +1647,8 @@ static struct notifier_block ip_mr_notifier = { * important for multicast video. */ -static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) +static void ip_encap(struct net *net, struct sk_buff *skb, + __be32 saddr, __be32 daddr) { struct iphdr *iph; const struct iphdr *old_iph = ip_hdr(skb); @@ -1663,7 +1667,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; iph->tot_len = htons(skb->len); - ip_select_ident(skb, NULL); + ip_select_ident(net, skb, NULL); ip_send_check(iph); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); @@ -1760,7 +1764,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * What do we do with netfilter? -- RR */ if (vif->flags & VIFF_TUNNEL) { - ip_encap(skb, vif->local, vif->remote); + ip_encap(net, skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ vif->dev->stats.tx_packets++; vif->dev->stats.tx_bytes += skb->len; @@ -2282,8 +2286,8 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; - if (nla_put_be32(skb, RTA_SRC, c->mfc_origin) || - nla_put_be32(skb, RTA_DST, c->mfc_mcastgrp)) + if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) || + nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp)) goto nla_put_failure; err = __ipmr_fill_mroute(mrt, skb, c, rtm); /* do not break the dump if cache is unresolved */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 59f883d9cadf..fb20f363151f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -36,24 +36,16 @@ config NF_CONNTRACK_PROC_COMPAT If unsure, say Y. -config NF_LOG_ARP - tristate "ARP packet logging" - default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON - -config NF_LOG_IPV4 - tristate "IPv4 packet logging" - default m if NETFILTER_ADVANCED=n - select NF_LOG_COMMON +if NF_TABLES config NF_TABLES_IPV4 - depends on NF_TABLES tristate "IPv4 nf_tables support" help This option enables the IPv4 support for nf_tables. +if NF_TABLES_IPV4 + config NFT_CHAIN_ROUTE_IPV4 - depends on NF_TABLES_IPV4 tristate "IPv4 nf_tables route chain support" help This option enables the "route" chain for IPv4 in nf_tables. This @@ -61,22 +53,34 @@ config NFT_CHAIN_ROUTE_IPV4 fields such as the source, destination, type of service and the packet mark. -config NF_REJECT_IPV4 - tristate "IPv4 packet rejection" - default m if NETFILTER_ADVANCED=n - config NFT_REJECT_IPV4 - depends on NF_TABLES_IPV4 select NF_REJECT_IPV4 default NFT_REJECT tristate +endif # NF_TABLES_IPV4 + config NF_TABLES_ARP - depends on NF_TABLES tristate "ARP nf_tables support" help This option enables the ARP support for nf_tables. +endif # NF_TABLES + +config NF_LOG_ARP + tristate "ARP packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + +config NF_LOG_IPV4 + tristate "IPv4 packet logging" + default m if NETFILTER_ADVANCED=n + select NF_LOG_COMMON + +config NF_REJECT_IPV4 + tristate "IPv4 packet rejection" + default m if NETFILTER_ADVANCED=n + config NF_NAT_IPV4 tristate "IPv4 NAT" depends on NF_CONNTRACK_IPV4 diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index e90f83a3415b..f75e9df5e017 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -418,6 +418,13 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); + + if (!par->net->xt.clusterip_deprecated_warning) { + pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " + "use xt_cluster instead\n"); + par->net->xt.clusterip_deprecated_warning = true; + } + return ret; } diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 8f48f5517e33..87907d4bd259 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -34,31 +34,32 @@ static unsigned int reject_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_reject_info *reject = par->targinfo; + int hook = par->hooknum; switch (reject->with) { case IPT_ICMP_NET_UNREACHABLE: - nf_send_unreach(skb, ICMP_NET_UNREACH); + nf_send_unreach(skb, ICMP_NET_UNREACH, hook); break; case IPT_ICMP_HOST_UNREACHABLE: - nf_send_unreach(skb, ICMP_HOST_UNREACH); + nf_send_unreach(skb, ICMP_HOST_UNREACH, hook); break; case IPT_ICMP_PROT_UNREACHABLE: - nf_send_unreach(skb, ICMP_PROT_UNREACH); + nf_send_unreach(skb, ICMP_PROT_UNREACH, hook); break; case IPT_ICMP_PORT_UNREACHABLE: - nf_send_unreach(skb, ICMP_PORT_UNREACH); + nf_send_unreach(skb, ICMP_PORT_UNREACH, hook); break; case IPT_ICMP_NET_PROHIBITED: - nf_send_unreach(skb, ICMP_NET_ANO); + nf_send_unreach(skb, ICMP_NET_ANO, hook); break; case IPT_ICMP_HOST_PROHIBITED: - nf_send_unreach(skb, ICMP_HOST_ANO); + nf_send_unreach(skb, ICMP_HOST_ANO, hook); break; case IPT_ICMP_ADMIN_PROHIBITED: - nf_send_unreach(skb, ICMP_PKT_FILTERED); + nf_send_unreach(skb, ICMP_PKT_FILTERED, hook); break; case IPT_TCP_RESET: - nf_send_reset(skb, par->hooknum); + nf_send_reset(skb, hook); case IPT_ICMP_ECHOREPLY: /* Doesn't happen. */ break; diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5c61328b7704..8c8d6642cbb0 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -322,8 +322,8 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) static int ipv4_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple) { - if (nla_put_be32(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || - nla_put_be32(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) + if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) || + nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip)) goto nla_put_failure; return 0; @@ -342,8 +342,8 @@ static int ipv4_nlattr_to_tuple(struct nlattr *tb[], if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) return -EINVAL; - t->src.u3.ip = nla_get_be32(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_be32(tb[CTA_IP_V4_DST]); + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); return 0; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index a460a87e14f8..f0dfe92a00d6 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -300,7 +300,9 @@ static int exp_seq_show(struct seq_file *s, void *v) __nf_ct_l3proto_find(exp->tuple.src.l3num), __nf_ct_l4proto_find(exp->tuple.src.l3num, exp->tuple.dst.protonum)); - return seq_putc(s, '\n'); + seq_putc(s, '\n'); + + return 0; } static const struct seq_operations exp_seq_ops = { diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index d059182c1466..e7ad950cf9ef 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -10,8 +10,10 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> @@ -27,7 +29,7 @@ static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { - .level = 5, + .level = LOGLEVEL_NOTICE, .logflags = NF_LOG_MASK, }, }, diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 75101980eeee..076aadda0473 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -5,8 +5,10 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/skbuff.h> @@ -26,7 +28,7 @@ static struct nf_loginfo default_loginfo = { .type = NF_LOG_TYPE_LOG, .u = { .log = { - .level = 5, + .level = LOGLEVEL_NOTICE, .logflags = NF_LOG_MASK, }, }, diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 536da7bc598a..c5b794da51a9 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - __be16 protocol, int ttl) + __u8 protocol, int ttl) { struct iphdr *niph, *oiph = ip_hdr(oldskb); @@ -164,4 +164,27 @@ void nf_send_reset(struct sk_buff *oldskb, int hook) } EXPORT_SYMBOL_GPL(nf_send_reset); +void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) +{ + struct iphdr *iph = ip_hdr(skb_in); + u8 proto; + + if (skb_in->csum_bad || iph->frag_off & htons(IP_OFFSET)) + return; + + if (skb_csum_unnecessary(skb_in)) { + icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); + return; + } + + if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP) + proto = iph->protocol; + else + proto = 0; + + if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0) + icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); +} +EXPORT_SYMBOL_GPL(nf_send_unreach); + MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c index d729542bd1b7..16a5d4d73d75 100644 --- a/net/ipv4/netfilter/nft_reject_ipv4.c +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -27,7 +27,8 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr, switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: - nf_send_unreach(pkt->skb, priv->icmp_code); + nf_send_unreach(pkt->skb, priv->icmp_code, + pkt->ops->hooknum); break; case NFT_REJECT_TCP_RST: nf_send_reset(pkt->skb, pkt->ops->hooknum); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 208d5439e59b..344e7cdfb8d4 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -64,11 +64,11 @@ EXPORT_SYMBOL_GPL(pingv6_ops); static u16 ping_port_rover; -static inline int ping_hashfn(struct net *net, unsigned int num, unsigned int mask) +static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { - int res = (num + net_hash_mix(net)) & mask; + u32 res = (num + net_hash_mix(net)) & mask; - pr_debug("hash(%d) = %d\n", num, res); + pr_debug("hash(%u) = %u\n", num, res); return res; } EXPORT_SYMBOL_GPL(ping_hash); @@ -692,8 +692,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, } EXPORT_SYMBOL_GPL(ping_common_sendmsg); -static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct net *net = sock_net(sk); struct flowi4 fl4; @@ -849,8 +848,8 @@ do_confirm: goto out; } -int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) { struct inet_sock *isk = inet_sk(sk); int family = sk->sk_family; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index f027a708b7e0..56946f47d446 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -404,7 +404,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, iph->check = 0; iph->tot_len = htons(length); if (!iph->id) - ip_select_ident(skb, NULL); + ip_select_ident(net, skb, NULL); iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } @@ -481,8 +481,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } -static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; @@ -709,8 +708,8 @@ out: return ret; * we return it, otherwise we block. */ -static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ad5064362c5c..652b92ebd7ba 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -152,7 +152,6 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, static struct dst_ops ipv4_dst_ops = { .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, @@ -483,7 +482,7 @@ u32 ip_idents_reserve(u32 hash, int segs) } EXPORT_SYMBOL(ip_idents_reserve); -void __ip_select_ident(struct iphdr *iph, int segs) +void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { static u32 ip_idents_hashrnd __read_mostly; u32 hash, id; @@ -492,7 +491,7 @@ void __ip_select_ident(struct iphdr *iph, int segs) hash = jhash_3words((__force u32)iph->daddr, (__force u32)iph->saddr, - iph->protocol, + iph->protocol ^ net_hash_mix(net), ip_idents_hashrnd); id = ip_idents_reserve(hash, segs); iph->id = htons(id); @@ -2225,7 +2224,6 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), .check = ipv4_blackhole_dst_check, .mtu = ipv4_blackhole_mtu, .default_advmss = ipv4_default_advmss, @@ -2321,11 +2319,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (IPCB(skb)->flags & IPSKB_DOREDIRECT) r->rtm_flags |= RTCF_DOREDIRECT; - if (nla_put_be32(skb, RTA_DST, dst)) + if (nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (src) { r->rtm_src_len = 32; - if (nla_put_be32(skb, RTA_SRC, src)) + if (nla_put_in_addr(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && @@ -2338,11 +2336,11 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, #endif if (!rt_is_input_route(rt) && fl4->saddr != src) { - if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) + if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } if (rt->rt_uses_gateway && - nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; expires = rt->dst.expires; @@ -2438,8 +2436,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) ip_hdr(skb)->protocol = IPPROTO_ICMP; skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); - src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; - dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; + src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; + dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 45fe60c5238e..df849e5a10f1 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -219,19 +219,20 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, } EXPORT_SYMBOL_GPL(__cookie_v4_check); -static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst) +static struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); - if (child) + if (child) { + atomic_set(&req->rsk_refcnt, 1); inet_csk_reqsk_queue_add(sk, req, child); - else + } else { reqsk_free(req); - + } return child; } @@ -325,7 +326,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; ret = NULL; - req = inet_reqsk_alloc(&tcp_request_sock_ops); /* for safety */ + req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */ if (!req) goto out; @@ -336,8 +337,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); ireq->ir_mark = inet_request_mark(sk, skb); ireq->snd_wscale = tcp_opt.snd_wscale; ireq->sack_ok = tcp_opt.sack_ok; @@ -345,7 +346,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->tstamp_ok = tcp_opt.saw_tstamp; req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->snt_synack = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; - treq->listener = NULL; + treq->tfo_listener = false; + + ireq->ir_iif = sk->sk_bound_dev_if; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) @@ -357,7 +360,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) goto out; } - req->expires = 0UL; req->num_retrans = 0; /* diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d151539da8e6..fdf899163d44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -883,6 +883,20 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_probe_threshold", + .data = &init_net.ipv4.sysctl_tcp_probe_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "tcp_probe_interval", + .data = &init_net.ipv4.sysctl_tcp_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 995a2259bcfc..dbd51cefaf02 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1060,8 +1060,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, return err; } -int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t size) +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1539,8 +1538,8 @@ EXPORT_SYMBOL(tcp_read_sock); * Probably, code can be easily improved even more. */ -int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int nonblock, int flags, int *addr_len) +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, + int flags, int *addr_len) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; @@ -1914,18 +1913,19 @@ EXPORT_SYMBOL_GPL(tcp_set_state); static const unsigned char new_state[16] = { /* current state: new state: action: */ - /* (Invalid) */ TCP_CLOSE, - /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_SYN_SENT */ TCP_CLOSE, - /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, - /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, - /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, - /* TCP_TIME_WAIT */ TCP_CLOSE, - /* TCP_CLOSE */ TCP_CLOSE, - /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, - /* TCP_LAST_ACK */ TCP_LAST_ACK, - /* TCP_LISTEN */ TCP_CLOSE, - /* TCP_CLOSING */ TCP_CLOSING, + [0 /* (Invalid) */] = TCP_CLOSE, + [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_SYN_SENT] = TCP_CLOSE, + [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, + [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, + [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, + [TCP_TIME_WAIT] = TCP_CLOSE, + [TCP_CLOSE] = TCP_CLOSE, + [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, + [TCP_LAST_ACK] = TCP_LAST_ACK, + [TCP_LISTEN] = TCP_CLOSE, + [TCP_CLOSING] = TCP_CLOSING, + [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) @@ -3001,12 +3001,11 @@ static void __init tcp_init_mem(void) void __init tcp_init(void) { - struct sk_buff *skb = NULL; unsigned long limit; int max_rshare, max_wshare, cnt; unsigned int i; - BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); + sock_skb_cb_check_size(sizeof(struct tcp_skb_cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 62856e185a93..7a5ae50c80c8 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -83,7 +83,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); - pr_info("%s registered\n", ca->name); + pr_debug("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 0d73f9ddb55b..86dc119a3815 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -34,13 +34,13 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r, struct nlattr *bc) { inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc); } static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + const struct inet_diag_req_v2 *req) { return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req); } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index ea82fd492c1b..2eb887ec0ce3 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -155,12 +155,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, tp = tcp_sk(child); tp->fastopen_rsk = req; - /* Do a hold on the listner sk so that if the listener is being - * closed, the child that has been accepted can live on and still - * access listen_lock. - */ - sock_hold(sk); - tcp_rsk(req)->listener = sk; + tcp_rsk(req)->tfo_listener = true; /* RFC1323: The window in SYN & SYN/ACK segments is never * scaled. So correct it appropriately. @@ -174,6 +169,7 @@ static bool tcp_fastopen_create_child(struct sock *sk, inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT, TCP_RTO_MAX); + atomic_set(&req->rsk_refcnt, 1); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, child); @@ -221,7 +217,6 @@ static bool tcp_fastopen_create_child(struct sock *sk, WARN_ON(req->sk == NULL); return true; } -EXPORT_SYMBOL(tcp_fastopen_create_child); static bool tcp_fastopen_queue_check(struct sock *sk) { @@ -245,7 +240,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk) struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; - if ((req1 == NULL) || time_after(req1->expires, jiffies)) { + if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) { spin_unlock(&fastopenq->lock); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); @@ -254,7 +249,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk) fastopenq->rskq_rst_head = req1->dl_next; fastopenq->qlen--; spin_unlock(&fastopenq->lock); - reqsk_free(req1); + reqsk_put(req1); } return true; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb4cf8b8e121..18b80e8bc533 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3321,6 +3321,36 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } +/* Return true if we're currently rate-limiting out-of-window ACKs and + * thus shouldn't send a dupack right now. We rate-limit dupacks in + * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS + * attacks that send repeated SYNs or ACKs for the same connection. To + * do this, we do not send a duplicate SYNACK or ACK if the remote + * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. + */ +bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, + int mib_idx, u32 *last_oow_ack_time) +{ + /* Data packets without SYNs are not likely part of an ACK loop. */ + if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && + !tcp_hdr(skb)->syn) + goto not_rate_limited; + + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + +not_rate_limited: + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* RFC 5961 7 [ACK Throttling] */ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) { @@ -5664,7 +5694,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (tcp_check_req(sk, skb, req, NULL, true) == NULL) + if (tcp_check_req(sk, skb, req, true) == NULL) goto discard; } @@ -5912,6 +5942,80 @@ static void tcp_ecn_create_request(struct request_sock *req, inet_rsk(req)->ecn_ok = 1; } +static void tcp_openreq_init(struct request_sock *req, + const struct tcp_options_received *rx_opt, + struct sk_buff *skb, const struct sock *sk) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->cookie_ts = 0; + tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; + tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_rsk(req)->last_oow_ack_time = 0; + req->mss = rx_opt->mss_clamp; + req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; + ireq->tstamp_ok = rx_opt->tstamp_ok; + ireq->sack_ok = rx_opt->sack_ok; + ireq->snd_wscale = rx_opt->snd_wscale; + ireq->wscale_ok = rx_opt->wscale_ok; + ireq->acked = 0; + ireq->ecn_ok = 0; + ireq->ir_rmt_port = tcp_hdr(skb)->source; + ireq->ir_num = ntohs(tcp_hdr(skb)->dest); + ireq->ir_mark = inet_request_mark(sk, skb); +} + +struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener) +{ + struct request_sock *req = reqsk_alloc(ops, sk_listener); + + if (req) { + struct inet_request_sock *ireq = inet_rsk(req); + + kmemcheck_annotate_bitfield(ireq, flags); + ireq->opt = NULL; + atomic64_set(&ireq->ir_cookie, 0); + ireq->ireq_state = TCP_NEW_SYN_RECV; + write_pnet(&ireq->ireq_net, sock_net(sk_listener)); + ireq->ireq_family = sk_listener->sk_family; + } + + return req; +} +EXPORT_SYMBOL(inet_reqsk_alloc); + +/* + * Return true if a syncookie should be sent + */ +static bool tcp_syn_flood_action(struct sock *sk, + const struct sk_buff *skb, + const char *proto) +{ + const char *msg = "Dropping request"; + bool want_cookie = false; + struct listen_sock *lopt; + +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) { + msg = "Sending cookies"; + want_cookie = true; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); + } else +#endif + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); + + lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; + if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { + lopt->synflood_warned = 1; + pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", + proto, ntohs(tcp_hdr(skb)->dest), msg); + } + return want_cookie; +} + int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb) @@ -5949,7 +6053,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop; } - req = inet_reqsk_alloc(rsk_ops); + req = inet_reqsk_alloc(rsk_ops, sk); if (!req) goto drop; @@ -5966,6 +6070,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb, sk); + /* Note: tcp_v6_init_req() might override ir_iif for link locals */ + inet_rsk(req)->ir_iif = sk->sk_bound_dev_if; + af_ops->init_req(req, sk, skb); if (security_inet_conn_request(sk, skb, req)) @@ -6038,7 +6145,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (err || want_cookie) goto drop_and_free; - tcp_rsk(req)->listener = NULL; + tcp_rsk(req)->tfo_listener = false; af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f1756ee02207..69f9cf684744 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -189,7 +189,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; - inet->inet_rcv_saddr = inet->inet_saddr; + sk_rcv_saddr_set(sk, inet->inet_saddr); if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ @@ -204,7 +204,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; - inet->inet_daddr = daddr; + sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) @@ -310,6 +310,34 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) dst->ops->redirect(dst, sk, skb); } + +/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ +void tcp_req_err(struct sock *sk, u32 seq) +{ + struct request_sock *req = inet_reqsk(sk); + struct net *net = sock_net(sk); + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + WARN_ON(req->sk); + + if (seq != tcp_rsk(req)->snt_isn) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + reqsk_put(req); + } else { + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + } +} +EXPORT_SYMBOL(tcp_req_err); + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -343,8 +371,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) int err; struct net *net = dev_net(icmp_skb->dev); - sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, - iph->saddr, th->source, inet_iif(icmp_skb)); + sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, + th->dest, iph->saddr, ntohs(th->source), + inet_iif(icmp_skb)); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; @@ -353,6 +382,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) inet_twsk_put(inet_twsk(sk)); return; } + seq = ntohl(th->seq); + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_err(sk, seq); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -374,7 +406,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); - seq = ntohl(th->seq); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = tp->fastopen_rsk; snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; @@ -458,36 +489,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } switch (sk->sk_state) { - struct request_sock *req, **prev; - case TCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - - req = inet_csk_search_req(sk, &prev, th->dest, - iph->daddr, iph->saddr); - if (!req) - goto out; - - /* ICMPs are not backlogged, hence we cannot get - an established socket here. - */ - WARN_ON(req->sk); - - if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - /* - * Still in SYN_RECV, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(sk, req, prev); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - goto out; - case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is @@ -647,7 +648,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (!key) goto release_sk1; - genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb); + genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) goto release_sk1; } else { @@ -855,35 +856,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -/* - * Return true if a syncookie should be sent - */ -bool tcp_syn_flood_action(struct sock *sk, - const struct sk_buff *skb, - const char *proto) -{ - const char *msg = "Dropping request"; - bool want_cookie = false; - struct listen_sock *lopt; - -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { - msg = "Sending cookies"; - want_cookie = true; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); - } else -#endif - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - - lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { - lopt->synflood_warned = 1; - pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", - proto, ntohs(tcp_hdr(skb)->dest), msg); - } - return want_cookie; -} -EXPORT_SYMBOL(tcp_syn_flood_action); #ifdef CONFIG_TCP_MD5SIG /* @@ -897,10 +869,10 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, const union tcp_md5_addr *addr, int family) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; unsigned int size = sizeof(struct in_addr); - struct tcp_md5sig_info *md5sig; + const struct tcp_md5sig_info *md5sig; /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, @@ -923,24 +895,15 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, EXPORT_SYMBOL(tcp_md5_do_lookup); struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, - struct sock *addr_sk) + const struct sock *addr_sk) { union tcp_md5_addr *addr; - addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr; + addr = (union tcp_md5_addr *)&sk->sk_daddr; return tcp_md5_do_lookup(sk, addr, AF_INET); } EXPORT_SYMBOL(tcp_v4_md5_lookup); -static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, - struct request_sock *req) -{ - union tcp_md5_addr *addr; - - addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr; - return tcp_md5_do_lookup(sk, addr, AF_INET); -} - /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) @@ -1101,8 +1064,8 @@ clear_hash_noput: return 1; } -int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, - const struct sock *sk, const struct request_sock *req, +int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb) { struct tcp_md5sig_pool *hp; @@ -1110,12 +1073,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, const struct tcphdr *th = tcp_hdr(skb); __be32 saddr, daddr; - if (sk) { - saddr = inet_sk(sk)->inet_saddr; - daddr = inet_sk(sk)->inet_daddr; - } else if (req) { - saddr = inet_rsk(req)->ir_loc_addr; - daddr = inet_rsk(req)->ir_rmt_addr; + if (sk) { /* valid for establish/request sockets */ + saddr = sk->sk_rcv_saddr; + daddr = sk->sk_daddr; } else { const struct iphdr *iph = ip_hdr(skb); saddr = iph->saddr; @@ -1152,8 +1112,9 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); -static bool __tcp_v4_inbound_md5_hash(struct sock *sk, - const struct sk_buff *skb) +/* Called with rcu_read_lock() */ +static bool tcp_v4_inbound_md5_hash(struct sock *sk, + const struct sk_buff *skb) { /* * This gets called for each TCP segment that arrives @@ -1193,7 +1154,7 @@ static bool __tcp_v4_inbound_md5_hash(struct sock *sk, */ genhash = tcp_v4_md5_hash_skb(newhash, hash_expected, - NULL, NULL, skb); + NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", @@ -1205,28 +1166,16 @@ static bool __tcp_v4_inbound_md5_hash(struct sock *sk, } return false; } - -static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) -{ - bool ret; - - rcu_read_lock(); - ret = __tcp_v4_inbound_md5_hash(sk, skb); - rcu_read_unlock(); - - return ret; -} - #endif -static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, +static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); + ireq->no_srccheck = inet_sk(sk_listener)->transparent; ireq->opt = tcp_v4_save_options(skb); } @@ -1259,7 +1208,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG - .md5_lookup = tcp_v4_reqsk_md5_lookup, + .req_md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, #endif .init_req = tcp_v4_init_req, @@ -1318,8 +1267,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->inet_daddr = ireq->ir_rmt_addr; - newinet->inet_rcv_saddr = ireq->ir_loc_addr; + sk_daddr_set(newsk, ireq->ir_rmt_addr); + sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = ireq->opt; rcu_assign_pointer(newinet->inet_opt, inet_opt); @@ -1391,15 +1340,17 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); + struct request_sock *req; struct sock *nsk; - struct request_sock **prev; - /* Find possible connection requests. */ - struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, - iph->saddr, iph->daddr); - if (req) - return tcp_check_req(sk, skb, req, prev, false); + + req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); + if (req) { + nsk = tcp_check_req(sk, skb, req, false); + reqsk_put(req); + return nsk; + } nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -1517,7 +1468,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) if (sk) { skb->sk = sk; skb->destructor = sock_edemux; - if (sk->sk_state != TCP_TIME_WAIT) { + if (sk_fullsock(sk)) { struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); if (dst) @@ -1904,13 +1855,13 @@ get_req: } sk = sk_nulls_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { icsk = inet_csk(sk); - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); sk = sk_nulls_next(sk); } get_sk: @@ -1922,7 +1873,7 @@ get_sk: goto out; } icsk = inet_csk(sk); - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: st->uid = sock_i_uid(sk); @@ -1931,7 +1882,7 @@ start_req: st->sbucket = 0; goto get_req; } - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ -2150,7 +2101,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) case TCP_SEQ_STATE_OPENREQ: if (v) { struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) @@ -2204,17 +2155,17 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) } EXPORT_SYMBOL(tcp_proc_unregister); -static void get_openreq4(const struct sock *sk, const struct request_sock *req, +static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i, kuid_t uid) { const struct inet_request_sock *ireq = inet_rsk(req); - long delta = req->expires - jiffies; + long delta = req->rsk_timer.expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", i, ireq->ir_loc_addr, - ntohs(inet_sk(sk)->inet_sport), + ireq->ir_num, ireq->ir_rmt_addr, ntohs(ireq->ir_rmt_port), TCP_SYN_RECV, @@ -2225,7 +2176,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, from_kuid_munged(seq_user_ns(f), uid), 0, /* non standard timer */ 0, /* open_requests have no inode */ - atomic_read(&sk->sk_refcnt), + 0, req); } @@ -2332,7 +2283,7 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) get_tcp4_sock(v, seq, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid); + get_openreq4(v, seq, st->num, st->uid); break; } out: @@ -2460,6 +2411,8 @@ static int __net_init tcp_sk_init(struct net *net) } net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; + net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; + net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; return 0; fail: diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index e5f41bd5ec1b..71ec14c87579 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -40,6 +40,7 @@ struct tcp_fastopen_metrics { struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; + possible_net_t tcpm_net; struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; @@ -52,6 +53,11 @@ struct tcp_metrics_block { struct rcu_head rcu_head; }; +static inline struct net *tm_net(struct tcp_metrics_block *tm) +{ + return read_pnet(&tm->tcpm_net); +} + static bool tcp_metric_locked(struct tcp_metrics_block *tm, enum tcp_metric_index idx) { @@ -74,23 +80,20 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - const struct in6_addr *a6, *b6; - if (a->family != b->family) return false; if (a->family == AF_INET) return a->addr.a4 == b->addr.a4; - - a6 = (const struct in6_addr *) &a->addr.a6[0]; - b6 = (const struct in6_addr *) &b->addr.a6[0]; - - return ipv6_addr_equal(a6, b6); + return ipv6_addr_equal(&a->addr.in6, &b->addr.in6); } struct tcpm_hash_bucket { struct tcp_metrics_block __rcu *chain; }; +static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly; +static unsigned int tcp_metrics_hash_log __read_mostly; + static DEFINE_SPINLOCK(tcp_metrics_lock); static void tcpm_suck_dst(struct tcp_metrics_block *tm, @@ -143,6 +146,9 @@ static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst #define TCP_METRICS_RECLAIM_DEPTH 5 #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL +#define deref_locked(p) \ + rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock)) + static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct inetpeer_addr *saddr, struct inetpeer_addr *daddr, @@ -171,9 +177,9 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; - oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); - for (tm = rcu_dereference(oldest->tcpm_next); tm; - tm = rcu_dereference(tm->tcpm_next)) { + oldest = deref_locked(tcp_metrics_hash[hash].chain); + for (tm = deref_locked(oldest->tcpm_next); tm; + tm = deref_locked(tm->tcpm_next)) { if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp)) oldest = tm; } @@ -183,14 +189,15 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, if (!tm) goto out_unlock; } + write_pnet(&tm->tcpm_net, net); tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; tcpm_suck_dst(tm, dst, true); if (likely(!reclaim)) { - tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; - rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm); + tm->tcpm_next = tcp_metrics_hash[hash].chain; + rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm); } out_unlock: @@ -214,10 +221,11 @@ static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *s struct tcp_metrics_block *tm; int depth = 0; - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, saddr) && - addr_same(&tm->tcpm_daddr, daddr)) + addr_same(&tm->tcpm_daddr, daddr) && + net_eq(tm_net(tm), net)) break; depth++; } @@ -242,8 +250,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr; - *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; + saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr; + daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr; hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif @@ -252,12 +260,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, } net = dev_net(dst->dev); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && - addr_same(&tm->tcpm_daddr, &daddr)) + addr_same(&tm->tcpm_daddr, &daddr) && + net_eq(tm_net(tm), net)) break; } tcpm_check_stamp(tm, dst); @@ -288,9 +298,9 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock hash = (__force unsigned int) daddr.addr.a4; } else { saddr.family = AF_INET6; - *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr; + saddr.addr.in6 = tw->tw_v6_rcv_saddr; daddr.family = AF_INET6; - *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr; + daddr.addr.in6 = tw->tw_v6_daddr; hash = ipv6_addr_hash(&tw->tw_v6_daddr); } } @@ -299,12 +309,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock return NULL; net = twsk_net(tw); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && - addr_same(&tm->tcpm_daddr, &daddr)) + addr_same(&tm->tcpm_daddr, &daddr) && + net_eq(tm_net(tm), net)) break; } return tm; @@ -336,9 +348,9 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, hash = (__force unsigned int) daddr.addr.a4; } else { saddr.family = AF_INET6; - *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr; + saddr.addr.in6 = sk->sk_v6_rcv_saddr; daddr.family = AF_INET6; - *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr; + daddr.addr.in6 = sk->sk_v6_daddr; hash = ipv6_addr_hash(&sk->sk_v6_daddr); } } @@ -347,7 +359,8 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, return NULL; net = dev_net(dst->dev); - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); tm = __tcp_get_metrics(&saddr, &daddr, net, hash); if (tm == TCP_METRICS_RECLAIM_PTR) @@ -773,19 +786,19 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, switch (tm->tcpm_daddr.family) { case AF_INET: - if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4, - tm->tcpm_daddr.addr.a4) < 0) + if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4, + tm->tcpm_daddr.addr.a4) < 0) goto nla_put_failure; - if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4, - tm->tcpm_saddr.addr.a4) < 0) + if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4, + tm->tcpm_saddr.addr.a4) < 0) goto nla_put_failure; break; case AF_INET6: - if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16, - tm->tcpm_daddr.addr.a6) < 0) + if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6, + &tm->tcpm_daddr.addr.in6) < 0) goto nla_put_failure; - if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16, - tm->tcpm_saddr.addr.a6) < 0) + if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6, + &tm->tcpm_saddr.addr.in6) < 0) goto nla_put_failure; break; default: @@ -898,17 +911,19 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; + unsigned int max_rows = 1U << tcp_metrics_hash_log; unsigned int row, s_row = cb->args[0]; int s_col = cb->args[1], col = s_col; for (row = s_row; row < max_rows; row++, s_col = 0) { struct tcp_metrics_block *tm; - struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row; + struct tcpm_hash_bucket *hb = tcp_metrics_hash + row; rcu_read_lock(); for (col = 0, tm = rcu_dereference(hb->chain); tm; tm = rcu_dereference(tm->tcpm_next), col++) { + if (!net_eq(tm_net(tm), net)) + continue; if (col < s_col) continue; if (tcp_metrics_dump_info(skb, cb, tm) < 0) { @@ -933,7 +948,7 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, a = info->attrs[v4]; if (a) { addr->family = AF_INET; - addr->addr.a4 = nla_get_be32(a); + addr->addr.a4 = nla_get_in_addr(a); if (hash) *hash = (__force unsigned int) addr->addr.a4; return 0; @@ -943,9 +958,9 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; addr->family = AF_INET6; - memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6)); + addr->addr.in6 = nla_get_in6_addr(a); if (hash) - *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6); + *hash = ipv6_addr_hash(&addr->addr.in6); return 0; } return optional ? 1 : -EAFNOSUPPORT; @@ -994,13 +1009,15 @@ static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) if (!reply) goto nla_put_failure; - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); ret = -ESRCH; rcu_read_lock(); - for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm; + for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_daddr, &daddr) && - (!src || addr_same(&tm->tcpm_saddr, &saddr))) { + (!src || addr_same(&tm->tcpm_saddr, &saddr)) && + net_eq(tm_net(tm), net)) { ret = tcp_metrics_fill_info(msg, tm); break; } @@ -1020,34 +1037,27 @@ out_free: return ret; } -#define deref_locked_genl(p) \ - rcu_dereference_protected(p, lockdep_genl_is_held() && \ - lockdep_is_held(&tcp_metrics_lock)) - -#define deref_genl(p) rcu_dereference_protected(p, lockdep_genl_is_held()) - -static int tcp_metrics_flush_all(struct net *net) +static void tcp_metrics_flush_all(struct net *net) { - unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log; - struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash; + unsigned int max_rows = 1U << tcp_metrics_hash_log; + struct tcpm_hash_bucket *hb = tcp_metrics_hash; struct tcp_metrics_block *tm; unsigned int row; for (row = 0; row < max_rows; row++, hb++) { + struct tcp_metrics_block __rcu **pp; spin_lock_bh(&tcp_metrics_lock); - tm = deref_locked_genl(hb->chain); - if (tm) - hb->chain = NULL; - spin_unlock_bh(&tcp_metrics_lock); - while (tm) { - struct tcp_metrics_block *next; - - next = deref_genl(tm->tcpm_next); - kfree_rcu(tm, rcu_head); - tm = next; + pp = &hb->chain; + for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { + if (net_eq(tm_net(tm), net)) { + *pp = tm->tcpm_next; + kfree_rcu(tm, rcu_head); + } else { + pp = &tm->tcpm_next; + } } + spin_unlock_bh(&tcp_metrics_lock); } - return 0; } static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) @@ -1064,19 +1074,23 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; - if (ret > 0) - return tcp_metrics_flush_all(net); + if (ret > 0) { + tcp_metrics_flush_all(net); + return 0; + } ret = parse_nl_saddr(info, &saddr); if (ret < 0) src = false; - hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); - hb = net->ipv4.tcp_metrics_hash + hash; + hash ^= net_hash_mix(net); + hash = hash_32(hash, tcp_metrics_hash_log); + hb = tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); - for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) { + for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { if (addr_same(&tm->tcpm_daddr, &daddr) && - (!src || addr_same(&tm->tcpm_saddr, &saddr))) { + (!src || addr_same(&tm->tcpm_saddr, &saddr)) && + net_eq(tm_net(tm), net)) { *pp = tm->tcpm_next; kfree_rcu(tm, rcu_head); found = true; @@ -1126,6 +1140,9 @@ static int __net_init tcp_net_metrics_init(struct net *net) size_t size; unsigned int slots; + if (!net_eq(net, &init_net)) + return 0; + slots = tcpmhash_entries; if (!slots) { if (totalram_pages >= 128 * 1024) @@ -1134,14 +1151,14 @@ static int __net_init tcp_net_metrics_init(struct net *net) slots = 8 * 1024; } - net->ipv4.tcp_metrics_hash_log = order_base_2(slots); - size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log; + tcp_metrics_hash_log = order_base_2(slots); + size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; - net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!net->ipv4.tcp_metrics_hash) - net->ipv4.tcp_metrics_hash = vzalloc(size); + tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!tcp_metrics_hash) + tcp_metrics_hash = vzalloc(size); - if (!net->ipv4.tcp_metrics_hash) + if (!tcp_metrics_hash) return -ENOMEM; return 0; @@ -1149,19 +1166,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) static void __net_exit tcp_net_metrics_exit(struct net *net) { - unsigned int i; - - for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) { - struct tcp_metrics_block *tm, *next; - - tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1); - while (tm) { - next = rcu_dereference_protected(tm->tcpm_next, 1); - kfree(tm); - tm = next; - } - } - kvfree(net->ipv4.tcp_metrics_hash); + tcp_metrics_flush_all(net); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { @@ -1175,16 +1180,10 @@ void __init tcp_metrics_init(void) ret = register_pernet_subsys(&tcp_net_metrics_ops); if (ret < 0) - goto cleanup; + panic("Could not allocate the tcp_metrics hash table\n"); + ret = genl_register_family_with_ops(&tcp_metrics_nl_family, tcp_metrics_nl_ops); if (ret < 0) - goto cleanup_subsys; - return; - -cleanup_subsys: - unregister_pernet_subsys(&tcp_net_metrics_ops); - -cleanup: - return; + panic("Could not register tcp_metrics generic netlink\n"); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dd11ac7798c6..274e96fb369b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -572,7 +572,6 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct request_sock **prev, bool fastopen) { struct tcp_options_received tmp_opt; @@ -630,8 +629,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, &tcp_rsk(req)->last_oow_ack_time) && !inet_rtx_syn_ack(sk, req)) - req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, - TCP_RTO_MAX) + jiffies; + mod_timer_pending(&req->rsk_timer, jiffies + + min(TCP_TIMEOUT_INIT << req->num_timeout, + TCP_RTO_MAX)); return NULL; } @@ -766,7 +766,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (child == NULL) goto listen_overflow; - inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_unlink(sk, req); inet_csk_reqsk_queue_removed(sk, req); inet_csk_reqsk_queue_add(sk, req, child); @@ -791,7 +791,7 @@ embryonic_reset: tcp_reset(sk); } if (!fastopen) { - inet_csk_reqsk_queue_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); } return NULL; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 9d7930ba8e0f..3f7c2fca5431 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -29,8 +29,8 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, } } -struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, - netdev_features_t features) +static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, + netdev_features_t features) { if (!pskb_may_pull(skb, sizeof(struct tcphdr))) return ERR_PTR(-EINVAL); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1db253e36045..2e69b8d16e68 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -601,15 +601,14 @@ static unsigned int tcp_synack_options(struct sock *sk, struct request_sock *req, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5, + const struct tcp_md5sig_key *md5, struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; #ifdef CONFIG_TCP_MD5SIG - *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); - if (*md5) { + if (md5) { opts->options |= OPTION_MD5; remaining -= TCPOLEN_MD5SIG_ALIGNED; @@ -620,8 +619,6 @@ static unsigned int tcp_synack_options(struct sock *sk, */ ireq->tstamp_ok &= !ireq->sack_ok; } -#else - *md5 = NULL; #endif /* We always send an MSS option. */ @@ -989,7 +986,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (md5) { sk_nocaps_add(sk, NETIF_F_GSO_MASK); tp->af_specific->calc_md5_hash(opts.hash_location, - md5, sk, NULL, skb); + md5, sk, skb); } #endif @@ -1354,6 +1351,8 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; + if (icsk->icsk_mtup.enabled) + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; } EXPORT_SYMBOL(tcp_mtup_init); @@ -1752,20 +1751,23 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, bool *is_cwnd_limited, u32 max_segs) { - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - u32 send_win, cong_win, limit, in_flight; + u32 age, send_win, cong_win, limit, in_flight; + struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp now; + struct sk_buff *head; int win_divisor; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto send_now; - if (icsk->icsk_ca_state != TCP_CA_Open) + if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR))) goto send_now; - /* Defer for less than two clock ticks. */ - if (tp->tso_deferred && - (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) + /* Avoid bursty behavior by allowing defer + * only if the last write was recent. + */ + if ((s32)(tcp_time_stamp - tp->lsndtime) > 0) goto send_now; in_flight = tcp_packets_in_flight(tp); @@ -1807,11 +1809,14 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, goto send_now; } - /* Ok, it looks like it is advisable to defer. - * Do not rearm the timer if already set to not break TCP ACK clocking. - */ - if (!tp->tso_deferred) - tp->tso_deferred = 1 | (jiffies << 1); + head = tcp_write_queue_head(sk); + skb_mstamp_get(&now); + age = skb_mstamp_us_delta(&now, &head->skb_mstamp); + /* If next ACK is likely to come too late (half srtt), do not defer */ + if (age < (tp->srtt_us >> 4)) + goto send_now; + + /* Ok, it looks like it is advisable to defer. */ if (cong_win < send_win && cong_win < skb->len) *is_cwnd_limited = true; @@ -1819,10 +1824,34 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, return true; send_now: - tp->tso_deferred = 0; return false; } +static inline void tcp_mtu_check_reprobe(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + u32 interval; + s32 delta; + + interval = net->ipv4.sysctl_tcp_probe_interval; + delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp; + if (unlikely(delta >= interval * HZ)) { + int mss = tcp_current_mss(sk); + + /* Update current search range */ + icsk->icsk_mtup.probe_size = 0; + icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + + sizeof(struct tcphdr) + + icsk->icsk_af_ops->net_header_len; + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + + /* Update probe time stamp */ + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + } +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -1837,11 +1866,13 @@ static int tcp_mtu_probe(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *nskb, *next; + struct net *net = sock_net(sk); int len; int probe_size; int size_needed; int copy; int mss_now; + int interval; /* Not currently probing/verifying, * not in recovery, @@ -1854,12 +1885,25 @@ static int tcp_mtu_probe(struct sock *sk) tp->rx_opt.num_sacks || tp->rx_opt.dsack) return -1; - /* Very simple search strategy: just double the MSS. */ + /* Use binary search for probe_size between tcp_mss_base, + * and current mss_clamp. if (search_high - search_low) + * smaller than a threshold, backoff from probing. + */ mss_now = tcp_current_mss(sk); - probe_size = 2 * tp->mss_cache; + probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high + + icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; - if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { - /* TODO: set timer for probe_converge_event */ + interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; + /* When misfortune happens, we are reprobing actively, + * and then reprobe timer has expired. We stick with current + * probing process by not resetting search range to its orignal. + */ + if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || + interval < net->ipv4.sysctl_tcp_probe_threshold) { + /* Check whether enough time has elaplased for + * another round of probing. + */ + tcp_mtu_check_reprobe(sk); return -1; } @@ -2866,7 +2910,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th; struct sk_buff *skb; - struct tcp_md5sig_key *md5; + struct tcp_md5sig_key *md5 = NULL; int tcp_header_size; int mss; @@ -2879,7 +2923,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb_reserve(skb, MAX_TCP_HEADER); skb_dst_set(skb, dst); - security_skb_owned_by(skb, sk); mss = dst_metric_advmss(dst); if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) @@ -2892,7 +2935,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, else #endif skb_mstamp_get(&skb->skb_mstamp); - tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, + +#ifdef CONFIG_TCP_MD5SIG + rcu_read_lock(); + md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); +#endif + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, foc) + sizeof(*th); skb_push(skb, tcp_header_size); @@ -2923,10 +2971,10 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ - if (md5) { + if (md5) tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, - md5, NULL, req, skb); - } + md5, req_to_sk(req), skb); + rcu_read_unlock(); #endif return skb; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0732b787904e..2568fd282873 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) if (net->ipv4.sysctl_tcp_mtu_probing) { if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct net *net = sock_net(sk); @@ -326,7 +327,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; - req->rsk_ops->syn_ack_timeout(sk, req); + req->rsk_ops->syn_ack_timeout(req); if (req->num_timeout >= max_retries) { tcp_write_err(sk); @@ -538,19 +539,11 @@ static void tcp_write_timer(unsigned long data) sock_put(sk); } -/* - * Timer for listening sockets - */ - -static void tcp_synack_timer(struct sock *sk) +void tcp_syn_ack_timeout(const struct request_sock *req) { - inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL, - TCP_TIMEOUT_INIT, TCP_RTO_MAX); -} + struct net *net = read_pnet(&inet_rsk(req)->ireq_net); -void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req) -{ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); + NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS); } EXPORT_SYMBOL(tcp_syn_ack_timeout); @@ -582,7 +575,7 @@ static void tcp_keepalive_timer (unsigned long data) } if (sk->sk_state == TCP_LISTEN) { - tcp_synack_timer(sk); + pr_err("Hmm... keepalive on a LISTEN ???\n"); goto out; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 97ef1f8b7be8..294af16633af 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -318,8 +318,8 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); } -static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, - unsigned int port) +static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr, + unsigned int port) { return jhash_1word((__force u32)saddr, net_hash_mix(net)) ^ port; } @@ -421,9 +421,9 @@ static inline int compute_score2(struct sock *sk, struct net *net, return score; } -static unsigned int udp_ehashfn(struct net *net, const __be32 laddr, - const __u16 lport, const __be32 faddr, - const __be16 fport) +static u32 udp_ehashfn(const struct net *net, const __be32 laddr, + const __u16 lport, const __be32 faddr, + const __be16 fport) { static u32 udp_ehash_secret __read_mostly; @@ -873,8 +873,7 @@ out: } EXPORT_SYMBOL(udp_push_pending_frames); -int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len) +int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); @@ -1136,7 +1135,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, * sendpage interface can't pass. * This will succeed only when the socket is connected. */ - ret = udp_sendmsg(NULL, sk, &msg, 0); + ret = udp_sendmsg(sk, &msg, 0); if (ret < 0) return ret; } @@ -1254,8 +1253,8 @@ EXPORT_SYMBOL(udp_ioctl); * return it, otherwise we block. */ -int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len) +int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); @@ -2525,6 +2524,16 @@ void __init udp_table_init(struct udp_table *table, const char *name) } } +u32 udp_flow_hashrnd(void) +{ + static u32 hashrnd __read_mostly; + + net_get_random_once(&hashrnd, sizeof(hashrnd)); + + return hashrnd; +} +EXPORT_SYMBOL(udp_flow_hashrnd); + void __init udp_init(void) { unsigned long limit; diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 4a000f1dd757..2dbfc1f1f7b3 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -18,8 +18,9 @@ #include <linux/sock_diag.h> static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, struct inet_diag_req_v2 *req, - struct nlattr *bc) + struct netlink_callback *cb, + const struct inet_diag_req_v2 *req, + struct nlattr *bc) { if (!inet_diag_bc_sk(bc, sk)) return 0; @@ -31,7 +32,8 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, } static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, - const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req) + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *req) { int err = -EINVAL; struct sock *sk; @@ -90,8 +92,9 @@ out_nosk: return err; } -static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) +static void udp_dump(struct udp_table *table, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, struct nlattr *bc) { int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); @@ -144,13 +147,13 @@ done: } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r, struct nlattr *bc) { udp_dump(&udp_table, skb, cb, r, bc); } static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + const struct inet_diag_req_v2 *req) { return udp_dump_one(&udp_table, in_skb, nlh, req); } @@ -170,13 +173,14 @@ static const struct inet_diag_handler udp_diag_handler = { }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct inet_diag_req_v2 *r, struct nlattr *bc) + const struct inet_diag_req_v2 *r, + struct nlattr *bc) { udp_dump(&udplite_table, skb, cb, r, bc); } static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh, - struct inet_diag_req_v2 *req) + const struct inet_diag_req_v2 *req) { return udp_dump_one(&udplite_table, in_skb, nlh, req); } diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index f3c27899f62b..7e0fe4bdd967 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -21,8 +21,8 @@ int compat_udp_setsockopt(struct sock *sk, int level, int optname, int compat_udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); #endif -int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, - size_t len, int noblock, int flags, int *addr_len); +int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len); int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 91771a7c802f..35feda676464 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -63,7 +63,7 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->saddr = x->props.saddr.a4; top_iph->daddr = x->id.daddr.a4; - ip_select_ident(skb, NULL); + ip_select_ident(dev_net(dst->dev), skb, NULL); return 0; } diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 6156f68a1e90..c224c856247b 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -232,7 +232,6 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static struct dst_ops xfrm4_dst_ops = { .family = AF_INET, - .protocol = cpu_to_be16(ETH_P_IP), .gc = xfrm4_garbage_collect, .update_pmtu = xfrm4_update_pmtu, .redirect = xfrm4_redirect, |