summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/fib_semantics.c42
-rw-r--r--net/ipv4/fib_trie.c53
-rw-r--r--net/ipv4/ip_input.c2
-rw-r--r--net/ipv4/route.c183
-rw-r--r--net/ipv4/sysctl_net_ipv4.c15
-rw-r--r--net/ipv4/tcp_input.c24
-rw-r--r--net/ipv4/tcp_ipv4.c14
-rw-r--r--net/ipv4/tcp_minisocks.c3
-rw-r--r--net/ipv4/tcp_output.c12
-rw-r--r--net/ipv4/xfrm4_policy.c1
12 files changed, 227 insertions, 125 deletions
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ae2ccf2890e4..15ca63ec604e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o
+obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8732cc7920ed..c43ae3fba792 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1046,6 +1046,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
if (event == NETDEV_UNREGISTER) {
fib_disable_ip(dev, 2, -1);
+ rt_flush_dev(dev);
return NOTIFY_DONE;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index da0cc2e6b250..da80dc14cc76 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,21 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
},
};
+static void rt_fibinfo_free(struct rtable __rcu **rtp)
+{
+ struct rtable *rt = rcu_dereference_protected(*rtp, 1);
+
+ if (!rt)
+ return;
+
+ /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
+ * because we waited an RCU grace period before calling
+ * free_fib_info_rcu()
+ */
+
+ dst_free(&rt->dst);
+}
+
static void free_nh_exceptions(struct fib_nh *nh)
{
struct fnhe_hash_bucket *hash = nh->nh_exceptions;
@@ -153,6 +168,9 @@ static void free_nh_exceptions(struct fib_nh *nh)
struct fib_nh_exception *next;
next = rcu_dereference_protected(fnhe->fnhe_next, 1);
+
+ rt_fibinfo_free(&fnhe->fnhe_rth);
+
kfree(fnhe);
fnhe = next;
@@ -161,6 +179,23 @@ static void free_nh_exceptions(struct fib_nh *nh)
kfree(hash);
}
+static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
+{
+ int cpu;
+
+ if (!rtp)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rtable *rt;
+
+ rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
+ if (rt)
+ dst_free(&rt->dst);
+ }
+ free_percpu(rtp);
+}
+
/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
@@ -171,10 +206,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
dev_put(nexthop_nh->nh_dev);
if (nexthop_nh->nh_exceptions)
free_nh_exceptions(nexthop_nh);
- if (nexthop_nh->nh_rth_output)
- dst_free(&nexthop_nh->nh_rth_output->dst);
- if (nexthop_nh->nh_rth_input)
- dst_free(&nexthop_nh->nh_rth_input->dst);
+ rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
+ rt_fibinfo_free(&nexthop_nh->nh_rth_input);
} endfor_nexthops(fi);
release_net(fi->fib_net);
@@ -804,6 +837,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi->fib_nhs = nhs;
change_nexthops(fi) {
nexthop_nh->nh_parent = fi;
+ nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
} endfor_nexthops(fi)
if (cfg->fc_mx) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 18cbc15b20d5..f0cdb30921c0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -159,7 +159,6 @@ struct trie {
#endif
};
-static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
int wasfull);
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
@@ -473,7 +472,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct rt_trie_node) << bits);
+ sizeof(struct rt_trie_node *) << bits);
return tn;
}
@@ -490,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *
return ((struct tnode *) n)->pos == tn->pos + tn->bits;
}
-static inline void put_child(struct trie *t, struct tnode *tn, int i,
+static inline void put_child(struct tnode *tn, int i,
struct rt_trie_node *n)
{
tnode_put_child_reorg(tn, i, n, -1);
@@ -754,8 +753,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
goto nomem;
}
- put_child(t, tn, 2*i, (struct rt_trie_node *) left);
- put_child(t, tn, 2*i+1, (struct rt_trie_node *) right);
+ put_child(tn, 2*i, (struct rt_trie_node *) left);
+ put_child(tn, 2*i+1, (struct rt_trie_node *) right);
}
}
@@ -776,9 +775,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
if (tkey_extract_bits(node->key,
oldtnode->pos + oldtnode->bits,
1) == 0)
- put_child(t, tn, 2*i, node);
+ put_child(tn, 2*i, node);
else
- put_child(t, tn, 2*i+1, node);
+ put_child(tn, 2*i+1, node);
continue;
}
@@ -786,8 +785,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
inode = (struct tnode *) node;
if (inode->bits == 1) {
- put_child(t, tn, 2*i, rtnl_dereference(inode->child[0]));
- put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1]));
+ put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
+ put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
tnode_free_safe(inode);
continue;
@@ -817,22 +816,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
*/
left = (struct tnode *) tnode_get_child(tn, 2*i);
- put_child(t, tn, 2*i, NULL);
+ put_child(tn, 2*i, NULL);
BUG_ON(!left);
right = (struct tnode *) tnode_get_child(tn, 2*i+1);
- put_child(t, tn, 2*i+1, NULL);
+ put_child(tn, 2*i+1, NULL);
BUG_ON(!right);
size = tnode_child_length(left);
for (j = 0; j < size; j++) {
- put_child(t, left, j, rtnl_dereference(inode->child[j]));
- put_child(t, right, j, rtnl_dereference(inode->child[j + size]));
+ put_child(left, j, rtnl_dereference(inode->child[j]));
+ put_child(right, j, rtnl_dereference(inode->child[j + size]));
}
- put_child(t, tn, 2*i, resize(t, left));
- put_child(t, tn, 2*i+1, resize(t, right));
+ put_child(tn, 2*i, resize(t, left));
+ put_child(tn, 2*i+1, resize(t, right));
tnode_free_safe(inode);
}
@@ -877,7 +876,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (!newn)
goto nomem;
- put_child(t, tn, i/2, (struct rt_trie_node *)newn);
+ put_child(tn, i/2, (struct rt_trie_node *)newn);
}
}
@@ -892,21 +891,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
if (left == NULL) {
if (right == NULL) /* Both are empty */
continue;
- put_child(t, tn, i/2, right);
+ put_child(tn, i/2, right);
continue;
}
if (right == NULL) {
- put_child(t, tn, i/2, left);
+ put_child(tn, i/2, left);
continue;
}
/* Two nonempty children */
newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
- put_child(t, tn, i/2, NULL);
- put_child(t, newBinNode, 0, left);
- put_child(t, newBinNode, 1, right);
- put_child(t, tn, i/2, resize(t, newBinNode));
+ put_child(tn, i/2, NULL);
+ put_child(newBinNode, 0, left);
+ put_child(newBinNode, 1, right);
+ put_child(tn, i/2, resize(t, newBinNode));
}
tnode_free_safe(oldtnode);
return tn;
@@ -1125,7 +1124,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
node_set_parent((struct rt_trie_node *)l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, tp, cindex, (struct rt_trie_node *)l);
+ put_child(tp, cindex, (struct rt_trie_node *)l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
@@ -1155,12 +1154,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
node_set_parent((struct rt_trie_node *)tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
- put_child(t, tn, missbit, (struct rt_trie_node *)l);
- put_child(t, tn, 1-missbit, n);
+ put_child(tn, missbit, (struct rt_trie_node *)l);
+ put_child(tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(t, tp, cindex, (struct rt_trie_node *)tn);
+ put_child(tp, cindex, (struct rt_trie_node *)tn);
} else {
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
tp = tn;
@@ -1619,7 +1618,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
if (tp) {
t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
- put_child(t, tp, cindex, NULL);
+ put_child(tp, cindex, NULL);
trie_rebalance(t, tp);
} else
RCU_INIT_POINTER(t->trie, NULL);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 981ff1eef28c..f1395a6fb35f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -325,14 +325,12 @@ static int ip_rcv_finish(struct sk_buff *skb)
const struct net_protocol *ipprot;
int protocol = iph->protocol;
- rcu_read_lock();
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->early_demux) {
ipprot->early_demux(skb);
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
- rcu_read_unlock();
}
/*
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fc1a81ca79a7..c035251beb07 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -147,6 +147,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
+static void ipv4_dst_destroy(struct dst_entry *dst);
static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
int how)
@@ -170,6 +171,7 @@ static struct dst_ops ipv4_dst_ops = {
.default_advmss = ipv4_default_advmss,
.mtu = ipv4_mtu,
.cow_metrics = ipv4_cow_metrics,
+ .destroy = ipv4_dst_destroy,
.ifdown = ipv4_dst_ifdown,
.negative_advice = ipv4_negative_advice,
.link_failure = ipv4_link_failure,
@@ -587,11 +589,17 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
build_sk_flow_key(fl4, sk);
}
-static DEFINE_SEQLOCK(fnhe_seqlock);
+static inline void rt_free(struct rtable *rt)
+{
+ call_rcu(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_SPINLOCK(fnhe_lock);
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
struct fib_nh_exception *fnhe, *oldest;
+ struct rtable *orig;
oldest = rcu_dereference(hash->chain);
for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
@@ -599,6 +607,11 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
oldest = fnhe;
}
+ orig = rcu_dereference(oldest->fnhe_rth);
+ if (orig) {
+ RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
+ rt_free(orig);
+ }
return oldest;
}
@@ -620,7 +633,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
int depth;
u32 hval = fnhe_hashfun(daddr);
- write_seqlock_bh(&fnhe_seqlock);
+ spin_lock_bh(&fnhe_lock);
hash = nh->nh_exceptions;
if (!hash) {
@@ -667,7 +680,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_stamp = jiffies;
out_unlock:
- write_sequnlock_bh(&fnhe_seqlock);
+ spin_unlock_bh(&fnhe_lock);
return;
}
@@ -1164,53 +1177,62 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
return NULL;
}
-static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
__be32 daddr)
{
- __be32 fnhe_daddr, gw;
- unsigned long expires;
- unsigned int seq;
- u32 pmtu;
-
-restart:
- seq = read_seqbegin(&fnhe_seqlock);
- fnhe_daddr = fnhe->fnhe_daddr;
- gw = fnhe->fnhe_gw;
- pmtu = fnhe->fnhe_pmtu;
- expires = fnhe->fnhe_expires;
- if (read_seqretry(&fnhe_seqlock, seq))
- goto restart;
-
- if (daddr != fnhe_daddr)
- return;
+ bool ret = false;
+
+ spin_lock_bh(&fnhe_lock);
- if (pmtu) {
- unsigned long diff = expires - jiffies;
+ if (daddr == fnhe->fnhe_daddr) {
+ struct rtable *orig;
- if (time_before(jiffies, expires)) {
- rt->rt_pmtu = pmtu;
- dst_set_expires(&rt->dst, diff);
+ if (fnhe->fnhe_pmtu) {
+ unsigned long expires = fnhe->fnhe_expires;
+ unsigned long diff = expires - jiffies;
+
+ if (time_before(jiffies, expires)) {
+ rt->rt_pmtu = fnhe->fnhe_pmtu;
+ dst_set_expires(&rt->dst, diff);
+ }
}
+ if (fnhe->fnhe_gw) {
+ rt->rt_flags |= RTCF_REDIRECTED;
+ rt->rt_gateway = fnhe->fnhe_gw;
+ }
+
+ orig = rcu_dereference(fnhe->fnhe_rth);
+ rcu_assign_pointer(fnhe->fnhe_rth, rt);
+ if (orig)
+ rt_free(orig);
+
+ fnhe->fnhe_stamp = jiffies;
+ ret = true;
+ } else {
+ /* Routes we intend to cache in nexthop exception have
+ * the DST_NOCACHE bit clear. However, if we are
+ * unsuccessful at storing this route into the cache
+ * we really need to set it.
+ */
+ rt->dst.flags |= DST_NOCACHE;
}
- if (gw) {
- rt->rt_flags |= RTCF_REDIRECTED;
- rt->rt_gateway = gw;
- }
- fnhe->fnhe_stamp = jiffies;
-}
+ spin_unlock_bh(&fnhe_lock);
-static inline void rt_free(struct rtable *rt)
-{
- call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+ return ret;
}
-static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
- struct rtable *orig, *prev, **p = &nh->nh_rth_output;
-
- if (rt_is_input_route(rt))
- p = &nh->nh_rth_input;
+ struct rtable *orig, *prev, **p;
+ bool ret = true;
+ if (rt_is_input_route(rt)) {
+ p = (struct rtable **)&nh->nh_rth_input;
+ } else {
+ if (!nh->nh_pcpu_rth_output)
+ goto nocache;
+ p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
+ }
orig = *p;
prev = cmpxchg(p, orig, rt);
@@ -1223,7 +1245,50 @@ static void rt_cache_route(struct fib_nh *nh, struct rtable *rt)
* unsuccessful at storing this route into the cache
* we really need to set it.
*/
+nocache:
rt->dst.flags |= DST_NOCACHE;
+ ret = false;
+ }
+
+ return ret;
+}
+
+static DEFINE_SPINLOCK(rt_uncached_lock);
+static LIST_HEAD(rt_uncached_list);
+
+static void rt_add_uncached_list(struct rtable *rt)
+{
+ spin_lock_bh(&rt_uncached_lock);
+ list_add_tail(&rt->rt_uncached, &rt_uncached_list);
+ spin_unlock_bh(&rt_uncached_lock);
+}
+
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+ struct rtable *rt = (struct rtable *) dst;
+
+ if (dst->flags & DST_NOCACHE) {
+ spin_lock_bh(&rt_uncached_lock);
+ list_del(&rt->rt_uncached);
+ spin_unlock_bh(&rt_uncached_lock);
+ }
+}
+
+void rt_flush_dev(struct net_device *dev)
+{
+ if (!list_empty(&rt_uncached_list)) {
+ struct net *net = dev_net(dev);
+ struct rtable *rt;
+
+ spin_lock_bh(&rt_uncached_lock);
+ list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
+ if (rt->dst.dev != dev)
+ continue;
+ rt->dst.dev = net->loopback_dev;
+ dev_hold(rt->dst.dev);
+ dev_put(dev);
+ }
+ spin_unlock_bh(&rt_uncached_lock);
}
}
@@ -1239,20 +1304,24 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
struct fib_nh_exception *fnhe,
struct fib_info *fi, u16 type, u32 itag)
{
+ bool cached = false;
+
if (fi) {
struct fib_nh *nh = &FIB_RES_NH(*res);
if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
rt->rt_gateway = nh->nh_gw;
- if (unlikely(fnhe))
- rt_bind_exception(rt, fnhe, daddr);
dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
rt->dst.tclassid = nh->nh_tclassid;
#endif
- if (!(rt->dst.flags & DST_NOCACHE))
- rt_cache_route(nh, rt);
+ if (unlikely(fnhe))
+ cached = rt_bind_exception(rt, fnhe, daddr);
+ else if (!(rt->dst.flags & DST_NOCACHE))
+ cached = rt_cache_route(nh, rt);
}
+ if (unlikely(!cached))
+ rt_add_uncached_list(rt);
#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1319,6 +1388,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
rth->rt_iif = 0;
rth->rt_pmtu = 0;
rth->rt_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
if (our) {
rth->dst.input= ip_local_deliver;
rth->rt_flags |= RTCF_LOCAL;
@@ -1420,7 +1490,7 @@ static int __mkroute_input(struct sk_buff *skb,
do_cache = false;
if (res->fi) {
if (!itag) {
- rth = FIB_RES_NH(*res).nh_rth_input;
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
@@ -1444,6 +1514,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->rt_iif = 0;
rth->rt_pmtu = 0;
rth->rt_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
@@ -1582,7 +1653,7 @@ local_input:
do_cache = false;
if (res.fi) {
if (!itag) {
- rth = FIB_RES_NH(res).nh_rth_input;
+ rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
err = 0;
@@ -1610,6 +1681,7 @@ local_input:
rth->rt_iif = 0;
rth->rt_pmtu = 0;
rth->rt_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
if (res.type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
@@ -1748,19 +1820,23 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
fnhe = NULL;
if (fi) {
+ struct rtable __rcu **prth;
+
fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
- if (!fnhe) {
- rth = FIB_RES_NH(*res).nh_rth_output;
- if (rt_cache_valid(rth)) {
- dst_hold(&rth->dst);
- return rth;
- }
+ if (fnhe)
+ prth = &fnhe->fnhe_rth;
+ else
+ prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
+ rth = rcu_dereference(*prth);
+ if (rt_cache_valid(rth)) {
+ dst_hold(&rth->dst);
+ return rth;
}
}
rth = rt_dst_alloc(dev_out,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(in_dev, NOXFRM),
- fi && !fnhe);
+ fi);
if (!rth)
return ERR_PTR(-ENOBUFS);
@@ -1773,6 +1849,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
rth->rt_iif = orig_oif ? : 0;
rth->rt_pmtu = 0;
rth->rt_gateway = 0;
+ INIT_LIST_HEAD(&rth->rt_uncached);
RT_CACHE_STAT_INC(out_slow_tot);
@@ -2052,6 +2129,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_type = ort->rt_type;
rt->rt_gateway = ort->rt_gateway;
+ INIT_LIST_HEAD(&rt->rt_uncached);
+
dst_free(new);
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5840c3255721..1b5ce96707a3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -184,7 +184,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
int ret;
unsigned long vec[3];
struct net *net = current->nsproxy->net_ns;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
struct mem_cgroup *memcg;
#endif
@@ -203,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
if (ret)
return ret;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
@@ -784,13 +784,6 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "rt_cache_rebuild_count",
- .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "ping_group_range",
.data = &init_net.ipv4.sysctl_ping_group_range,
.maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
@@ -829,8 +822,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
table[5].data =
&net->ipv4.sysctl_icmp_ratemask;
table[6].data =
- &net->ipv4.sysctl_rt_cache_rebuild_count;
- table[7].data =
&net->ipv4.sysctl_ping_group_range;
}
@@ -842,8 +833,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
net->ipv4.sysctl_ping_group_range[0] = 1;
net->ipv4.sysctl_ping_group_range[1] = 0;
- net->ipv4.sysctl_rt_cache_rebuild_count = 4;
-
tcp_init_mem(net);
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a356e1fecf9a..2fd2bc9e3c64 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4351,19 +4351,20 @@ static void tcp_ofo_queue(struct sock *sk)
static bool tcp_prune_ofo_queue(struct sock *sk);
static int tcp_prune_queue(struct sock *sk);
-static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+ unsigned int size)
{
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- !sk_rmem_schedule(sk, size)) {
+ !sk_rmem_schedule(sk, skb, size)) {
if (tcp_prune_queue(sk) < 0)
return -1;
- if (!sk_rmem_schedule(sk, size)) {
+ if (!sk_rmem_schedule(sk, skb, size)) {
if (!tcp_prune_ofo_queue(sk))
return -1;
- if (!sk_rmem_schedule(sk, size))
+ if (!sk_rmem_schedule(sk, skb, size))
return -1;
}
}
@@ -4418,7 +4419,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
TCP_ECN_check_ce(tp, skb);
- if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) {
+ if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
__kfree_skb(skb);
return;
@@ -4552,17 +4553,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
struct tcphdr *th;
bool fragstolen;
- if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
- goto err;
-
skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
if (!skb)
goto err;
+ if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
+ goto err_free;
+
th = (struct tcphdr *)skb_put(skb, sizeof(*th));
skb_reset_transport_header(skb);
memset(th, 0, sizeof(*th));
@@ -4633,7 +4634,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
if (eaten < 0 &&
- tcp_try_rmem_schedule(sk, skb->truesize))
+ tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto drop;
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
@@ -5604,8 +5605,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
tcp_set_state(sk, TCP_ESTABLISHED);
if (skb != NULL) {
- sk->sk_rx_dst = dst_clone(skb_dst(skb));
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ inet_sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2fbd9921253f..42b2a6a73092 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1617,19 +1617,19 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
#endif
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+ struct dst_entry *dst = sk->sk_rx_dst;
+
sock_rps_save_rxhash(sk, skb);
- if (sk->sk_rx_dst) {
- struct dst_entry *dst = sk->sk_rx_dst;
+ if (dst) {
if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
dst->ops->check(dst, 0) == NULL) {
dst_release(dst);
sk->sk_rx_dst = NULL;
}
}
- if (unlikely(sk->sk_rx_dst == NULL)) {
- sk->sk_rx_dst = dst_clone(skb_dst(skb));
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
- }
+ if (unlikely(sk->sk_rx_dst == NULL))
+ inet_sk_rx_dst_set(sk, skb);
+
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
@@ -2633,7 +2633,7 @@ struct proto tcp_prot = {
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
.init_cgroup = tcp_init_cgroup,
.destroy_cgroup = tcp_destroy_cgroup,
.proto_cgroup = tcp_proto_cgroup,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3f1cc2028edd..232a90c3ec86 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -387,8 +387,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
struct tcp_sock *oldtp = tcp_sk(sk);
struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
- newsk->sk_rx_dst = dst_clone(skb_dst(skb));
- inet_sk(newsk)->rx_dst_ifindex = skb->skb_iif;
+ inet_sk_rx_dst_set(newsk, skb);
/* TCP Cookie Transactions require space for the cookie pair,
* as it differs for each connection. There is no need to
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 33cd065cfbd8..3f1bcff0b10b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2045,7 +2045,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
if (unlikely(sk->sk_state == TCP_CLOSE))
return;
- if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC))
+ if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
+ sk_gfp_atomic(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
@@ -2666,7 +2667,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
s_data_desired = cvp->s_data_desired;
- skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
+ sk_gfp_atomic(sk, GFP_ATOMIC));
if (unlikely(!skb)) {
dst_release(dst);
return NULL;
@@ -3064,7 +3066,7 @@ void tcp_send_ack(struct sock *sk)
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
- buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
if (buff == NULL) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -3079,7 +3081,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
- tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
+ tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
}
/* This routine sends a packet with an out of date sequence
@@ -3099,7 +3101,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
struct sk_buff *skb;
/* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+ skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
if (skb == NULL)
return -1;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index c6281847f16a..681ea2f413e2 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -92,6 +92,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_type = rt->rt_type;
xdst->u.rt.rt_gateway = rt->rt_gateway;
xdst->u.rt.rt_pmtu = rt->rt_pmtu;
+ INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
return 0;
}