author     David S. Miller <davem@davemloft.net>  2014-07-27 22:34:49 -0700
committer  David S. Miller <davem@davemloft.net>  2014-07-27 22:34:49 -0700
commit     6ceed786648618f6398a54006365f688f5a32012 (patch)
tree       d015ddcf9d9d16e423efb63e9ddbd46fb02d95c3 /include/net
parent     ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f (diff)
parent     1bab4c75075b84675b96992ac47580a57c26958d (diff)
Merge branch 'inet_frag_kill_lru_list'
Nikolay Aleksandrov says:

====================
inet: frag: cleanup and update

The end goal of this patchset is to remove the LRU list and to move the
frag eviction to a work queue. It also does a couple of necessary cleanups
and fixes.

Brief patch descriptions:
Patches 1 - 3 inclusive: necessary clean ups
Patch 4 moves the eviction from the softirqs to a workqueue.
Patch 5 removes the nqueues counter which was protected by the LRU lock.
Patch 6 removes the, by now unused, lru list.
Patch 7 moves the rebuild timer to the workqueue and schedules the rebuilds
        only if we've hit the maximum queue length on some of the chains.
Patch 8 migrates the rwlock to a seqlock since the rehash is usually a
        rare operation (a sketch of this pattern follows the quoted
        message below).
Patch 9 introduces an artificial global memory limit based on the value of
        init_net's high_thresh which is used to cap the high_thresh of the
        other namespaces. It also introduces some sane limits on the other
        tunables, and makes it impossible to have low_thresh > high_thresh
        (also sketched below).

Here are some numbers from running netperf before and after the patchset.
Each test was run with the following settings: -I 95,5 -i 15,10

1. Bound test (-T 4,4)
1.1 Virtio before the patchset

MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
192.168.122.177 () port 0 AF_INET : +/-2.500% @ 95% conf. : cpu bind
Socket  Message  Elapsed      Messages                   CPU      Service
Size    Size     Time         Okay Errors   Throughput   Util     Demand
bytes   bytes    secs            #      #   10^6bits/sec % SS     us/KB

212992   64000   30.00       722177      0    12325.1     34.55    2.025
212992           30.00       368020           6280.9      34.05    0.752

1.2 Virtio after the patchset

MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
192.168.122.177 () port 0 AF_INET : +/-2.500% @ 95% conf. : cpu bind
Socket  Message  Elapsed      Messages                   CPU      Service
Size    Size     Time         Okay Errors   Throughput   Util     Demand
bytes   bytes    secs            #      #   10^6bits/sec % SS     us/KB

212992   64000   30.00       727030      0    12407.9     35.45    1.876
212992           30.00       505405           8625.5      34.92    0.693

2. Virtio unbound test
2.1 Before the patchset

MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
192.168.122.177 () port 0 AF_INET : +/-2.500% @ 95% conf.
Socket  Message  Elapsed      Messages
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

212992   64000   30.00       730008      0    12458.77
212992           30.00       416721           7112.02

2.2 After the patchset

MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
192.168.122.177 () port 0 AF_INET : +/-2.500% @ 95% conf.
Socket  Message  Elapsed      Messages
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

212992   64000   30.00       731129      0    12477.89
212992           30.00       487707           8323.50

3. 10 gig unbound tests
3.1 Before the patchset

MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
192.168.133.1 () port 0 AF_INET : +/-2.500% @ 95% conf.
Socket  Message  Elapsed      Messages
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

212992   64000   30.00       417209      0    7120.33
212992           30.00       416740           7112.33

3.2 After the patchset

MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
192.168.133.1 () port 0 AF_INET : +/-2.500% @ 95% conf.
Socket  Message  Elapsed      Messages
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

212992   64000   30.00       438009      0    7475.33
212992           30.00       437630           7468.87

Given these options, each netperf instance ran between 10 and 15 times, for
30 seconds each, to reach the necessary confidence; the tests themselves
were run 3 times and were consistent.
Another set of tests that I ran were parallel stress tests: flooding the
machine with fragmented packets from different sources, with the frag
timeout set to 0 (so there are lots of timeouts) and low_thresh set to
1 byte (so evictions happen all the time), while in parallel running an
endless namespace create/destroy loop whose network interfaces and
addresses also got flooded for the brief periods they were up. This test
ran for an hour without any issues.
====================
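For context on patch 8, here is a minimal sketch of the read-side pattern a
seqlock enables: an insertion computes its bucket from f->rnd, and retries
if a workqueue-driven rehash (which runs under write_seqlock(&f->rnd_seqlock))
raced with it. This is illustrative, not the patch's exact code; fq_hashfn()
is a hypothetical stand-in for the per-protocol hash function, and the
per-bucket chain_lock spinlock is assumed from the earlier bucket-locking
work.

/* Sketch only: look up and lock a hash bucket while detecting a
 * concurrent rehash via rnd_seqlock.
 */
static struct inet_frag_bucket *
frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
{
	struct inet_frag_bucket *hb;
	unsigned int seq, hash;

restart:
	seq = read_seqbegin(&f->rnd_seqlock);	/* snapshot hash state */

	hash = fq_hashfn(fq) & (INETFRAGS_HASHSZ - 1);
	hb = &f->hash[hash];

	spin_lock(&hb->chain_lock);
	if (read_seqretry(&f->rnd_seqlock, seq)) {
		/* a rehash ran meanwhile; our bucket may be stale */
		spin_unlock(&hb->chain_lock);
		goto restart;
	}
	return hb;	/* returns with hb->chain_lock held */
}

Since rehashes are rare, readers almost never retry, which is exactly why a
seqlock is a better fit here than the old rwlock.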
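And the clamping rule from patch 9 can be pictured with the hypothetical
setter below. The real patch enforces these bounds through the sysctl
min/max machinery rather than a helper like this; "init_high" stands for
init_net's high_thresh value.

/* Hypothetical illustration of patch 9's limits: a namespace's
 * high_thresh is capped by init_net's, and low_thresh may never
 * exceed the namespace's own high_thresh.
 */
static int frag_set_thresholds(struct netns_frags *nf,
			       unsigned int high, unsigned int low,
			       unsigned int init_high)
{
	if (high > init_high)
		high = init_high;	/* global cap from init_net */
	if (low > high)
		return -EINVAL;		/* low_thresh > high_thresh forbidden */
	nf->high_thresh = high;
	nf->low_thresh = low;
	return 0;
}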
Diffstat (limited to 'include/net')
-rw-r--r--   include/net/inet_frag.h   70
-rw-r--r--   include/net/ip.h           1
-rw-r--r--   include/net/ipv6.h         9
3 files changed, 23 insertions, 57 deletions
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 6f59de98dabd..6f4930a0b660 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -4,10 +4,6 @@
#include <linux/percpu_counter.h>
struct netns_frags {
- int nqueues;
- struct list_head lru_list;
- spinlock_t lru_lock;
-
/* The percpu_counter "mem" need to be cacheline aligned.
* mem.count must not share cacheline with other writers
*/
@@ -22,7 +18,6 @@ struct netns_frags {
struct inet_frag_queue {
spinlock_t lock;
struct timer_list timer; /* when will this queue expire? */
- struct list_head lru_list; /* lru list member */
struct hlist_node list;
atomic_t refcnt;
struct sk_buff *fragments; /* list of received fragments */
@@ -32,6 +27,7 @@ struct inet_frag_queue {
int meat;
__u8 last_in; /* first/last segment arrived? */
+#define INET_FRAG_EVICTED 8
#define INET_FRAG_COMPLETE 4
#define INET_FRAG_FIRST_IN 2
#define INET_FRAG_LAST_IN 1
@@ -48,7 +44,7 @@ struct inet_frag_queue {
* rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
* struct frag_queue))
*/
-#define INETFRAGS_MAXDEPTH 128
+#define INETFRAGS_MAXDEPTH 128
struct inet_frag_bucket {
struct hlist_head chain;
@@ -57,24 +53,27 @@ struct inet_frag_bucket {
struct inet_frags {
struct inet_frag_bucket hash[INETFRAGS_HASHSZ];
- /* This rwlock is a global lock (seperate per IPv4, IPv6 and
- * netfilter). Important to keep this on a seperate cacheline.
- * Its primarily a rebuild protection rwlock.
- */
- rwlock_t lock ____cacheline_aligned_in_smp;
- int secret_interval;
- struct timer_list secret_timer;
+
+ struct work_struct frags_work;
+ unsigned int next_bucket;
+ unsigned long last_rebuild_jiffies;
+ bool rebuild;
/* The first call to hashfn is responsible to initialize
* rnd. This is best done with net_get_random_once.
+ *
+ * rnd_seqlock is used to let hash insertion detect
+ * when it needs to re-lookup the hash chain to use.
*/
u32 rnd;
+ seqlock_t rnd_seqlock;
int qsize;
- unsigned int (*hashfn)(struct inet_frag_queue *);
- bool (*match)(struct inet_frag_queue *q, void *arg);
+ unsigned int (*hashfn)(const struct inet_frag_queue *);
+ bool (*match)(const struct inet_frag_queue *q,
+ const void *arg);
void (*constructor)(struct inet_frag_queue *q,
- void *arg);
+ const void *arg);
void (*destructor)(struct inet_frag_queue *);
void (*skb_free)(struct sk_buff *);
void (*frag_expire)(unsigned long data);
@@ -87,19 +86,17 @@ void inet_frags_init_net(struct netns_frags *nf);
void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
-void inet_frag_destroy(struct inet_frag_queue *q,
- struct inet_frags *f, int *work);
-int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force);
+void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
- struct inet_frags *f, void *key, unsigned int hash)
- __releases(&f->lock);
+ struct inet_frags *f, void *key, unsigned int hash);
+
void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
const char *prefix);
static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
{
if (atomic_dec_and_test(&q->refcnt))
- inet_frag_destroy(q, f, NULL);
+ inet_frag_destroy(q, f);
}
/* Memory Tracking Functions. */
@@ -131,9 +128,9 @@ static inline void init_frag_mem_limit(struct netns_frags *nf)
percpu_counter_init(&nf->mem, 0);
}
-static inline int sum_frag_mem_limit(struct netns_frags *nf)
+static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
{
- int res;
+ unsigned int res;
local_bh_disable();
res = percpu_counter_sum_positive(&nf->mem);
@@ -142,31 +139,6 @@ static inline int sum_frag_mem_limit(struct netns_frags *nf)
return res;
}
-static inline void inet_frag_lru_move(struct inet_frag_queue *q)
-{
- spin_lock(&q->net->lru_lock);
- if (!list_empty(&q->lru_list))
- list_move_tail(&q->lru_list, &q->net->lru_list);
- spin_unlock(&q->net->lru_lock);
-}
-
-static inline void inet_frag_lru_del(struct inet_frag_queue *q)
-{
- spin_lock(&q->net->lru_lock);
- list_del_init(&q->lru_list);
- q->net->nqueues--;
- spin_unlock(&q->net->lru_lock);
-}
-
-static inline void inet_frag_lru_add(struct netns_frags *nf,
- struct inet_frag_queue *q)
-{
- spin_lock(&nf->lru_lock);
- list_add_tail(&q->lru_list, &nf->lru_list);
- q->net->nqueues++;
- spin_unlock(&nf->lru_lock);
-}
-
/* RFC 3168 support :
* We want to check ECN values of all fragments, do detect invalid combinations.
* In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
diff --git a/include/net/ip.h b/include/net/ip.h
index 2e8f055989c3..ca14799545fd 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -495,7 +495,6 @@ static inline struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
}
#endif
int ip_frag_mem(struct net *net);
-int ip_frag_nqueues(struct net *net);
/*
* Functions provided by ip_forward.c
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index a25017247457..a2db816e8461 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -299,11 +299,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
}
#if IS_ENABLED(CONFIG_IPV6)
-static inline int ip6_frag_nqueues(struct net *net)
-{
- return net->ipv6.frags.nqueues;
-}
-
static inline int ip6_frag_mem(struct net *net)
{
return sum_frag_mem_limit(&net->ipv6.frags);
@@ -496,8 +491,8 @@ struct ip6_create_arg {
u8 ecn;
};
-void ip6_frag_init(struct inet_frag_queue *q, void *a);
-bool ip6_frag_match(struct inet_frag_queue *q, void *a);
+void ip6_frag_init(struct inet_frag_queue *q, const void *a);
+bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
/*
* Equivalent of ipv4 struct ip