diff options
author | David S. Miller <davem@davemloft.net> | 2014-07-27 22:34:49 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2014-07-27 22:34:49 -0700 |
commit | 6ceed786648618f6398a54006365f688f5a32012 (patch) | |
tree | d015ddcf9d9d16e423efb63e9ddbd46fb02d95c3 /include/net | |
parent | ac3d2e5a9ef2f4d8f57c50070c4883ecb7cec29f (diff) | |
parent | 1bab4c75075b84675b96992ac47580a57c26958d (diff) |
Merge branch 'inet_frag_kill_lru_list'
Nikolay Aleksandrov says:
====================
inet: frag: cleanup and update
The end goal of this patchset is to remove the LRU list and to move the
frag eviction to a work queue. It also does a couple of necessary cleanups
and fixes. Brief patch descriptions:
Patches 1 - 3 inclusive: necessary clean ups
Patch 4 moves the eviction from the softirqs to a workqueue.
Patch 5 removes the nqueues counter which was protected by the LRU lock
Patch 6 removes the, by now unused, lru list.
Patch 7 moves the rebuild timer to the workqueue and schedules the rebuilds
only if we've hit the maximum queue length on some of the chains.
Patch 8 migrate the rwlock to a seqlock since the rehash is usually a rare
operation.
Patch 9 introduces an artificial global memory limit based on the value of
init_net's high_thresh which is used to cap the high_thresh of the
other namespaces. Also introduces some sane limits on the other
tunables, and makes it impossible to have low_thresh > high_thresh.
Here are some numbers from running netperf before and after the patchset:
Each test consists of the following setting: -I 95,5 -i 15,10
1. Bound test (-T 4,4)
1.1 Virtio before the patchset -
MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.177 () port 0 AF_INET : +/-2.500% @ 95% conf. : cpu bind
Socket Message Elapsed Messages CPU Service
Size Size Time Okay Errors Throughput Util Demand
bytes bytes secs # # 10^6bits/sec % SS us/KB
212992 64000 30.00 722177 0 12325.1 34.55 2.025
212992 30.00 368020 6280.9 34.05 0.752
1.2 Virtio after the patchset -
MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.177 () port 0 AF_INET : +/-2.500% @ 95% conf. : cpu bind
Socket Message Elapsed Messages CPU Service
Size Size Time Okay Errors Throughput Util Demand
bytes bytes secs # # 10^6bits/sec % SS us/KB
212992 64000 30.00 727030 0 12407.9 35.45 1.876
212992 30.00 505405 8625.5 34.92 0.693
2. Virtio unbound test
2.1 Before the patchset
MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.177 () port 0 AF_INET : +/-2.500% @ 95% conf.
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
212992 64000 30.00 730008 0 12458.77
212992 30.00 416721 7112.02
2.2 After the patchset
MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.177 () port 0 AF_INET : +/-2.500% @ 95% conf.
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
212992 64000 30.00 731129 0 12477.89
212992 30.00 487707 8323.50
3. 10 gig unbound tests
3.1 Before the patchset
MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.133.1 () port 0 AF_INET : +/-2.500% @ 95% conf.
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
212992 64000 30.00 417209 0 7120.33
212992 30.00 416740 7112.33
3.2 After the patchset
MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.133.1 () port 0 AF_INET : +/-2.500% @ 95% conf.
Socket Message Elapsed Messages
Size Size Time Okay Errors Throughput
bytes bytes secs # # 10^6bits/sec
212992 64000 30.00 438009 0 7475.33
212992 30.00 437630 7468.87
Given the options each netperf ran between 10 and 15 times for 30 seconds
to get the necessary confidence, also the tests themselves ran 3 times and
were consistent.
Another set of tests that I ran were parallel stress tests which consisted
of flooding the machine with fragmented packets from different sources with
frag timeout set to 0 (so there're lots of timeouts) and low_thresh set to
1 byte (so evictions are happening all the time) and on top of that running
a namespace create/destroy endless loop with network interfaces and
addresses that got flooded (for the brief periods they were up) in parallel.
This test ran for an hour without any issues.
====================
Diffstat (limited to 'include/net')
-rw-r--r-- | include/net/inet_frag.h | 70 | ||||
-rw-r--r-- | include/net/ip.h | 1 | ||||
-rw-r--r-- | include/net/ipv6.h | 9 |
3 files changed, 23 insertions, 57 deletions
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 6f59de98dabd..6f4930a0b660 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -4,10 +4,6 @@ #include <linux/percpu_counter.h> struct netns_frags { - int nqueues; - struct list_head lru_list; - spinlock_t lru_lock; - /* The percpu_counter "mem" need to be cacheline aligned. * mem.count must not share cacheline with other writers */ @@ -22,7 +18,6 @@ struct netns_frags { struct inet_frag_queue { spinlock_t lock; struct timer_list timer; /* when will this queue expire? */ - struct list_head lru_list; /* lru list member */ struct hlist_node list; atomic_t refcnt; struct sk_buff *fragments; /* list of received fragments */ @@ -32,6 +27,7 @@ struct inet_frag_queue { int meat; __u8 last_in; /* first/last segment arrived? */ +#define INET_FRAG_EVICTED 8 #define INET_FRAG_COMPLETE 4 #define INET_FRAG_FIRST_IN 2 #define INET_FRAG_LAST_IN 1 @@ -48,7 +44,7 @@ struct inet_frag_queue { * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or * struct frag_queue)) */ -#define INETFRAGS_MAXDEPTH 128 +#define INETFRAGS_MAXDEPTH 128 struct inet_frag_bucket { struct hlist_head chain; @@ -57,24 +53,27 @@ struct inet_frag_bucket { struct inet_frags { struct inet_frag_bucket hash[INETFRAGS_HASHSZ]; - /* This rwlock is a global lock (seperate per IPv4, IPv6 and - * netfilter). Important to keep this on a seperate cacheline. - * Its primarily a rebuild protection rwlock. - */ - rwlock_t lock ____cacheline_aligned_in_smp; - int secret_interval; - struct timer_list secret_timer; + + struct work_struct frags_work; + unsigned int next_bucket; + unsigned long last_rebuild_jiffies; + bool rebuild; /* The first call to hashfn is responsible to initialize * rnd. This is best done with net_get_random_once. + * + * rnd_seqlock is used to let hash insertion detect + * when it needs to re-lookup the hash chain to use. */ u32 rnd; + seqlock_t rnd_seqlock; int qsize; - unsigned int (*hashfn)(struct inet_frag_queue *); - bool (*match)(struct inet_frag_queue *q, void *arg); + unsigned int (*hashfn)(const struct inet_frag_queue *); + bool (*match)(const struct inet_frag_queue *q, + const void *arg); void (*constructor)(struct inet_frag_queue *q, - void *arg); + const void *arg); void (*destructor)(struct inet_frag_queue *); void (*skb_free)(struct sk_buff *); void (*frag_expire)(unsigned long data); @@ -87,19 +86,17 @@ void inet_frags_init_net(struct netns_frags *nf); void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); -void inet_frag_destroy(struct inet_frag_queue *q, - struct inet_frags *f, int *work); -int inet_frag_evictor(struct netns_frags *nf, struct inet_frags *f, bool force); +void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f); struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, - struct inet_frags *f, void *key, unsigned int hash) - __releases(&f->lock); + struct inet_frags *f, void *key, unsigned int hash); + void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, const char *prefix); static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f) { if (atomic_dec_and_test(&q->refcnt)) - inet_frag_destroy(q, f, NULL); + inet_frag_destroy(q, f); } /* Memory Tracking Functions. */ @@ -131,9 +128,9 @@ static inline void init_frag_mem_limit(struct netns_frags *nf) percpu_counter_init(&nf->mem, 0); } -static inline int sum_frag_mem_limit(struct netns_frags *nf) +static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) { - int res; + unsigned int res; local_bh_disable(); res = percpu_counter_sum_positive(&nf->mem); @@ -142,31 +139,6 @@ static inline int sum_frag_mem_limit(struct netns_frags *nf) return res; } -static inline void inet_frag_lru_move(struct inet_frag_queue *q) -{ - spin_lock(&q->net->lru_lock); - if (!list_empty(&q->lru_list)) - list_move_tail(&q->lru_list, &q->net->lru_list); - spin_unlock(&q->net->lru_lock); -} - -static inline void inet_frag_lru_del(struct inet_frag_queue *q) -{ - spin_lock(&q->net->lru_lock); - list_del_init(&q->lru_list); - q->net->nqueues--; - spin_unlock(&q->net->lru_lock); -} - -static inline void inet_frag_lru_add(struct netns_frags *nf, - struct inet_frag_queue *q) -{ - spin_lock(&nf->lru_lock); - list_add_tail(&q->lru_list, &nf->lru_list); - q->net->nqueues++; - spin_unlock(&nf->lru_lock); -} - /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. diff --git a/include/net/ip.h b/include/net/ip.h index 2e8f055989c3..ca14799545fd 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -495,7 +495,6 @@ static inline struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) } #endif int ip_frag_mem(struct net *net); -int ip_frag_nqueues(struct net *net); /* * Functions provided by ip_forward.c diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a25017247457..a2db816e8461 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -299,11 +299,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev) } #if IS_ENABLED(CONFIG_IPV6) -static inline int ip6_frag_nqueues(struct net *net) -{ - return net->ipv6.frags.nqueues; -} - static inline int ip6_frag_mem(struct net *net) { return sum_frag_mem_limit(&net->ipv6.frags); @@ -496,8 +491,8 @@ struct ip6_create_arg { u8 ecn; }; -void ip6_frag_init(struct inet_frag_queue *q, void *a); -bool ip6_frag_match(struct inet_frag_queue *q, void *a); +void ip6_frag_init(struct inet_frag_queue *q, const void *a); +bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); /* * Equivalent of ipv4 struct ip |