From 1a57feb847c56d6193f67d0e892c24e71f9e3ab1 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:26:23 +0300 Subject: net: Introduce net_sem for protection of pernet_list Currently, the mutex is mostly used to protect pernet operations list. It orders setup_net() and cleanup_net() with parallel {un,}register_pernet_operations() calls, so ->exit{,batch} methods of the same pernet operations are executed for a dying net, as were used to call ->init methods, even after the net namespace is unlinked from net_namespace_list in cleanup_net(). But there are several problems with scalability. The first one is that more than one net can't be created or destroyed at the same moment on the node. For big machines with many cpus running many containers it's very sensitive. The second one is that it's need to synchronize_rcu() after net is removed from net_namespace_list(): Destroy net_ns: cleanup_net() mutex_lock(&net_mutex) list_del_rcu(&net->list) synchronize_rcu() <--- Sleep there for ages list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list) mutex_unlock(&net_mutex) This primitive is not fast, especially on the systems with many processors and/or when preemptible RCU is enabled in config. So, all the time, while cleanup_net() is waiting for RCU grace period, creation of new net namespaces is not possible, the tasks, who makes it, are sleeping on the same mutex: Create net_ns: copy_net_ns() mutex_lock_killable(&net_mutex) <--- Sleep there for ages I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop with preemptible RCU enabled after CRIU tests round is finished. The solution is to convert net_mutex to the rw_semaphore and add fine grain locks to really small number of pernet_operations, what really need them. Then, pernet_operations::init/::exit methods, modifying the net-related data, will require down_read() locking only, while down_write() will be used for changing pernet_list (i.e., when modules are being loaded and unloaded). This gives signify performance increase, after all patch set is applied, like you may see here: %for i in {1..10000}; do unshare -n bash -c exit; done *before* real 1m40,377s user 0m9,672s sys 0m19,928s *after* real 0m17,007s user 0m5,311s sys 0m11,779 (5.8 times faster) This patch starts replacing net_mutex to net_sem. It adds rw_semaphore, describes the variables it protects, and makes to use, where appropriate. net_mutex is still present, and next patches will kick it out step-by-step. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/rtnetlink.h') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 1fdcde96eb65..e9ee9ad0a681 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -36,6 +36,7 @@ extern int rtnl_is_locked(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct mutex net_mutex; +extern struct rw_semaphore net_sem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); -- cgit v1.2.3 From 19efbd93e6fb05eab81856b4fc8d64211dd37088 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 12:58:38 +0300 Subject: net: Kill net_mutex We take net_mutex, when there are !async pernet_operations registered, and read locking of net_sem is not enough. But we may get rid of taking the mutex, and just change the logic to write lock net_sem in such cases. This obviously reduces the number of lock operations, we do. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/rtnetlink.h') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index e9ee9ad0a681..3573b4bf2fdf 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -35,7 +35,6 @@ extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct mutex net_mutex; extern struct rw_semaphore net_sem; #ifdef CONFIG_PROVE_LOCKING -- cgit v1.2.3 From 79ffdfc6522ae33d8a33e971070c08ee5f27439b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 14 Mar 2018 22:17:20 +0300 Subject: net: Add rtnl_lock_killable() rtnl_lock() is widely used mutex in kernel. Some of kernel code does memory allocations under it. In case of memory deficit this may invoke OOM killer, but the problem is a killed task can't exit if it's waiting for the mutex. This may be a reason of deadlock and panic. This patch adds a new primitive, which responds on SIGKILL, and it allows to use it in the places, where we don't want to sleep forever. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/rtnetlink.h') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3573b4bf2fdf..562a175c35a9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -33,6 +33,7 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); +extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore net_sem; -- cgit v1.2.3 From 4420bf21fb6c0306e36ad58ade1e741fba57ce65 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 27 Mar 2018 18:02:23 +0300 Subject: net: Rename net_sem to pernet_ops_rwsem net_sem is some undefined area name, so it will be better to make the area more defined. Rename it to pernet_ops_rwsem for better readability and better intelligibility. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/rtnetlink.h') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 562a175c35a9..c7d1e4689325 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -36,7 +36,7 @@ extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct rw_semaphore net_sem; +extern struct rw_semaphore pernet_ops_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); -- cgit v1.2.3 From f0b07bb151b098d291fd1fd71ef7a2df56fb124a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:20:32 +0300 Subject: net: Introduce net_rwsem to protect net_namespace_list rtnl_lock() is used everywhere, and contention is very high. When someone wants to iterate over alive net namespaces, he/she has no a possibility to do that without exclusive lock. But the exclusive rtnl_lock() in such places is overkill, and it just increases the contention. Yes, there is already for_each_net_rcu() in kernel, but it requires rcu_read_lock(), and this can't be sleepable. Also, sometimes it may be need really prevent net_namespace_list growth, so for_each_net_rcu() is not fit there. This patch introduces new rw_semaphore, which will be used instead of rtnl_mutex to protect net_namespace_list. It is sleepable and allows not-exclusive iterations over net namespaces list. It allows to stop using rtnl_lock() in several places (what is made in next patches) and makes less the time, we keep rtnl_mutex. Here we just add new lock, while the explanation of we can remove rtnl_lock() there are in next patches. Fine grained locks generally are better, then one big lock, so let's do that with net_namespace_list, while the situation allows that. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/rtnetlink.h') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c7d1e4689325..5225832bd6ff 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -37,6 +37,7 @@ extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; +extern struct rw_semaphore net_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); -- cgit v1.2.3