summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2025-09-15 13:45:37 +0200
committerChristian Brauner <brauner@kernel.org>2025-09-19 14:26:17 +0200
commit3ab378cfa793c648d4edf02bbfff3af8715aca91 (patch)
treeb5a57da7ff6c26434569ec95e8d2cf43f9cbe3c6 /kernel
parent8f5ae30d69d7543eee0d70083daf4de8fe15d585 (diff)
parent28ef38a9a2c7aa4dd8dfbb1d2b12f9ea84044327 (diff)
Merge patch series "ns: support file handles"
Christian Brauner <brauner@kernel.org> says: For a while now we have supported file handles for pidfds. This has proven to be very useful. Extend the concept to cover namespaces as well. After this patchset it is possible to encode and decode namespace file handles using the commong name_to_handle_at() and open_by_handle_at() apis. Namespaces file descriptors can already be derived from pidfds which means they aren't subject to overmount protection bugs. IOW, it's irrelevant if the caller would not have access to an appropriate /proc/<pid>/ns/ directory as they could always just derive the namespace based on a pidfd already. It has the same advantage as pidfds. It's possible to reliably and for the lifetime of the system refer to a namespace without pinning any resources and to compare them. Permission checking is kept simple. If the caller is located in the namespace the file handle refers to they are able to open it otherwise they must hold privilege over the owning namespace of the relevant namespace. Both the network namespace and the mount namespace already have an associated cookie that isn't recycled and is fully exposed to userspace. Move this into ns_common and use the same id space for all namespaces so they can trivially and reliably be compared. There's more coming based on the iterator infrastructure but the series is large enough and focuses on file handles. Extensive selftests included. * patches from https://lore.kernel.org/20250912-work-namespace-v2-0-1a247645cef5@kernel.org: (33 commits) selftests/namespaces: add file handle selftests selftests/namespaces: add identifier selftests tools: update nsfs.h uapi header nsfs: add missing id retrieval support nsfs: support exhaustive file handles nsfs: support file handles nsfs: add current_in_namespace() ns: add to_<type>_ns() to respective headers uts: support ns lookup user: support ns lookup time: support ns lookup pid: support ns lookup net: support ns lookup ipc: support ns lookup cgroup: support ns lookup mnt: support ns lookup nstree: make iterator generic ns: remove ns_alloc_inum() uts: use ns_common_init() user: use ns_common_init() ... Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.kexec1
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/cgroup/cpuset.c11
-rw-r--r--kernel/cgroup/namespace.c24
-rw-r--r--kernel/cgroup/rstat.c3
-rw-r--r--kernel/dma/contiguous.c2
-rw-r--r--kernel/dma/pool.c4
-rw-r--r--kernel/events/core.c6
-rw-r--r--kernel/futex/futex.h6
-rw-r--r--kernel/kexec_handover.c29
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/locking/ww_mutex.h6
-rw-r--r--kernel/nstree.c233
-rw-r--r--kernel/params.c7
-rw-r--r--kernel/pid_namespace.c13
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/rcu/tree.h1
-rw-r--r--kernel/rcu/tree_plugin.h8
-rw-r--r--kernel/sched/deadline.c18
-rw-r--r--kernel/sched/debug.c6
-rw-r--r--kernel/sched/ext.c4
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/time/namespace.c23
-rw-r--r--kernel/trace/fgraph.c1
-rw-r--r--kernel/trace/ftrace.c19
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c22
-rw-r--r--kernel/trace/trace.h10
-rw-r--r--kernel/trace/trace_functions_graph.c22
-rw-r--r--kernel/user_namespace.c17
-rw-r--r--kernel/utsname.c28
32 files changed, 415 insertions, 124 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec
index 2ee603a98813..1224dd937df0 100644
--- a/kernel/Kconfig.kexec
+++ b/kernel/Kconfig.kexec
@@ -97,6 +97,7 @@ config KEXEC_JUMP
config KEXEC_HANDOVER
bool "kexec handover"
depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE
+ depends on !DEFERRED_STRUCT_PAGE_INIT
select MEMBLOCK_KHO_SCRATCH
select KEXEC_FILE
select DEBUG_FS
diff --git a/kernel/Makefile b/kernel/Makefile
index c60623448235..b807516a1b43 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = fork.o exec_domain.o panic.o \
sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
- kthread.o sys_ni.o nsproxy.o \
+ kthread.o sys_ni.o nsproxy.o nstree.o \
notifier.o ksysfs.o cred.o reboot.o \
async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 312c6a8b55bb..092e6bf081ed 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -59,6 +59,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/psi.h>
+#include <linux/nstree.h>
#include <net/sock.h>
#define CREATE_TRACE_POINTS
@@ -6312,6 +6313,7 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cpuset_fs_type));
#endif
+ ns_tree_add(&init_cgroup_ns);
return 0;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f74d04429a29..27adb04df675 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -280,7 +280,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes)
{
if (!cpusets_insane_config() &&
movable_only_nodes(nodes)) {
- static_branch_enable(&cpusets_insane_config_key);
+ static_branch_enable_cpuslocked(&cpusets_insane_config_key);
pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
"Cpuset allocations might fail even with a lot of memory available.\n",
nodemask_pr_args(nodes));
@@ -1843,7 +1843,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (is_partition_valid(cs))
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
- } else if (is_partition_invalid(cs) &&
+ } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&
cpumask_subset(xcpus, parent->effective_xcpus)) {
struct cgroup_subsys_state *css;
struct cpuset *child;
@@ -3358,14 +3358,12 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
else
return -EINVAL;
- css_get(&cs->css);
cpus_read_lock();
mutex_lock(&cpuset_mutex);
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
- css_put(&cs->css);
return retval ?: nbytes;
}
@@ -3870,9 +3868,10 @@ retry:
partcmd = partcmd_invalidate;
/*
* On the other hand, an invalid partition root may be transitioned
- * back to a regular one.
+ * back to a regular one with a non-empty effective xcpus.
*/
- else if (is_partition_valid(parent) && is_partition_invalid(cs))
+ else if (is_partition_valid(parent) && is_partition_invalid(cs) &&
+ !cpumask_empty(cs->effective_xcpus))
partcmd = partcmd_update;
if (partcmd >= 0) {
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 144a464e45c6..5a327914b565 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -5,7 +5,7 @@
#include <linux/slab.h>
#include <linux/nsproxy.h>
#include <linux/proc_ns.h>
-
+#include <linux/nstree.h>
/* cgroup namespaces */
@@ -21,29 +21,28 @@ static void dec_cgroup_namespaces(struct ucounts *ucounts)
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
- struct cgroup_namespace *new_ns;
+ struct cgroup_namespace *new_ns __free(kfree) = NULL;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
- ret = ns_alloc_inum(&new_ns->ns);
- if (ret) {
- kfree(new_ns);
+ ret = ns_common_init(&new_ns->ns, &cgroupns_operations, true);
+ if (ret)
return ERR_PTR(ret);
- }
- refcount_set(&new_ns->ns.count, 1);
- new_ns->ns.ops = &cgroupns_operations;
- return new_ns;
+ ns_tree_add(new_ns);
+ return no_free_ptr(new_ns);
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
+ ns_tree_remove(ns);
put_css_set(ns->root_cset);
dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
- kfree(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
EXPORT_SYMBOL(free_cgroup_ns);
@@ -90,11 +89,6 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
return new_ns;
}
-static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
-{
- return container_of(ns, struct cgroup_namespace, ns);
-}
-
static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 981e2f77ad4e..a198e40c799b 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -479,6 +479,9 @@ void css_rstat_exit(struct cgroup_subsys_state *css)
if (!css_uses_rstat(css))
return;
+ if (!css->rstat_cpu)
+ return;
+
css_rstat_flush(css);
/* sanity check */
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 67af8a55185d..d9b9dcba6ff7 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -483,8 +483,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
pr_err("Reserved memory: unable to setup CMA region\n");
return err;
}
- /* Architecture specific contiguous memory fixup. */
- dma_contiguous_early_fixup(rmem->base, rmem->size);
if (default_cma)
dma_contiguous_default_area = cma;
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 7b04f7575796..ee45dee33d49 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -102,8 +102,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
#ifdef CONFIG_DMA_DIRECT_REMAP
addr = dma_common_contiguous_remap(page, pool_size,
- pgprot_dmacoherent(PAGE_KERNEL),
- __builtin_return_address(0));
+ pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)),
+ __builtin_return_address(0));
if (!addr)
goto free_page;
#else
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8060c2857bb2..872122e074e5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2665,6 +2665,9 @@ static void perf_log_itrace_start(struct perf_event *event);
static void perf_event_unthrottle(struct perf_event *event, bool start)
{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
event->hw.interrupts = 0;
if (start)
event->pmu->start(event, 0);
@@ -2674,6 +2677,9 @@ static void perf_event_unthrottle(struct perf_event *event, bool start)
static void perf_event_throttle(struct perf_event *event)
{
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
+
event->hw.interrupts = MAX_INTERRUPTS;
event->pmu->stop(event, 0);
if (event == event->group_leader)
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index c74eac572acd..2cd57096c38e 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -319,13 +319,13 @@ static __always_inline int futex_put_value(u32 val, u32 __user *to)
{
if (can_do_masked_user_access())
to = masked_user_access_begin(to);
- else if (!user_read_access_begin(to, sizeof(*to)))
+ else if (!user_write_access_begin(to, sizeof(*to)))
return -EFAULT;
unsafe_put_user(val, to, Efault);
- user_read_access_end();
+ user_write_access_end();
return 0;
Efault:
- user_read_access_end();
+ user_write_access_end();
return -EFAULT;
}
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index e49743ae52c5..ecd1ac210dbd 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -144,14 +144,34 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn,
unsigned int order)
{
struct kho_mem_phys_bits *bits;
- struct kho_mem_phys *physxa;
+ struct kho_mem_phys *physxa, *new_physxa;
const unsigned long pfn_high = pfn >> order;
might_sleep();
- physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa));
- if (IS_ERR(physxa))
- return PTR_ERR(physxa);
+ physxa = xa_load(&track->orders, order);
+ if (!physxa) {
+ int err;
+
+ new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL);
+ if (!new_physxa)
+ return -ENOMEM;
+
+ xa_init(&new_physxa->phys_bits);
+ physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa,
+ GFP_KERNEL);
+
+ err = xa_err(physxa);
+ if (err || physxa) {
+ xa_destroy(&new_physxa->phys_bits);
+ kfree(new_physxa);
+
+ if (err)
+ return err;
+ } else {
+ physxa = new_physxa;
+ }
+ }
bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS,
sizeof(*bits));
@@ -544,6 +564,7 @@ err_free_scratch_areas:
err_free_scratch_desc:
memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch));
err_disable_kho:
+ pr_warn("Failed to reserve scratch area, disabling kexec handover\n");
kho_enable = false;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 0e98b228a8ef..31b072e8d427 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -893,6 +893,7 @@ out:
return ret;
}
+EXPORT_SYMBOL_GPL(kthread_affine_preferred);
/*
* Re-affine kthreads according to their preferences
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 086fd5487ca7..31a785afee6c 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -342,8 +342,12 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
* When waking up the task to wound, be sure to clear the
* blocked_on pointer. Otherwise we can see circular
* blocked_on relationships that can't resolve.
+ *
+ * NOTE: We pass NULL here instead of lock, because we
+ * are waking the mutex owner, who may be currently
+ * blocked on a different mutex.
*/
- __clear_task_blocked_on(owner, lock);
+ __clear_task_blocked_on(owner, NULL);
wake_q_add(wake_q, owner);
}
return true;
diff --git a/kernel/nstree.c b/kernel/nstree.c
new file mode 100644
index 000000000000..bbe8bedc924c
--- /dev/null
+++ b/kernel/nstree.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/nstree.h>
+#include <linux/proc_ns.h>
+#include <linux/vfsdebug.h>
+
+struct ns_tree mnt_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNS,
+};
+
+struct ns_tree net_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWNET,
+};
+EXPORT_SYMBOL_GPL(net_ns_tree);
+
+struct ns_tree uts_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUTS,
+};
+
+struct ns_tree user_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWUSER,
+};
+
+struct ns_tree ipc_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWIPC,
+};
+
+struct ns_tree pid_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWPID,
+};
+
+struct ns_tree cgroup_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWCGROUP,
+};
+
+struct ns_tree time_ns_tree = {
+ .ns_tree = RB_ROOT,
+ .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list),
+ .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock),
+ .type = CLONE_NEWTIME,
+};
+
+DEFINE_COOKIE(namespace_cookie);
+
+static inline struct ns_common *node_to_ns(const struct rb_node *node)
+{
+ if (!node)
+ return NULL;
+ return rb_entry(node, struct ns_common, ns_tree_node);
+}
+
+static inline int ns_cmp(struct rb_node *a, const struct rb_node *b)
+{
+ struct ns_common *ns_a = node_to_ns(a);
+ struct ns_common *ns_b = node_to_ns(b);
+ u64 ns_id_a = ns_a->ns_id;
+ u64 ns_id_b = ns_b->ns_id;
+
+ if (ns_id_a < ns_id_b)
+ return -1;
+ if (ns_id_a > ns_id_b)
+ return 1;
+ return 0;
+}
+
+void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ struct rb_node *node, *prev;
+
+ VFS_WARN_ON_ONCE(!ns->ns_id);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type);
+
+ node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp);
+ /*
+ * If there's no previous entry simply add it after the
+ * head and if there is add it after the previous entry.
+ */
+ prev = rb_prev(&ns->ns_tree_node);
+ if (!prev)
+ list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list);
+ else
+ list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node);
+
+ write_sequnlock(&ns_tree->ns_tree_lock);
+
+ VFS_WARN_ON_ONCE(node);
+}
+
+void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree)
+{
+ VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node));
+ VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node));
+ VFS_WARN_ON_ONCE(ns->ops->type != ns_tree->type);
+
+ write_seqlock(&ns_tree->ns_tree_lock);
+ rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree);
+ list_bidir_del_rcu(&ns->ns_list_node);
+ RB_CLEAR_NODE(&ns->ns_tree_node);
+ write_sequnlock(&ns_tree->ns_tree_lock);
+}
+EXPORT_SYMBOL_GPL(__ns_tree_remove);
+
+static int ns_find(const void *key, const struct rb_node *node)
+{
+ const u64 ns_id = *(u64 *)key;
+ const struct ns_common *ns = node_to_ns(node);
+
+ if (ns_id < ns->ns_id)
+ return -1;
+ if (ns_id > ns->ns_id)
+ return 1;
+ return 0;
+}
+
+
+static struct ns_tree *ns_tree_from_type(int ns_type)
+{
+ switch (ns_type) {
+ case CLONE_NEWCGROUP:
+ return &cgroup_ns_tree;
+ case CLONE_NEWIPC:
+ return &ipc_ns_tree;
+ case CLONE_NEWNS:
+ return &mnt_ns_tree;
+ case CLONE_NEWNET:
+ return &net_ns_tree;
+ case CLONE_NEWPID:
+ return &pid_ns_tree;
+ case CLONE_NEWUSER:
+ return &user_ns_tree;
+ case CLONE_NEWUTS:
+ return &uts_ns_tree;
+ case CLONE_NEWTIME:
+ return &time_ns_tree;
+ }
+
+ return NULL;
+}
+
+struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type)
+{
+ struct ns_tree *ns_tree;
+ struct rb_node *node;
+ unsigned int seq;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage");
+
+ ns_tree = ns_tree_from_type(ns_type);
+ if (!ns_tree)
+ return NULL;
+
+ do {
+ seq = read_seqbegin(&ns_tree->ns_tree_lock);
+ node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find);
+ if (node)
+ break;
+ } while (read_seqretry(&ns_tree->ns_tree_lock, seq));
+
+ if (!node)
+ return NULL;
+
+ VFS_WARN_ON_ONCE(node_to_ns(node)->ops->type != ns_type);
+
+ return node_to_ns(node);
+}
+
+/**
+ * ns_tree_adjoined_rcu - find the next/previous namespace in the same
+ * tree
+ * @ns: namespace to start from
+ * @previous: if true find the previous namespace, otherwise the next
+ *
+ * Find the next or previous namespace in the same tree as @ns. If
+ * there is no next/previous namespace, -ENOENT is returned.
+ */
+struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns,
+ struct ns_tree *ns_tree, bool previous)
+{
+ struct list_head *list;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage");
+
+ if (previous)
+ list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node));
+ else
+ list = rcu_dereference(list_next_rcu(&ns->ns_list_node));
+ if (list_is_head(list, &ns_tree->ns_list))
+ return ERR_PTR(-ENOENT);
+
+ VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ops->type != ns_tree->type);
+
+ return list_entry_rcu(list, struct ns_common, ns_list_node);
+}
+
+/**
+ * ns_tree_gen_id - generate a new namespace id
+ * @ns: namespace to generate id for
+ *
+ * Generates a new namespace id and assigns it to the namespace. All
+ * namespaces types share the same id space and thus can be compared
+ * directly. IOW, when two ids of two namespace are equal, they are
+ * identical.
+ */
+u64 ns_tree_gen_id(struct ns_common *ns)
+{
+ guard(preempt)();
+ ns->ns_id = gen_cookie_next(&namespace_cookie);
+ return ns->ns_id;
+}
diff --git a/kernel/params.c b/kernel/params.c
index b92d64161b75..b96cfd693c99 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -513,13 +513,14 @@ EXPORT_SYMBOL(param_array_ops);
int param_set_copystring(const char *val, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
+ const size_t len = strnlen(val, kps->maxlen);
- if (strnlen(val, kps->maxlen) == kps->maxlen) {
+ if (len == kps->maxlen) {
pr_err("%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
return -ENOSPC;
}
- strcpy(kps->string, val);
+ memcpy(kps->string, val, len + 1);
return 0;
}
EXPORT_SYMBOL(param_set_copystring);
@@ -841,7 +842,7 @@ static void __init param_sysfs_builtin(void)
dot = strchr(kp->name, '.');
if (!dot) {
/* This happens for core_param() */
- strcpy(modname, "kernel");
+ strscpy(modname, "kernel");
name_len = 0;
} else {
name_len = dot - kp->name + 1;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7098ed44e717..9b327420309e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>
+#include <linux/nstree.h>
#include <uapi/linux/wait.h>
#include "pid_sysctl.h"
@@ -102,17 +103,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
if (ns->pid_cachep == NULL)
goto out_free_idr;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(&ns->ns, &pidns_operations, true);
if (err)
goto out_free_idr;
- ns->ns.ops = &pidns_operations;
ns->pid_max = PID_MAX_LIMIT;
err = register_pidns_sysctls(ns);
if (err)
goto out_free_inum;
- refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
@@ -124,6 +123,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
+ ns_tree_add(ns);
return ns;
out_free_inum:
@@ -149,6 +149,7 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
+ ns_tree_remove(ns);
unregister_pidns_sysctls(ns);
ns_free_inum(&ns->ns);
@@ -344,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
return 0;
}
-static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
-{
- return container_of(ns, struct pid_namespace, ns);
-}
-
static struct ns_common *pidns_get(struct task_struct *task)
{
struct pid_namespace *ns;
@@ -475,6 +471,7 @@ static __init int pid_namespaces_init(void)
#endif
register_pid_ns_sysctl_table_vm();
+ ns_tree_add(&init_pid_ns);
return 0;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 174ee243b349..8eff357b0436 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4262,6 +4262,8 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+ rcu_preempt_deferred_qs_init(rdp);
rcu_spawn_rnp_kthreads(rnp);
rcu_spawn_cpu_nocb_kthread(cpu);
ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index de6ca13a7b5f..b8bbe7960cda 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -488,6 +488,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
static void rcu_flavor_sched_clock_irq(int user);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fc14adf15cbb..4cd170b2d655 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -763,8 +763,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
- rdp->defer_qs_iw =
- IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
}
@@ -904,6 +902,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
}
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp)
+{
+ rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler);
+}
#else /* #ifdef CONFIG_PREEMPT_RCU */
/*
@@ -1103,6 +1105,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
}
+static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { }
+
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e2d51f4306b3..f25301267e47 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1496,10 +1496,12 @@ throttle:
}
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) {
- if (dl_server(dl_se))
- enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);
- else
+ if (dl_server(dl_se)) {
+ replenish_dl_new_period(dl_se, rq);
+ start_dl_timer(dl_se);
+ } else {
enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH);
+ }
}
if (!is_leftmost(dl_se, &rq->dl))
@@ -1611,7 +1613,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se)
static bool dl_server_stopped(struct sched_dl_entity *dl_se)
{
if (!dl_se->dl_server_active)
- return false;
+ return true;
if (dl_se->dl_server_idle) {
dl_server_stop(dl_se);
@@ -1849,7 +1851,9 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
u64 deadline = dl_se->deadline;
dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
inc_dl_deadline(dl_rq, deadline);
}
@@ -1859,7 +1863,9 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+
+ if (!dl_server(dl_se))
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
dec_dl_deadline(dl_rq, dl_se->deadline);
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 3f06ab84d53f..02e16b70a790 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -376,10 +376,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return -EINVAL;
}
- if (rq->cfs.h_nr_queued) {
- update_rq_clock(rq);
- dl_server_stop(&rq->fair_server);
- }
+ update_rq_clock(rq);
+ dl_server_stop(&rq->fair_server);
retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
if (retval)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7dedc9a16281..4ae32ef179dd 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5749,6 +5749,9 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
__setscheduler_class(p->policy, p->prio);
struct sched_enq_and_set_ctx ctx;
+ if (!tryget_task_struct(p))
+ continue;
+
if (old_class != new_class && p->se.sched_delayed)
dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
@@ -5761,6 +5764,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
sched_enq_and_set_task(&ctx);
check_class_changed(task_rq(p), p, old_class, p->prio);
+ put_task_struct(p);
}
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
diff --git a/kernel/signal.c b/kernel/signal.c
index e2c928de7d2c..fe9190d84f28 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -4067,6 +4067,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
{
struct pid *pid;
enum pid_type type;
+ int ret;
/* Enforce flags be set to 0 until we add an extension. */
if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
@@ -4108,7 +4109,10 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
}
}
- return do_pidfd_send_signal(pid, sig, type, info, flags);
+ ret = do_pidfd_send_signal(pid, sig, type, info, flags);
+ put_pid(pid);
+
+ return ret;
}
static int
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 667452768ed3..20b65f90549e 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -12,6 +12,7 @@
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
+#include <linux/nstree.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
@@ -88,25 +89,23 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+ ns = kzalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
- refcount_set(&ns->ns.count, 1);
-
ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!ns->vvar_page)
goto fail_free;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(&ns->ns, &timens_operations, true);
if (err)
goto fail_free_page;
ns->ucounts = ucounts;
- ns->ns.ops = &timens_operations;
ns->user_ns = get_user_ns(user_ns);
ns->offsets = old_ns->offsets;
ns->frozen_offsets = false;
+ ns_tree_add(ns);
return ns;
fail_free_page:
@@ -253,16 +252,13 @@ out:
void free_time_ns(struct time_namespace *ns)
{
+ ns_tree_remove(ns);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
__free_page(ns->vvar_page);
- kfree(ns);
-}
-
-static struct time_namespace *to_time_ns(struct ns_common *ns)
-{
- return container_of(ns, struct time_namespace, ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
@@ -490,3 +486,8 @@ struct time_namespace init_time_ns = {
.ns.ops = &timens_operations,
.frozen_offsets = true,
};
+
+void __init time_ns_init(void)
+{
+ ns_tree_add(&init_time_ns);
+}
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index f4d200f0c610..2a42c1036ea8 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1397,6 +1397,7 @@ error:
ftrace_graph_active--;
gops->saved_func = NULL;
fgraph_lru_release_index(i);
+ unregister_pm_notifier(&ftrace_suspend_notifier);
}
return ret;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 00b76d450a89..a69067367c29 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4661,13 +4661,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
} else {
iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
}
+ } else {
+ if (hash)
+ iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash);
+ else
+ iter->hash = EMPTY_HASH;
+ }
- if (!iter->hash) {
- trace_parser_put(&iter->parser);
- goto out_unlock;
- }
- } else
- iter->hash = hash;
+ if (!iter->hash) {
+ trace_parser_put(&iter->parser);
+ goto out_unlock;
+ }
ret = 0;
@@ -6543,9 +6547,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
ftrace_hash_move_and_update_ops(iter->ops, orig_hash,
iter->hash, filter_hash);
mutex_unlock(&ftrace_lock);
- } else {
- /* For read only, the hash is the ops hash */
- iter->hash = NULL;
}
mutex_unlock(&iter->ops->func_hash->regex_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bb71a0dc9d69..43460949ad3f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -7666,7 +7666,7 @@ static __init int test_ringbuffer(void)
rb_test_started = true;
set_current_state(TASK_INTERRUPTIBLE);
- /* Just run for 10 seconds */;
+ /* Just run for 10 seconds */
schedule_timeout(10 * HZ);
kthread_stop(rb_hammer);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4283ed4e8f59..1b7db732c0b1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1816,7 +1816,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
ret = get_user(ch, ubuf++);
if (ret)
- return ret;
+ goto fail;
read++;
cnt--;
@@ -1830,7 +1830,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
if (ret)
- return ret;
+ goto fail;
read++;
cnt--;
}
@@ -1848,12 +1848,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
while (cnt && !isspace(ch) && ch) {
if (parser->idx < parser->size - 1)
parser->buffer[parser->idx++] = ch;
- else
- return -EINVAL;
+ else {
+ ret = -EINVAL;
+ goto fail;
+ }
ret = get_user(ch, ubuf++);
if (ret)
- return ret;
+ goto fail;
read++;
cnt--;
}
@@ -1868,11 +1870,15 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
/* Make sure the parsed string always terminates with '\0'. */
parser->buffer[parser->idx] = 0;
} else {
- return -EINVAL;
+ ret = -EINVAL;
+ goto fail;
}
*ppos += read;
return read;
+fail:
+ trace_parser_fail(parser);
+ return ret;
}
/* TODO add a seq_buf_to_buffer() */
@@ -10632,10 +10638,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m
ret = print_trace_line(&iter);
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(&iter);
+
+ trace_printk_seq(&iter.seq);
}
touch_nmi_watchdog();
-
- trace_printk_seq(&iter.seq);
}
if (!cnt)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1dbf1d3cf2f1..5f4bed5842f9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1292,6 +1292,7 @@ bool ftrace_event_is_function(struct trace_event_call *call);
*/
struct trace_parser {
bool cont;
+ bool fail;
char *buffer;
unsigned idx;
unsigned size;
@@ -1299,7 +1300,7 @@ struct trace_parser {
static inline bool trace_parser_loaded(struct trace_parser *parser)
{
- return (parser->idx != 0);
+ return !parser->fail && parser->idx != 0;
}
static inline bool trace_parser_cont(struct trace_parser *parser)
@@ -1313,6 +1314,11 @@ static inline void trace_parser_clear(struct trace_parser *parser)
parser->idx = 0;
}
+static inline void trace_parser_fail(struct trace_parser *parser)
+{
+ parser->fail = true;
+}
+
extern int trace_parser_get_init(struct trace_parser *parser, int size);
extern void trace_parser_put(struct trace_parser *parser);
extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
@@ -2204,7 +2210,7 @@ static inline bool is_good_system_name(const char *name)
static inline void sanitize_event_name(char *name)
{
while (*name++ != '\0')
- if (*name == ':' || *name == '.')
+ if (*name == ':' || *name == '.' || *name == '*')
*name = '_';
}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 66e1a527cf1a..a7f4b9a47a71 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -27,14 +27,21 @@ struct fgraph_cpu_data {
unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
};
+struct fgraph_ent_args {
+ struct ftrace_graph_ent_entry ent;
+ /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */
+ unsigned long args[FTRACE_REGS_MAX_ARGS];
+};
+
struct fgraph_data {
struct fgraph_cpu_data __percpu *cpu_data;
/* Place to preserve last processed entry. */
union {
- struct ftrace_graph_ent_entry ent;
+ struct fgraph_ent_args ent;
+ /* TODO allow retaddr to have args */
struct fgraph_retaddr_ent_entry rent;
- } ent;
+ };
struct ftrace_graph_ret_entry ret;
int failed;
int cpu;
@@ -627,10 +634,13 @@ get_return_for_leaf(struct trace_iterator *iter,
* Save current and next entries for later reference
* if the output fails.
*/
- if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT))
- data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr;
- else
- data->ent.ent = *curr;
+ if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) {
+ data->rent = *(struct fgraph_retaddr_ent_entry *)curr;
+ } else {
+ int size = min((int)sizeof(data->ent), (int)iter->ent_size);
+
+ memcpy(&data->ent, curr, size);
+ }
/*
* If the next event is not a return type, then
* we only care about what type it is. Otherwise we can
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 682f40d5632d..cfb0e28f2779 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,6 +21,7 @@
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/nstree.h>
static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);
@@ -124,12 +125,11 @@ int create_user_ns(struct cred *new)
goto fail_dec;
ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
- ret = ns_alloc_inum(&ns->ns);
+
+ ret = ns_common_init(&ns->ns, &userns_operations, true);
if (ret)
goto fail_free;
- ns->ns.ops = &userns_operations;
- refcount_set(&ns->ns.count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->level = parent_ns->level + 1;
@@ -159,6 +159,7 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
set_cred_user_ns(new, ns);
+ ns_tree_add(ns);
return 0;
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -201,6 +202,7 @@ static void free_user_ns(struct work_struct *work)
do {
struct ucounts *ucounts = ns->ucounts;
parent = ns->parent;
+ ns_tree_remove(ns);
if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
kfree(ns->gid_map.forward);
kfree(ns->gid_map.reverse);
@@ -219,7 +221,8 @@ static void free_user_ns(struct work_struct *work)
retire_userns_sysctls(ns);
key_free_user_ns(ns);
ns_free_inum(&ns->ns);
- kmem_cache_free(user_ns_cachep, ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
dec_user_namespaces(ucounts);
ns = parent;
} while (refcount_dec_and_test(&parent->ns.count));
@@ -1322,11 +1325,6 @@ bool current_in_userns(const struct user_namespace *target_ns)
}
EXPORT_SYMBOL(current_in_userns);
-static inline struct user_namespace *to_user_ns(struct ns_common *ns)
-{
- return container_of(ns, struct user_namespace, ns);
-}
-
static struct ns_common *userns_get(struct task_struct *task)
{
struct user_namespace *user_ns;
@@ -1413,6 +1411,7 @@ const struct proc_ns_operations userns_operations = {
static __init int user_namespaces_init(void)
{
user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT);
+ ns_tree_add(&init_user_ns);
return 0;
}
subsys_initcall(user_namespaces_init);
diff --git a/kernel/utsname.c b/kernel/utsname.c
index b1ac3ca870f2..a682830742d3 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -13,6 +13,7 @@
#include <linux/cred.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/nstree.h>
#include <linux/sched/task.h>
static struct kmem_cache *uts_ns_cache __ro_after_init;
@@ -27,16 +28,6 @@ static void dec_uts_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
}
-static struct uts_namespace *create_uts_ns(void)
-{
- struct uts_namespace *uts_ns;
-
- uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
- if (uts_ns)
- refcount_set(&uts_ns->ns.count, 1);
- return uts_ns;
-}
-
/*
* Clone a new ns copying an original utsname, setting refcount to 1
* @old_ns: namespace to clone
@@ -55,21 +46,20 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = create_uts_ns();
+ ns = kmem_cache_zalloc(uts_ns_cache, GFP_KERNEL);
if (!ns)
goto fail_dec;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(&ns->ns, &utsns_operations, true);
if (err)
goto fail_free;
ns->ucounts = ucounts;
- ns->ns.ops = &utsns_operations;
-
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
ns->user_ns = get_user_ns(user_ns);
up_read(&uts_sem);
+ ns_tree_add(ns);
return ns;
fail_free:
@@ -105,15 +95,12 @@ struct uts_namespace *copy_utsname(unsigned long flags,
void free_uts_ns(struct uts_namespace *ns)
{
+ ns_tree_remove(ns);
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
- kmem_cache_free(uts_ns_cache, ns);
-}
-
-static inline struct uts_namespace *to_uts_ns(struct ns_common *ns)
-{
- return container_of(ns, struct uts_namespace, ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *utsns_get(struct task_struct *task)
@@ -174,4 +161,5 @@ void __init uts_ns_init(void)
offsetof(struct uts_namespace, name),
sizeof_field(struct uts_namespace, name),
NULL);
+ ns_tree_add(&init_uts_ns);
}