author    Jakub Kicinski <kuba@kernel.org>  2024-10-03 10:05:55 -0700
committer Jakub Kicinski <kuba@kernel.org>  2024-10-03 10:05:55 -0700
commit    f66ebf37d69cc700ca884c6a18c2258caf8b151b (patch)
tree      9971d3c37e198dc9474184427fe5bb7408bb8bf5 /kernel
parent    c30a3f54e661d01df2bf193398336155089dd502 (diff)
parent    8c245fe7dde3bf776253550fc914a36293db4ff3 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR. No conflicts and no adjacent changes.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/bpf_iter.c            1
-rw-r--r--  kernel/bpf/core.c                2
-rw-r--r--  kernel/debug/gdbstub.c           2
-rw-r--r--  kernel/events/core.c             1
-rw-r--r--  kernel/events/uprobes.c          2
-rw-r--r--  kernel/jump_label.c             34
-rw-r--r--  kernel/locking/lockdep.c        53
-rw-r--r--  kernel/locking/lockdep_proc.c    2
-rw-r--r--  kernel/locking/rwsem.c          22
-rw-r--r--  kernel/module/Kconfig           77
-rw-r--r--  kernel/module/debug_kmemleak.c  18
-rw-r--r--  kernel/module/sysfs.c           63
-rw-r--r--  kernel/power/user.c              1
-rw-r--r--  kernel/relay.c                   1
-rw-r--r--  kernel/sched/ext.c             320
-rw-r--r--  kernel/signal.c                 21
-rw-r--r--  kernel/static_call_inline.c     13
-rw-r--r--  kernel/time/posix-clock.c        1
-rw-r--r--  kernel/trace/rv/rv.c             3
-rw-r--r--  kernel/trace/rv/rv_reactors.c    1
-rw-r--r--  kernel/trace/trace.c             3
21 files changed, 347 insertions, 294 deletions
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 112581cf97e7..106735145948 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -283,7 +283,6 @@ static int iter_release(struct inode *inode, struct file *file)
const struct file_operations bpf_iter_fops = {
.open = iter_open,
- .llseek = no_llseek,
.read = bpf_seq_read,
.release = iter_release,
};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4e07cc057d6f..5e77c58e0601 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -40,7 +40,7 @@
#include <linux/execmem.h>
#include <asm/barrier.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 9d34d2364b5a..f625172d4b67 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -33,7 +33,7 @@
#include <linux/reboot.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include "debug_core.h"
#define KGDB_MAX_THREAD_QUERY 17
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5a8071c45c80..e3589c4287cb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6821,7 +6821,6 @@ static int perf_fasync(int fd, struct file *filp, int on)
}
static const struct file_operations perf_fops = {
- .llseek = no_llseek,
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 2ec796e2f055..4b52cb2ae6d6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1545,7 +1545,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
if (!area->bitmap)
goto free_area;
- area->page = alloc_page(GFP_HIGHUSER);
+ area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
if (!area->page)
goto free_bitmap;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 6dc76b590703..93a822d3c468 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -168,7 +168,7 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key)
jump_label_update(key);
/*
* Ensure that when static_key_fast_inc_not_disabled() or
- * static_key_slow_try_dec() observe the positive value,
+ * static_key_dec_not_one() observe the positive value,
* they must also observe all the text changes.
*/
atomic_set_release(&key->enabled, 1);
@@ -250,7 +250,7 @@ void static_key_disable(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_disable);
-static bool static_key_slow_try_dec(struct static_key *key)
+static bool static_key_dec_not_one(struct static_key *key)
{
int v;
@@ -274,6 +274,14 @@ static bool static_key_slow_try_dec(struct static_key *key)
* enabled. This suggests an ordering problem on the user side.
*/
WARN_ON_ONCE(v < 0);
+
+ /*
+ * Warn about underflow, and lie about success in an attempt to
+ * not make things worse.
+ */
+ if (WARN_ON_ONCE(v == 0))
+ return true;
+
if (v <= 1)
return false;
} while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1)));
@@ -284,15 +292,27 @@ static bool static_key_slow_try_dec(struct static_key *key)
static void __static_key_slow_dec_cpuslocked(struct static_key *key)
{
lockdep_assert_cpus_held();
+ int val;
- if (static_key_slow_try_dec(key))
+ if (static_key_dec_not_one(key))
return;
guard(mutex)(&jump_label_mutex);
- if (atomic_cmpxchg(&key->enabled, 1, 0) == 1)
+ val = atomic_read(&key->enabled);
+ /*
+ * It should be impossible to observe -1 with jump_label_mutex held,
+ * see static_key_slow_inc_cpuslocked().
+ */
+ if (WARN_ON_ONCE(val == -1))
+ return;
+ /*
+ * Cannot already be 0, something went sideways.
+ */
+ if (WARN_ON_ONCE(val == 0))
+ return;
+
+ if (atomic_dec_and_test(&key->enabled))
jump_label_update(key);
- else
- WARN_ON_ONCE(!static_key_slow_try_dec(key));
}
static void __static_key_slow_dec(struct static_key *key)
@@ -329,7 +349,7 @@ void __static_key_slow_dec_deferred(struct static_key *key,
{
STATIC_KEY_CHECK_USE(key);
- if (static_key_slow_try_dec(key))
+ if (static_key_dec_not_one(key))
return;
schedule_delayed_work(work, timeout);
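
The jump_label changes above hinge on a "decrement only while the count is above one" cmpxchg loop (static_key_dec_not_one()), which leaves the final 1 -> 0 transition to a slow path that runs under jump_label_mutex. Below is a minimal user-space sketch of that loop using C11 atomics; it illustrates the pattern only, not the kernel implementation, and the names are invented for the example.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Decrement @cnt unless it is 1 (or less); the last reference is left
 * for a slow path that can take a sleeping lock. */
static bool dec_not_one(atomic_int *cnt)
{
	int v = atomic_load(cnt);

	do {
		if (v <= 1)
			return false;	/* caller must take the slow path */
	} while (!atomic_compare_exchange_weak(cnt, &v, v - 1));

	return true;
}

int main(void)
{
	atomic_int refs = 3;

	while (dec_not_one(&refs))
		;
	printf("fast path stopped at %d\n", atomic_load(&refs));	/* prints 1 */
	return 0;
}
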
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7963deac33c3..536bd471557f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -788,7 +788,7 @@ static void lockdep_print_held_locks(struct task_struct *p)
printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
else
printk("%d lock%s held by %s/%d:\n", depth,
- depth > 1 ? "s" : "", p->comm, task_pid_nr(p));
+ str_plural(depth), p->comm, task_pid_nr(p));
/*
* It's not reliable to print a task's held locks if it's not sleeping
* and it's not the current task.
@@ -2084,6 +2084,9 @@ static noinline void print_bfs_bug(int ret)
/*
* Breadth-first-search failed, graph got corrupted?
*/
+ if (ret == BFS_EQUEUEFULL)
+ pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n");
+
WARN(1, "lockdep bfs error:%d\n", ret);
}
@@ -6263,25 +6266,27 @@ static struct pending_free *get_pending_free(void)
static void free_zapped_rcu(struct rcu_head *cb);
/*
- * Schedule an RCU callback if no RCU callback is pending. Must be called with
- * the graph lock held.
- */
-static void call_rcu_zapped(struct pending_free *pf)
+* See if we need to queue an RCU callback; must be called with
+* the lockdep lock held, returns false if either we don't have
+* any pending free or the callback is already scheduled.
+* Otherwise, a call_rcu() must follow this function call.
+*/
+static bool prepare_call_rcu_zapped(struct pending_free *pf)
{
WARN_ON_ONCE(inside_selftest());
if (list_empty(&pf->zapped))
- return;
+ return false;
if (delayed_free.scheduled)
- return;
+ return false;
delayed_free.scheduled = true;
WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf);
delayed_free.index ^= 1;
- call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+ return true;
}
/* The caller must hold the graph lock. May be called from RCU context. */
@@ -6307,6 +6312,7 @@ static void free_zapped_rcu(struct rcu_head *ch)
{
struct pending_free *pf;
unsigned long flags;
+ bool need_callback;
if (WARN_ON_ONCE(ch != &delayed_free.rcu_head))
return;
@@ -6318,14 +6324,18 @@ static void free_zapped_rcu(struct rcu_head *ch)
pf = delayed_free.pf + (delayed_free.index ^ 1);
__free_zapped_classes(pf);
delayed_free.scheduled = false;
+ need_callback =
+ prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
/*
- * If there's anything on the open list, close and start a new callback.
- */
- call_rcu_zapped(delayed_free.pf + delayed_free.index);
+ * If there's pending free and its callback has not been scheduled,
+ * queue an RCU callback.
+ */
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
- lockdep_unlock();
- raw_local_irq_restore(flags);
}
/*
@@ -6365,6 +6375,7 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
{
struct pending_free *pf;
unsigned long flags;
+ bool need_callback;
init_data_structures_once();
@@ -6372,10 +6383,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
lockdep_lock();
pf = get_pending_free();
__lockdep_free_key_range(pf, start, size);
- call_rcu_zapped(pf);
+ need_callback = prepare_call_rcu_zapped(pf);
lockdep_unlock();
raw_local_irq_restore(flags);
-
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
/*
* Wait for any possible iterators from look_up_lock_class() to pass
* before continuing to free the memory they refer to.
@@ -6469,6 +6481,7 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock)
struct pending_free *pf;
unsigned long flags;
int locked;
+ bool need_callback = false;
raw_local_irq_save(flags);
locked = graph_lock();
@@ -6477,11 +6490,13 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock)
pf = get_pending_free();
__lockdep_reset_lock(pf, lock);
- call_rcu_zapped(pf);
+ need_callback = prepare_call_rcu_zapped(pf);
graph_unlock();
out_irq:
raw_local_irq_restore(flags);
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
}
/*
@@ -6525,6 +6540,7 @@ void lockdep_unregister_key(struct lock_class_key *key)
struct pending_free *pf;
unsigned long flags;
bool found = false;
+ bool need_callback = false;
might_sleep();
@@ -6545,11 +6561,14 @@ void lockdep_unregister_key(struct lock_class_key *key)
if (found) {
pf = get_pending_free();
__lockdep_free_key_range(pf, key, 1);
- call_rcu_zapped(pf);
+ need_callback = prepare_call_rcu_zapped(pf);
}
lockdep_unlock();
raw_local_irq_restore(flags);
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+
/* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
synchronize_rcu();
}
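
The lockdep rework above follows one pattern throughout: prepare_call_rcu_zapped() only decides, while the lockdep lock is held with interrupts disabled, whether an RCU callback needs to be queued; the actual call_rcu() happens after the lock is dropped. A small user-space sketch of that shape, with a pthread mutex standing in for the lockdep lock and schedule_callback() standing in for call_rcu() (both stand-ins are invented for the example):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool pending;	/* something was zapped and awaits freeing */
static bool scheduled;	/* a callback is already queued */

static void schedule_callback(void)
{
	printf("callback queued with no lock held\n");
}

/* Caller holds @lock: report whether a callback must be queued. */
static bool prepare_callback(void)
{
	if (!pending || scheduled)
		return false;
	scheduled = true;
	return true;
}

int main(void)
{
	bool need_callback;

	pthread_mutex_lock(&lock);
	pending = true;
	need_callback = prepare_callback();
	pthread_mutex_unlock(&lock);

	if (need_callback)
		schedule_callback();	/* safe: the lock is no longer held */
	return 0;
}
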
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index e2bfb1db589d..6db0f43fc4df 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -424,7 +424,7 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
for (i = 0; i < offset; i++)
seq_puts(m, " ");
for (i = 0; i < length; i++)
- seq_printf(m, "%c", c);
+ seq_putc(m, c);
seq_puts(m, "\n");
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 5ded7dff46ef..2bbb6eca5144 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -181,12 +181,21 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
__rwsem_set_reader_owned(sem, current);
}
+#ifdef CONFIG_DEBUG_RWSEMS
+/*
+ * Return just the real task structure pointer of the owner
+ */
+static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+{
+ return (struct task_struct *)
+ (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
/*
* Return true if the rwsem is owned by a reader.
*/
static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
{
-#ifdef CONFIG_DEBUG_RWSEMS
/*
* Check the count to see if it is write-locked.
*/
@@ -194,11 +203,9 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
if (count & RWSEM_WRITER_MASK)
return false;
-#endif
return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
}
-#ifdef CONFIG_DEBUG_RWSEMS
/*
* With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
* is a task pointer in owner of a reader-owned rwsem, it will be the
@@ -266,15 +273,6 @@ static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
}
/*
- * Return just the real task structure pointer of the owner
- */
-static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
-{
- return (struct task_struct *)
- (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
-}
-
-/*
* Return the real task structure pointer of the owner and the embedded
* flags in the owner. pflags must be non-NULL.
*/
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 05a9a06a140c..7c6588148d42 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -229,7 +229,7 @@ comment "Do not forget to sign required modules with scripts/sign-file"
depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL
choice
- prompt "Which hash algorithm should modules be signed with?"
+ prompt "Hash algorithm to sign modules"
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
help
This determines which sort of hashing algorithm will be used during
@@ -239,31 +239,31 @@ choice
the signature on that module.
config MODULE_SIG_SHA1
- bool "Sign modules with SHA-1"
+ bool "SHA-1"
select CRYPTO_SHA1
config MODULE_SIG_SHA256
- bool "Sign modules with SHA-256"
+ bool "SHA-256"
select CRYPTO_SHA256
config MODULE_SIG_SHA384
- bool "Sign modules with SHA-384"
+ bool "SHA-384"
select CRYPTO_SHA512
config MODULE_SIG_SHA512
- bool "Sign modules with SHA-512"
+ bool "SHA-512"
select CRYPTO_SHA512
config MODULE_SIG_SHA3_256
- bool "Sign modules with SHA3-256"
+ bool "SHA3-256"
select CRYPTO_SHA3
config MODULE_SIG_SHA3_384
- bool "Sign modules with SHA3-384"
+ bool "SHA3-384"
select CRYPTO_SHA3
config MODULE_SIG_SHA3_512
- bool "Sign modules with SHA3-512"
+ bool "SHA3-512"
select CRYPTO_SHA3
endchoice
@@ -279,64 +279,65 @@ config MODULE_SIG_HASH
default "sha3-384" if MODULE_SIG_SHA3_384
default "sha3-512" if MODULE_SIG_SHA3_512
-choice
- prompt "Module compression mode"
+config MODULE_COMPRESS
+ bool "Module compression"
help
- This option allows you to choose the algorithm which will be used to
- compress modules when 'make modules_install' is run. (or, you can
- choose to not compress modules at all.)
-
- External modules will also be compressed in the same way during the
- installation.
-
- For modules inside an initrd or initramfs, it's more efficient to
- compress the whole initrd or initramfs instead.
-
+ Enable module compression to reduce on-disk size of module binaries.
This is fully compatible with signed modules.
- Please note that the tool used to load modules needs to support the
- corresponding algorithm. module-init-tools MAY support gzip, and kmod
- MAY support gzip, xz and zstd.
+ The tool used to work with modules needs to support the selected
+ compression type. kmod MAY support gzip, xz and zstd. Other tools
+ might have a limited selection of the supported types.
- Your build system needs to provide the appropriate compression tool
- to compress the modules.
+ Note that for modules inside an initrd or initramfs, it's more
+ efficient to compress the whole ramdisk instead.
- If in doubt, select 'None'.
+ If unsure, say N.
-config MODULE_COMPRESS_NONE
- bool "None"
+choice
+ prompt "Module compression type"
+ depends on MODULE_COMPRESS
help
- Do not compress modules. The installed modules are suffixed
- with .ko.
+ Choose the supported algorithm for module compression.
config MODULE_COMPRESS_GZIP
bool "GZIP"
help
- Compress modules with GZIP. The installed modules are suffixed
- with .ko.gz.
+ Support modules compressed with GZIP. The installed modules are
+ suffixed with .ko.gz.
config MODULE_COMPRESS_XZ
bool "XZ"
help
- Compress modules with XZ. The installed modules are suffixed
- with .ko.xz.
+ Support modules compressed with XZ. The installed modules are
+ suffixed with .ko.xz.
config MODULE_COMPRESS_ZSTD
bool "ZSTD"
help
- Compress modules with ZSTD. The installed modules are suffixed
- with .ko.zst.
+ Support modules compressed with ZSTD. The installed modules are
+ suffixed with .ko.zst.
endchoice
+config MODULE_COMPRESS_ALL
+ bool "Automatically compress all modules"
+ default y
+ depends on MODULE_COMPRESS
+ help
+ Compress all modules during 'make modules_install'.
+
+ Your build system needs to provide the appropriate compression tool
+ for the selected compression type. External modules will also be
+ compressed in the same way during the installation.
+
config MODULE_DECOMPRESS
bool "Support in-kernel module decompression"
- depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
+ depends on MODULE_COMPRESS
select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
select XZ_DEC if MODULE_COMPRESS_XZ
select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
help
-
Support for decompressing kernel modules by the kernel itself
instead of relying on userspace to perform this task. Useful when
load pinning security policy is enabled.
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
index 12a569d361e8..b4cc03842d70 100644
--- a/kernel/module/debug_kmemleak.c
+++ b/kernel/module/debug_kmemleak.c
@@ -12,19 +12,9 @@
void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
{
- unsigned int i;
-
- /* only scan the sections containing data */
- kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- /* Scan all writable sections that's not executable */
- if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
- !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
- (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
- info->sechdrs[i].sh_size, GFP_KERNEL);
+ /* only scan writable, non-executable sections */
+ for_each_mod_mem_type(type) {
+ if (type != MOD_DATA && type != MOD_INIT_DATA)
+ kmemleak_no_scan(mod->mem[type].base);
}
}
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index 26efe1305c12..456358e1fdc4 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -69,12 +69,13 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
kfree(sect_attrs);
}
-static void add_sect_attrs(struct module *mod, const struct load_info *info)
+static int add_sect_attrs(struct module *mod, const struct load_info *info)
{
unsigned int nloaded = 0, i, size[2];
struct module_sect_attrs *sect_attrs;
struct module_sect_attr *sattr;
struct bin_attribute **gattr;
+ int ret;
/* Count loaded sections and allocate structures */
for (i = 0; i < info->hdr->e_shnum; i++)
@@ -85,7 +86,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
if (!sect_attrs)
- return;
+ return -ENOMEM;
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
@@ -103,8 +104,10 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
sattr->address = sec->sh_addr;
sattr->battr.attr.name =
kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
- if (!sattr->battr.attr.name)
+ if (!sattr->battr.attr.name) {
+ ret = -ENOMEM;
goto out;
+ }
sect_attrs->nsections++;
sattr->battr.read = module_sect_read;
sattr->battr.size = MODULE_SECT_READ_SIZE;
@@ -113,13 +116,15 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
}
*gattr = NULL;
- if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+ ret = sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp);
+ if (ret)
goto out;
mod->sect_attrs = sect_attrs;
- return;
+ return 0;
out:
free_sect_attrs(sect_attrs);
+ return ret;
}
static void remove_sect_attrs(struct module *mod)
@@ -158,15 +163,12 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
kfree(notes_attrs);
}
-static void add_notes_attrs(struct module *mod, const struct load_info *info)
+static int add_notes_attrs(struct module *mod, const struct load_info *info)
{
unsigned int notes, loaded, i;
struct module_notes_attrs *notes_attrs;
struct bin_attribute *nattr;
-
- /* failed to create section attributes, so can't create notes */
- if (!mod->sect_attrs)
- return;
+ int ret;
/* Count notes sections and allocate structures. */
notes = 0;
@@ -176,12 +178,12 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info)
++notes;
if (notes == 0)
- return;
+ return 0;
notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
GFP_KERNEL);
if (!notes_attrs)
- return;
+ return -ENOMEM;
notes_attrs->notes = notes;
nattr = &notes_attrs->attrs[0];
@@ -201,19 +203,23 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info)
}
notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
- if (!notes_attrs->dir)
+ if (!notes_attrs->dir) {
+ ret = -ENOMEM;
goto out;
+ }
- for (i = 0; i < notes; ++i)
- if (sysfs_create_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]))
+ for (i = 0; i < notes; ++i) {
+ ret = sysfs_create_bin_file(notes_attrs->dir, &notes_attrs->attrs[i]);
+ if (ret)
goto out;
+ }
mod->notes_attrs = notes_attrs;
- return;
+ return 0;
out:
free_notes_attrs(notes_attrs, i);
+ return ret;
}
static void remove_notes_attrs(struct module *mod)
@@ -223,9 +229,15 @@ static void remove_notes_attrs(struct module *mod)
}
#else /* !CONFIG_KALLSYMS */
-static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { }
+static inline int add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ return 0;
+}
static inline void remove_sect_attrs(struct module *mod) { }
-static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { }
+static inline int add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ return 0;
+}
static inline void remove_notes_attrs(struct module *mod) { }
#endif /* CONFIG_KALLSYMS */
@@ -385,11 +397,20 @@ int mod_sysfs_setup(struct module *mod,
if (err)
goto out_unreg_modinfo_attrs;
- add_sect_attrs(mod, info);
- add_notes_attrs(mod, info);
+ err = add_sect_attrs(mod, info);
+ if (err)
+ goto out_del_usage_links;
+
+ err = add_notes_attrs(mod, info);
+ if (err)
+ goto out_unreg_sect_attrs;
return 0;
+out_unreg_sect_attrs:
+ remove_sect_attrs(mod);
+out_del_usage_links:
+ del_usage_links(mod);
out_unreg_modinfo_attrs:
module_remove_modinfo_attrs(mod, -1);
out_unreg_param:
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3aa41ba22129..3f9e3efb9f6e 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -447,7 +447,6 @@ static const struct file_operations snapshot_fops = {
.release = snapshot_release,
.read = snapshot_read,
.write = snapshot_write,
- .llseek = no_llseek,
.unlocked_ioctl = snapshot_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = snapshot_compat_ioctl,
diff --git a/kernel/relay.c b/kernel/relay.c
index a8e90e98bf2c..a8ae436dc77e 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1079,7 +1079,6 @@ const struct file_operations relay_file_operations = {
.poll = relay_file_poll,
.mmap = relay_file_mmap,
.read = relay_file_read,
- .llseek = no_llseek,
.release = relay_file_release,
};
EXPORT_SYMBOL_GPL(relay_file_operations);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index c09e3dc38c34..3cd7c50a51c5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,6 +9,7 @@
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
enum scx_consts {
+ SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4,
SCX_DSP_DFL_MAX_BATCH = 32,
SCX_DSP_MAX_LOOPS = 32,
SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
@@ -778,7 +779,6 @@ enum scx_tg_flags {
};
enum scx_ops_enable_state {
- SCX_OPS_PREPPING,
SCX_OPS_ENABLING,
SCX_OPS_ENABLED,
SCX_OPS_DISABLING,
@@ -786,7 +786,6 @@ enum scx_ops_enable_state {
};
static const char *scx_ops_enable_state_str[] = {
- [SCX_OPS_PREPPING] = "prepping",
[SCX_OPS_ENABLING] = "enabling",
[SCX_OPS_ENABLED] = "enabled",
[SCX_OPS_DISABLING] = "disabling",
@@ -854,6 +853,7 @@ DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
+static bool scx_ops_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -925,8 +925,15 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
*/
static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
-/* dispatch queues */
-static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
+/*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is
+ * to avoid live-locking in bypass mode where all tasks are dispatched to
+ * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't
+ * sufficient, it can be further split.
+ */
+static struct scx_dispatch_q **global_dsqs;
static const struct rhashtable_params dsq_hash_params = {
.key_len = 8,
@@ -1029,6 +1036,16 @@ static bool u32_before(u32 a, u32 b)
return (s32)(a - b) < 0;
}
+static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
+{
+ return global_dsqs[cpu_to_node(task_cpu(p))];
+}
+
+static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
+{
+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
+}
+
/*
* scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
* ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -1637,7 +1654,7 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
scx_ops_error("attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
- dsq = &scx_dsq_global;
+ dsq = find_global_dsq(p);
raw_spin_lock(&dsq->lock);
}
}
@@ -1803,21 +1820,6 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
raw_spin_unlock(&dsq->lock);
}
-static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
-{
- return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
-}
-
-static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id)
-{
- lockdep_assert(rcu_read_lock_any_held());
-
- if (dsq_id == SCX_DSQ_GLOBAL)
- return &scx_dsq_global;
- else
- return find_user_dsq(dsq_id);
-}
-
static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
struct task_struct *p)
{
@@ -1830,16 +1832,20 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
- return &scx_dsq_global;
+ return find_global_dsq(p);
return &cpu_rq(cpu)->scx.local_dsq;
}
- dsq = find_non_local_dsq(dsq_id);
+ if (dsq_id == SCX_DSQ_GLOBAL)
+ dsq = find_global_dsq(p);
+ else
+ dsq = find_user_dsq(dsq_id);
+
if (unlikely(!dsq)) {
scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
dsq_id, p->comm, p->pid);
- return &scx_dsq_global;
+ return find_global_dsq(p);
}
return dsq;
@@ -1938,6 +1944,7 @@ static bool scx_rq_online(struct rq *rq)
static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
int sticky_cpu)
{
+ bool bypassing = scx_rq_bypassing(rq);
struct task_struct **ddsp_taskp;
unsigned long qseq;
@@ -1955,7 +1962,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (!scx_rq_online(rq))
goto local;
- if (scx_rq_bypassing(rq))
+ if (bypassing)
goto global;
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2010,8 +2017,8 @@ local_norefill:
global:
touch_core_sched(rq, p); /* see the comment in local: */
- p->scx.slice = SCX_SLICE_DFL;
- dispatch_enqueue(&scx_dsq_global, p, enq_flags);
+ p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL;
+ dispatch_enqueue(find_global_dsq(p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2357,6 +2364,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
}
}
#else /* CONFIG_SMP */
+static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
@@ -2396,6 +2404,13 @@ retry:
return false;
}
+static bool consume_global_dsq(struct rq *rq)
+{
+ int node = cpu_to_node(cpu_of(rq));
+
+ return consume_dispatch_q(rq, global_dsqs[node]);
+}
+
/**
* dispatch_to_local_dsq - Dispatch a task to a local dsq
* @rq: current rq which is locked
@@ -2429,7 +2444,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
#ifdef CONFIG_SMP
if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
- dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+ dispatch_enqueue(find_global_dsq(p), p,
+ enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -2629,7 +2645,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_dispatch_q(rq, &scx_dsq_global))
+ if (consume_global_dsq(rq))
goto has_tasks;
if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq))
@@ -2654,7 +2670,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (rq->scx.local_dsq.nr)
goto has_tasks;
- if (consume_dispatch_q(rq, &scx_dsq_global))
+ if (consume_global_dsq(rq))
goto has_tasks;
/*
@@ -3058,22 +3074,13 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
* there is an idle core elsewhere on the system.
*/
cpu = smp_processor_id();
- if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 &&
+ if ((wake_flags & SCX_WAKE_SYNC) &&
!cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) &&
cpu_rq(cpu)->scx.local_dsq.nr == 0) {
if (cpumask_test_cpu(cpu, p->cpus_ptr))
goto cpu_found;
}
- if (p->nr_cpus_allowed == 1) {
- if (test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- } else {
- return prev_cpu;
- }
- }
-
/*
* If CPU has SMT, any wholly idle CPU is likely a better pick than
* partially idle @prev_cpu.
@@ -3550,7 +3557,7 @@ int scx_fork(struct task_struct *p)
{
percpu_rwsem_assert_held(&scx_fork_rwsem);
- if (scx_enabled())
+ if (scx_ops_init_task_enabled)
return scx_ops_init_task(p, task_group(p), true);
else
return 0;
@@ -3558,7 +3565,7 @@ int scx_fork(struct task_struct *p)
void scx_post_fork(struct task_struct *p)
{
- if (scx_enabled()) {
+ if (scx_ops_init_task_enabled) {
scx_set_task_state(p, SCX_TASK_READY);
/*
@@ -3690,6 +3697,7 @@ bool scx_can_stop_tick(struct rq *rq)
#ifdef CONFIG_EXT_GROUP_SCHED
DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
+static bool scx_cgroup_enabled;
static bool cgroup_warned_missing_weight;
static bool cgroup_warned_missing_idle;
@@ -3709,8 +3717,7 @@ static void scx_cgroup_warn_missing_weight(struct task_group *tg)
static void scx_cgroup_warn_missing_idle(struct task_group *tg)
{
- if (scx_ops_enable_state() == SCX_OPS_DISABLED ||
- cgroup_warned_missing_idle)
+ if (!scx_cgroup_enabled || cgroup_warned_missing_idle)
return;
if (!tg->idle)
@@ -3731,15 +3738,18 @@ int scx_tg_online(struct task_group *tg)
scx_cgroup_warn_missing_weight(tg);
- if (SCX_HAS_OP(cgroup_init)) {
- struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
+ if (scx_cgroup_enabled) {
+ if (SCX_HAS_OP(cgroup_init)) {
+ struct scx_cgroup_init_args args =
+ { .weight = tg->scx_weight };
- ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
- tg->css.cgroup, &args);
- if (!ret)
+ ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
+ tg->css.cgroup, &args);
+ if (ret)
+ ret = ops_sanitize_err("cgroup_init", ret);
+ }
+ if (ret == 0)
tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
- else
- ret = ops_sanitize_err("cgroup_init", ret);
} else {
tg->scx_flags |= SCX_TG_ONLINE;
}
@@ -3770,7 +3780,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
/* released in scx_finish/cancel_attach() */
percpu_down_read(&scx_cgroup_rwsem);
- if (!scx_enabled())
+ if (!scx_cgroup_enabled)
return 0;
cgroup_taskset_for_each(p, css, tset) {
@@ -3813,7 +3823,7 @@ err:
void scx_move_task(struct task_struct *p)
{
- if (!scx_enabled())
+ if (!scx_cgroup_enabled)
return;
/*
@@ -3849,7 +3859,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct task_struct *p;
- if (!scx_enabled())
+ if (!scx_cgroup_enabled)
goto out_unlock;
cgroup_taskset_for_each(p, css, tset) {
@@ -3866,7 +3876,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
percpu_down_read(&scx_cgroup_rwsem);
- if (tg->scx_weight != weight) {
+ if (scx_cgroup_enabled && tg->scx_weight != weight) {
if (SCX_HAS_OP(cgroup_set_weight))
SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
tg_cgrp(tg), weight);
@@ -4038,6 +4048,9 @@ static void scx_cgroup_exit(void)
percpu_rwsem_assert_held(&scx_cgroup_rwsem);
+ WARN_ON_ONCE(!scx_cgroup_enabled);
+ scx_cgroup_enabled = false;
+
/*
* scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
* cgroups and exit all the inited ones, all online cgroups are exited.
@@ -4113,6 +4126,9 @@ static int scx_cgroup_init(void)
}
rcu_read_unlock();
+ WARN_ON_ONCE(scx_cgroup_enabled);
+ scx_cgroup_enabled = true;
+
return 0;
}
@@ -4431,19 +4447,23 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
WRITE_ONCE(scx_switching_all, false);
/*
- * Avoid racing against fork and cgroup changes. See scx_ops_enable()
- * for explanation on the locking order.
+ * Shut down cgroup support before tasks so that the cgroup attach path
+ * doesn't race against scx_ops_exit_task().
*/
- percpu_down_write(&scx_fork_rwsem);
- cpus_read_lock();
scx_cgroup_lock();
+ scx_cgroup_exit();
+ scx_cgroup_unlock();
- spin_lock_irq(&scx_tasks_lock);
- scx_task_iter_init(&sti);
/*
* The BPF scheduler is going away. All tasks including %TASK_DEAD ones
* must be switched out and exited synchronously.
*/
+ percpu_down_write(&scx_fork_rwsem);
+
+ scx_ops_init_task_enabled = false;
+
+ spin_lock_irq(&scx_tasks_lock);
+ scx_task_iter_init(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
const struct sched_class *old_class = p->sched_class;
struct sched_enq_and_set_ctx ctx;
@@ -4461,23 +4481,18 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
}
scx_task_iter_exit(&sti);
spin_unlock_irq(&scx_tasks_lock);
+ percpu_up_write(&scx_fork_rwsem);
/* no task is on scx, turn off all the switches and flush in-progress calls */
- static_branch_disable_cpuslocked(&__scx_ops_enabled);
+ static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
- static_branch_disable_cpuslocked(&scx_has_op[i]);
- static_branch_disable_cpuslocked(&scx_ops_enq_last);
- static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
- static_branch_disable_cpuslocked(&scx_ops_cpu_preempt);
- static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
+ static_branch_disable(&scx_has_op[i]);
+ static_branch_disable(&scx_ops_enq_last);
+ static_branch_disable(&scx_ops_enq_exiting);
+ static_branch_disable(&scx_ops_cpu_preempt);
+ static_branch_disable(&scx_builtin_idle_enabled);
synchronize_rcu();
- scx_cgroup_exit();
-
- scx_cgroup_unlock();
- cpus_read_unlock();
- percpu_up_write(&scx_fork_rwsem);
-
if (ei->kind >= SCX_EXIT_ERROR) {
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
scx_ops.name, ei->reason);
@@ -4929,7 +4944,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
- int i, cpu, ret;
+ int i, cpu, node, ret;
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
cpu_possible_mask)) {
@@ -4948,6 +4963,34 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
}
}
+ if (!global_dsqs) {
+ struct scx_dispatch_q **dsqs;
+
+ dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL);
+ if (!dsqs) {
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ for_each_node_state(node, N_POSSIBLE) {
+ struct scx_dispatch_q *dsq;
+
+ dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
+ if (!dsq) {
+ for_each_node_state(node, N_POSSIBLE)
+ kfree(dsqs[node]);
+ kfree(dsqs);
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ init_dsq(dsq, SCX_DSQ_GLOBAL);
+ dsqs[node] = dsq;
+ }
+
+ global_dsqs = dsqs;
+ }
+
if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
ret = -EBUSY;
goto err_unlock;
@@ -4971,12 +5014,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
}
/*
- * Set scx_ops, transition to PREPPING and clear exit info to arm the
+ * Set scx_ops, transition to ENABLING and clear exit info to arm the
* disable path. Failure triggers full disabling from here on.
*/
scx_ops = *ops;
- WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) !=
+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) !=
SCX_OPS_DISABLED);
atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
@@ -4997,7 +5040,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
if (ret) {
ret = ops_sanitize_err("init", ret);
- goto err_disable_unlock_cpus;
+ cpus_read_unlock();
+ goto err_disable;
}
}
@@ -5005,6 +5049,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (((void (**)(void))ops)[i])
static_branch_enable_cpuslocked(&scx_has_op[i]);
+ check_hotplug_seq(ops);
cpus_read_unlock();
ret = validate_ops(ops);
@@ -5032,57 +5077,40 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_watchdog_timeout / 2);
/*
- * Lock out forks, cgroup on/offlining and moves before opening the
- * floodgate so that they don't wander into the operations prematurely.
- *
- * We don't need to keep the CPUs stable but static_branch_*() requires
- * cpus_read_lock() and scx_cgroup_rwsem must nest inside
- * cpu_hotplug_lock because of the following dependency chain:
- *
- * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem
- *
- * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use
- * static_branch_*_cpuslocked().
- *
- * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the
- * following dependency chain:
- *
- * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock
+ * Once __scx_ops_enabled is set, %current can be switched to SCX
+ * anytime. This can lead to stalls as some BPF schedulers (e.g.
+ * userspace scheduling) may not function correctly before all tasks are
+ * switched. Init in bypass mode to guarantee forward progress.
*/
- percpu_down_write(&scx_fork_rwsem);
- cpus_read_lock();
- scx_cgroup_lock();
-
- check_hotplug_seq(ops);
+ scx_ops_bypass(true);
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i])
- static_branch_enable_cpuslocked(&scx_has_op[i]);
+ static_branch_enable(&scx_has_op[i]);
if (ops->flags & SCX_OPS_ENQ_LAST)
- static_branch_enable_cpuslocked(&scx_ops_enq_last);
+ static_branch_enable(&scx_ops_enq_last);
if (ops->flags & SCX_OPS_ENQ_EXITING)
- static_branch_enable_cpuslocked(&scx_ops_enq_exiting);
+ static_branch_enable(&scx_ops_enq_exiting);
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
- static_branch_enable_cpuslocked(&scx_ops_cpu_preempt);
+ static_branch_enable(&scx_ops_cpu_preempt);
if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
reset_idle_masks();
- static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
+ static_branch_enable(&scx_builtin_idle_enabled);
} else {
- static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
+ static_branch_disable(&scx_builtin_idle_enabled);
}
/*
- * All cgroups should be initialized before letting in tasks. cgroup
- * on/offlining and task migrations are already locked out.
+ * Lock out forks, cgroup on/offlining and moves before opening the
+ * floodgate so that they don't wander into the operations prematurely.
*/
- ret = scx_cgroup_init();
- if (ret)
- goto err_disable_unlock_all;
+ percpu_down_write(&scx_fork_rwsem);
- static_branch_enable_cpuslocked(&__scx_ops_enabled);
+ WARN_ON_ONCE(scx_ops_init_task_enabled);
+ scx_ops_init_task_enabled = true;
/*
* Enable ops for every task. Fork is excluded by scx_fork_rwsem
@@ -5090,9 +5118,19 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* leaving as sched_ext_free() can handle both prepped and enabled
* tasks. Prep all tasks first and then enable them with preemption
* disabled.
+ *
+ * All cgroups should be initialized before scx_ops_init_task() so that
+ * the BPF scheduler can reliably track each task's cgroup membership
+ * from scx_ops_init_task(). Lock out cgroup on/offlining and task
+ * migrations while tasks are being initialized so that
+ * scx_cgroup_can_attach() never sees uninitialized tasks.
*/
- spin_lock_irq(&scx_tasks_lock);
+ scx_cgroup_lock();
+ ret = scx_cgroup_init();
+ if (ret)
+ goto err_disable_unlock_all;
+ spin_lock_irq(&scx_tasks_lock);
scx_task_iter_init(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
/*
@@ -5117,43 +5155,30 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_disable_unlock_all;
}
+ scx_set_task_state(p, SCX_TASK_READY);
+
put_task_struct(p);
spin_lock_irq(&scx_tasks_lock);
}
scx_task_iter_exit(&sti);
+ spin_unlock_irq(&scx_tasks_lock);
+ scx_cgroup_unlock();
+ percpu_up_write(&scx_fork_rwsem);
/*
- * All tasks are prepped but are still ops-disabled. Ensure that
- * %current can't be scheduled out and switch everyone.
- * preempt_disable() is necessary because we can't guarantee that
- * %current won't be starved if scheduled out while switching.
- */
- preempt_disable();
-
- /*
- * From here on, the disable path must assume that tasks have ops
- * enabled and need to be recovered.
- *
- * Transition to ENABLING fails iff the BPF scheduler has already
- * triggered scx_bpf_error(). Returning an error code here would lose
- * the recorded error information. Exit indicating success so that the
- * error is notified through ops.exit() with all the details.
+ * All tasks are READY. It's safe to turn on scx_enabled() and switch
+ * all eligible tasks.
*/
- if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) {
- preempt_enable();
- spin_unlock_irq(&scx_tasks_lock);
- WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
- ret = 0;
- goto err_disable_unlock_all;
- }
+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
+ static_branch_enable(&__scx_ops_enabled);
/*
- * We're fully committed and can't fail. The PREPPED -> ENABLED
+ * We're fully committed and can't fail. The task READY -> ENABLED
* transitions here are synchronized against sched_ext_free() through
* scx_tasks_lock.
*/
- WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
-
+ percpu_down_write(&scx_fork_rwsem);
+ spin_lock_irq(&scx_tasks_lock);
scx_task_iter_init(&sti);
while ((p = scx_task_iter_next_locked(&sti))) {
const struct sched_class *old_class = p->sched_class;
@@ -5161,7 +5186,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
- scx_set_task_state(p, SCX_TASK_READY);
__setscheduler_prio(p, p->prio);
check_class_changing(task_rq(p), p, old_class);
@@ -5170,14 +5194,16 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
check_class_changed(task_rq(p), p, old_class, p->prio);
}
scx_task_iter_exit(&sti);
-
spin_unlock_irq(&scx_tasks_lock);
- preempt_enable();
- scx_cgroup_unlock();
- cpus_read_unlock();
percpu_up_write(&scx_fork_rwsem);
- /* see above ENABLING transition for the explanation on exiting with 0 */
+ scx_ops_bypass(false);
+
+ /*
+ * Returning an error code here would lose the recorded error
+ * information. Exit indicating success so that the error is notified
+ * through ops.exit() with all the details.
+ */
if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
ret = 0;
@@ -5212,8 +5238,7 @@ err_unlock:
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
-err_disable_unlock_cpus:
- cpus_read_unlock();
+ scx_ops_bypass(false);
err_disable:
mutex_unlock(&scx_ops_enable_mutex);
/* must be fully disabled before returning */
@@ -5782,7 +5807,6 @@ void __init init_sched_ext_class(void)
SCX_TG_ONLINE);
BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
- init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
#ifdef CONFIG_SMP
BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
@@ -6058,7 +6082,7 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit,
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
- dst_dsq = &scx_dsq_global;
+ dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
} else {
@@ -6175,7 +6199,7 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
flush_dispatch_buf(dspc->rq);
- dsq = find_non_local_dsq(dsq_id);
+ dsq = find_user_dsq(dsq_id);
if (unlikely(!dsq)) {
scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
return false;
@@ -6496,7 +6520,7 @@ __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
goto out;
}
} else {
- dsq = find_non_local_dsq(dsq_id);
+ dsq = find_user_dsq(dsq_id);
if (dsq) {
ret = READ_ONCE(dsq->nr);
goto out;
@@ -6545,7 +6569,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
if (flags & ~__SCX_DSQ_ITER_USER_FLAGS)
return -EINVAL;
- kit->dsq = find_non_local_dsq(dsq_id);
+ kit->dsq = find_user_dsq(dsq_id);
if (!kit->dsq)
return -ENOENT;
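
The central sched_ext change above replaces the single global dispatch queue with one queue per NUMA node, selected through cpu_to_node(task_cpu(p)), so that all CPUs no longer contend on (or live-lock around) a single queue in bypass mode. A toy user-space sketch of that per-node sharding; struct queue, node_of() and the 4-CPUs-per-node mapping are invented for the example:

#include <stdio.h>
#include <stdlib.h>

#define NR_NODES 2

struct queue {
	int node;
	/* ... lock, list head, counters ... */
};

static struct queue *global_queues[NR_NODES];

static int node_of(int cpu)
{
	return cpu / 4;		/* pretend 4 CPUs per node */
}

static struct queue *find_global_queue(int cpu)
{
	return global_queues[node_of(cpu)];
}

int main(void)
{
	for (int node = 0; node < NR_NODES; node++) {
		global_queues[node] = calloc(1, sizeof(struct queue));
		if (!global_queues[node])
			return 1;
		global_queues[node]->node = node;
	}

	printf("cpu 1 -> node %d, cpu 5 -> node %d\n",
	       find_global_queue(1)->node, find_global_queue(5)->node);
	return 0;
}
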
diff --git a/kernel/signal.c b/kernel/signal.c
index 6e57036f947f..4344860ffcac 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2888,8 +2888,6 @@ relock:
current->flags |= PF_SIGNALED;
if (sig_kernel_coredump(signr)) {
- int ret;
-
if (print_fatal_signals)
print_fatal_signal(signr);
proc_coredump_connector(current);
@@ -2901,24 +2899,7 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- ret = do_coredump(&ksig->info);
- if (ret)
- coredump_report_failure("coredump has not been created, error %d",
- ret);
- else if (!IS_ENABLED(CONFIG_COREDUMP)) {
- /*
- * Coredumps are not available, can't fail collecting
- * the coredump.
- *
- * Leave a note though that the coredump is going to be
- * not created. This is not an error or a warning as disabling
- * support in the kernel for coredumps isn't commonplace, and
- * the user must've built the kernel with the custom config so
- * let them know all works as desired.
- */
- coredump_report("no coredump collected as "
- "that is disabled in the kernel configuration");
- }
+ do_coredump(&ksig->info);
}
/*
diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c
index 639397b5491c..5259cda486d0 100644
--- a/kernel/static_call_inline.c
+++ b/kernel/static_call_inline.c
@@ -411,6 +411,17 @@ static void static_call_del_module(struct module *mod)
for (site = start; site < stop; site++) {
key = static_call_key(site);
+
+ /*
+ * If the key was not updated due to a memory allocation
+ * failure in __static_call_init() then treating key::sites
+ * as key::mods in the code below would cause random memory
+ * access and #GP. In that case all subsequent sites have
+ * not been touched either, so stop iterating.
+ */
+ if (!static_call_key_has_mods(key))
+ break;
+
if (key == prev_key)
continue;
@@ -442,7 +453,7 @@ static int static_call_module_notify(struct notifier_block *nb,
case MODULE_STATE_COMING:
ret = static_call_add_module(mod);
if (ret) {
- WARN(1, "Failed to allocate memory for static calls");
+ pr_warn("Failed to allocate memory for static calls\n");
static_call_del_module(mod);
}
break;
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 4782edcbe7b9..c2f3d0c490d5 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -168,7 +168,6 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
static const struct file_operations posix_clock_file_operations = {
.owner = THIS_MODULE,
- .llseek = no_llseek,
.read = posix_clock_read,
.poll = posix_clock_poll,
.unlocked_ioctl = posix_clock_ioctl,
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index df0745a42a3f..dc819aec43e8 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -306,7 +306,6 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
static const struct file_operations interface_enable_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = monitor_enable_write_data,
.read = monitor_enable_read_data,
};
@@ -329,7 +328,6 @@ static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf,
static const struct file_operations interface_desc_fops = {
.open = simple_open,
- .llseek = no_llseek,
.read = monitor_desc_read_data,
};
@@ -674,7 +672,6 @@ static ssize_t monitoring_on_write_data(struct file *filp, const char __user *us
static const struct file_operations monitoring_on_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = monitoring_on_write_data,
.read = monitoring_on_read_data,
};
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 6aae106695b6..7b49cbe388d4 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -426,7 +426,6 @@ static ssize_t reacting_on_write_data(struct file *filp, const char __user *user
static const struct file_operations reacting_on_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = reacting_on_write_data,
.read = reacting_on_read_data,
};
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b4f348b4653f..c01375adc471 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7557,7 +7557,6 @@ static const struct file_operations tracing_pipe_fops = {
.read = tracing_read_pipe,
.splice_read = tracing_splice_read_pipe,
.release = tracing_release_pipe,
- .llseek = no_llseek,
};
static const struct file_operations tracing_entries_fops = {
@@ -7636,7 +7635,6 @@ static const struct file_operations snapshot_raw_fops = {
.read = tracing_buffers_read,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
- .llseek = no_llseek,
};
#endif /* CONFIG_TRACER_SNAPSHOT */
@@ -8466,7 +8464,6 @@ static const struct file_operations tracing_buffers_fops = {
.flush = tracing_buffers_flush,
.splice_read = tracing_buffers_splice_read,
.unlocked_ioctl = tracing_buffers_ioctl,
- .llseek = no_llseek,
.mmap = tracing_buffers_mmap,
};