From be68d63a139bd4a0eae44d06234eaff8c07d207c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:36 -0400 Subject: ring-buffer: Add ring_buffer_alloc_range() In preparation to allowing the trace ring buffer to be allocated in a range of memory that is persistent across reboots, add ring_buffer_alloc_range(). It takes a contiguous range of memory and will split it up evenly for the per CPU ring buffers. If there's not enough memory to handle all CPUs with the minimum size, it will fail to allocate the ring buffer. Link: https://lkml.kernel.org/r/20240612232025.363998725@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 96d2140b471e..a50b0223b1d3 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -89,6 +89,11 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer, struct trace_buffer * __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key); +struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags, + int order, unsigned long start, + unsigned long range_size, + struct lock_class_key *key); + /* * Because the ring buffer is generic, if other users of the ring buffer get * traced by ftrace, it can produce lockdep warnings. We need to keep each @@ -100,6 +105,18 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) +/* + * Because the ring buffer is generic, if other users of the ring buffer get + * traced by ftrace, it can produce lockdep warnings. We need to keep each + * ring buffer's lock class separate. + */ +#define ring_buffer_alloc_range(size, flags, order, start, range_size) \ +({ \ + static struct lock_class_key __key; \ + __ring_buffer_alloc_range((size), (flags), (order), (start), \ + (range_size), &__key); \ +}) + typedef bool (*ring_buffer_cond_fn)(void *data); int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, ring_buffer_cond_fn cond, void *data); -- cgit v1.2.3 From 7a1d1e4b9639ff08b2f42605c2950ae1ba4a43bf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Wed, 12 Jun 2024 19:19:44 -0400 Subject: tracing/ring-buffer: Add last_boot_info file to boot instance If an instance is mapped to memory on boot up, create a new file called "last_boot_info" that will hold information that can be used to properly parse the raw data in the ring buffer. It will export the delta of the addresses for text and data from what it was from the last boot. It does not expose actually addresses (unless you knew what the actual address was from the last boot). The output will look like: # cat last_boot_info text delta: -268435456 data delta: -268435456 The text and data are kept separate in case they are ever made different. Link: https://lkml.kernel.org/r/20240612232026.658680738@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Vincent Donnefort Cc: Joel Fernandes Cc: Daniel Bristot de Oliveira Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vineeth Pillai Cc: Youssef Esmat Cc: Beau Belgrave Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: "Paul E. McKenney" Cc: David Howells Cc: Mike Rapoport Cc: Dave Hansen Cc: Tony Luck Cc: Guenter Roeck Cc: Ross Zwisler Cc: Kees Cook Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index a50b0223b1d3..55de3798a9b9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -94,6 +94,9 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag unsigned long range_size, struct lock_class_key *key); +bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text, + long *data); + /* * Because the ring buffer is generic, if other users of the ring buffer get * traced by ftrace, it can produce lockdep warnings. We need to keep each -- cgit v1.2.3 From 304b3f2bc07ba7c74a1ebc7917a8f0549e6b8d54 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:16 -1000 Subject: sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() A new BPF extensible sched_class will need more control over the forking process. It wants to be able to fail from sched_cgroup_fork() after the new task's sched_task_group is initialized so that the loaded BPF program can prepare the task with its cgroup association is established and reject fork if e.g. allocation fails. Allow sched_cgroup_fork() to fail by making it return int instead of void and adding sched_cancel_fork() to undo sched_fork() in the error path. sched_cgroup_fork() doesn't fail yet and this patch shouldn't cause any behavior changes. v2: Patch description updated to detail the expected use. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/task.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index d362aacf9f89..4df2f9055587 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_cancel_fork(struct task_struct *p); extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); -- cgit v1.2.3 From 4f9c7ca851044273df5d67e00ca0b4d0476a48f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:16 -1000 Subject: sched: Factor out cgroup weight conversion functions Factor out sched_weight_from/to_cgroup() which convert between scheduler shares and cgroup weight. No functional change. The factored out functions will be used by a new BPF extensible sched_class so that the weights can be exposed to the BPF programs in a way which is consistent cgroup weights and easier to interpret. The weight conversions will be used regardless of cgroup usage. It's just borrowing the cgroup weight range as it's more intuitive. CGROUP_WEIGHT_MIN/DFL/MAX constants are moved outside CONFIG_CGROUPS so that the conversion helpers can always be defined. v2: The helpers are now defined regardless of COFNIG_CGROUPS. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2150ca60394b..3cdaec701600 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -29,8 +29,6 @@ struct kernel_clone_args; -#ifdef CONFIG_CGROUPS - /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of @@ -40,6 +38,8 @@ struct kernel_clone_args; #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 +#ifdef CONFIG_CGROUPS + enum { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ -- cgit v1.2.3 From a7a9fc549293168c1edbc8433d5d4fbbe1171630 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:17 -1000 Subject: sched_ext: Add boilerplate for extensible scheduler class This adds dummy implementations of sched_ext interfaces which interact with the scheduler core and hook them in the correct places. As they're all dummies, this doesn't cause any behavior changes. This is split out to help reviewing. v2: balance_scx_on_up() dropped. This will be handled in sched_ext proper. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 include/linux/sched/ext.h (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h new file mode 100644 index 000000000000..a05dfcf533b0 --- /dev/null +++ b/include/linux/sched/ext.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_EXT_H +#define _LINUX_SCHED_EXT_H + +#ifdef CONFIG_SCHED_CLASS_EXT +#error "NOT IMPLEMENTED YET" +#else /* !CONFIG_SCHED_CLASS_EXT */ + +static inline void sched_ext_free(struct task_struct *p) {} + +#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* _LINUX_SCHED_EXT_H */ -- cgit v1.2.3 From f0e1a0643a59bf1f922fa209cec86a170b784f3f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:17 -1000 Subject: sched_ext: Implement BPF extensible scheduler class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a new scheduler class sched_ext (SCX), which allows scheduling policies to be implemented as BPF programs to achieve the following: 1. Ease of experimentation and exploration: Enabling rapid iteration of new scheduling policies. 2. Customization: Building application-specific schedulers which implement policies that are not applicable to general-purpose schedulers. 3. Rapid scheduler deployments: Non-disruptive swap outs of scheduling policies in production environments. sched_ext leverages BPF’s struct_ops feature to define a structure which exports function callbacks and flags to BPF programs that wish to implement scheduling policies. The struct_ops structure exported by sched_ext is struct sched_ext_ops, and is conceptually similar to struct sched_class. The role of sched_ext is to map the complex sched_class callbacks to the more simple and ergonomic struct sched_ext_ops callbacks. For more detailed discussion on the motivations and overview, please refer to the cover letter. Later patches will also add several example schedulers and documentation. This patch implements the minimum core framework to enable implementation of BPF schedulers. Subsequent patches will gradually add functionalities including safety guarantee mechanisms, nohz and cgroup support. include/linux/sched/ext.h defines struct sched_ext_ops. With the comment on top, each operation should be self-explanatory. The followings are worth noting: - Both "sched_ext" and its shorthand "scx" are used. If the identifier already has "sched" in it, "ext" is used; otherwise, "scx". - In sched_ext_ops, only .name is mandatory. Every operation is optional and if omitted a simple but functional default behavior is provided. - A new policy constant SCHED_EXT is added and a task can select sched_ext by invoking sched_setscheduler(2) with the new policy constant. However, if the BPF scheduler is not loaded, SCHED_EXT is the same as SCHED_NORMAL and the task is scheduled by CFS. When the BPF scheduler is loaded, all tasks which have the SCHED_EXT policy are switched to sched_ext. - To bridge the workflow imbalance between the scheduler core and sched_ext_ops callbacks, sched_ext uses simple FIFOs called dispatch queues (dsq's). By default, there is one global dsq (SCX_DSQ_GLOBAL), and one local per-CPU dsq (SCX_DSQ_LOCAL). SCX_DSQ_GLOBAL is provided for convenience and need not be used by a scheduler that doesn't require it. SCX_DSQ_LOCAL is the per-CPU FIFO that sched_ext pulls from when putting the next task on the CPU. The BPF scheduler can manage an arbitrary number of dsq's using scx_bpf_create_dsq() and scx_bpf_destroy_dsq(). - sched_ext guarantees system integrity no matter what the BPF scheduler does. To enable this, each task's ownership is tracked through p->scx.ops_state and all tasks are put on scx_tasks list. The disable path can always recover and revert all tasks back to CFS. See p->scx.ops_state and scx_tasks. - A task is not tied to its rq while enqueued. This decouples CPU selection from queueing and allows sharing a scheduling queue across an arbitrary subset of CPUs. This adds some complexities as a task may need to be bounced between rq's right before it starts executing. See dispatch_to_local_dsq() and move_task_to_local_dsq(). - One complication that arises from the above weak association between task and rq is that synchronizing with dequeue() gets complicated as dequeue() may happen anytime while the task is enqueued and the dispatch path might need to release the rq lock to transfer the task. Solving this requires a bit of complexity. See the logic around p->scx.sticky_cpu and p->scx.ops_qseq. - Both enable and disable paths are a bit complicated. The enable path switches all tasks without blocking to avoid issues which can arise from partially switched states (e.g. the switching task itself being starved). The disable path can't trust the BPF scheduler at all, so it also has to guarantee forward progress without blocking. See scx_ops_enable() and scx_ops_disable_workfn(). - When sched_ext is disabled, static_branches are used to shut down the entry points from hot paths. v7: - scx_ops_bypass() was incorrectly and unnecessarily trying to grab scx_ops_enable_mutex which can lead to deadlocks in the disable path. Fixed. - Fixed TASK_DEAD handling bug in scx_ops_enable() path which could lead to use-after-free. - Consolidated per-cpu variable usages and other cleanups. v6: - SCX_NR_ONLINE_OPS replaced with SCX_OPI_*_BEGIN/END so that multiple groups can be expressed. Later CPU hotplug operations are put into their own group. - SCX_OPS_DISABLING state is replaced with the new bypass mechanism which allows temporarily putting the system into simple FIFO scheduling mode bypassing the BPF scheduler. In addition to the shut down path, this will also be used to isolate the BPF scheduler across PM events. Enabling and disabling the bypass mode requires iterating all runnable tasks. rq->scx.runnable_list addition is moved from the later watchdog patch. - ops.prep_enable() is replaced with ops.init_task() and ops.enable/disable() are now called whenever the task enters and leaves sched_ext instead of when the task becomes schedulable on sched_ext and stops being so. A new operation - ops.exit_task() - is called when the task stops being schedulable on sched_ext. - scx_bpf_dispatch() can now be called from ops.select_cpu() too. This removes the need for communicating local dispatch decision made by ops.select_cpu() to ops.enqueue() via per-task storage. SCX_KF_SELECT_CPU is added to support the change. - SCX_TASK_ENQ_LOCAL which told the BPF scheudler that scx_select_cpu_dfl() wants the task to be dispatched to the local DSQ was removed. Instead, scx_bpf_select_cpu_dfl() now dispatches directly if it finds a suitable idle CPU. If such behavior is not desired, users can use scx_bpf_select_cpu_dfl() which returns the verdict in a bool out param. - scx_select_cpu_dfl() was mishandling WAKE_SYNC and could end up queueing many tasks on a local DSQ which makes tasks to execute in order while other CPUs stay idle which made some hackbench numbers really bad. Fixed. - The current state of sched_ext can now be monitored through files under /sys/sched_ext instead of /sys/kernel/debug/sched/ext. This is to enable monitoring on kernels which don't enable debugfs. - sched_ext wasn't telling BPF that ops.dispatch()'s @prev argument may be NULL and a BPF scheduler which derefs the pointer without checking could crash the kernel. Tell BPF. This is currently a bit ugly. A better way to annotate this is expected in the future. - scx_exit_info updated to carry pointers to message buffers instead of embedding them directly. This decouples buffer sizes from API so that they can be changed without breaking compatibility. - exit_code added to scx_exit_info. This is used to indicate different exit conditions on non-error exits and will be used to handle e.g. CPU hotplugs. - The patch "sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext" is folded in and the interface is changed so that partial switching is indicated with a new ops flag %SCX_OPS_SWITCH_PARTIAL. This makes scx_bpf_switch_all() unnecessasry and in turn SCX_KF_INIT. ops.init() is now called with SCX_KF_SLEEPABLE. - Code reorganized so that only the parts necessary to integrate with the rest of the kernel are in the header files. - Changes to reflect the BPF and other kernel changes including the addition of bpf_sched_ext_ops.cfi_stubs. v5: - To accommodate 32bit configs, p->scx.ops_state is now atomic_long_t instead of atomic64_t and scx_dsp_buf_ent.qseq which uses load_acquire/store_release is now unsigned long instead of u64. - Fix the bug where bpf_scx_btf_struct_access() was allowing write access to arbitrary fields. - Distinguish kfuncs which can be called from any sched_ext ops and from anywhere. e.g. scx_bpf_pick_idle_cpu() can now be called only from sched_ext ops. - Rename "type" to "kind" in scx_exit_info to make it easier to use on languages in which "type" is a reserved keyword. - Since cff9b2332ab7 ("kernel/sched: Modify initial boot task idle setup"), PF_IDLE is not set on idle tasks which haven't been online yet which made scx_task_iter_next_filtered() include those idle tasks in iterations leading to oopses. Update scx_task_iter_next_filtered() to directly test p->sched_class against idle_sched_class instead of using is_idle_task() which tests PF_IDLE. - Other updates to match upstream changes such as adding const to set_cpumask() param and renaming check_preempt_curr() to wakeup_preempt(). v4: - SCHED_CHANGE_BLOCK replaced with the previous sched_deq_and_put_task()/sched_enq_and_set_tsak() pair. This is because upstream is adaopting a different generic cleanup mechanism. Once that lands, the code will be adapted accordingly. - task_on_scx() used to test whether a task should be switched into SCX, which is confusing. Renamed to task_should_scx(). task_on_scx() now tests whether a task is currently on SCX. - scx_has_idle_cpus is barely used anymore and replaced with direct check on the idle cpumask. - SCX_PICK_IDLE_CORE added and scx_pick_idle_cpu() improved to prefer fully idle cores. - ops.enable() now sees up-to-date p->scx.weight value. - ttwu_queue path is disabled for tasks on SCX to avoid confusing BPF schedulers expecting ->select_cpu() call. - Use cpu_smt_mask() instead of topology_sibling_cpumask() like the rest of the scheduler. v3: - ops.set_weight() added to allow BPF schedulers to track weight changes without polling p->scx.weight. - move_task_to_local_dsq() was losing SCX-specific enq_flags when enqueueing the task on the target dsq because it goes through activate_task() which loses the upper 32bit of the flags. Carry the flags through rq->scx.extra_enq_flags. - scx_bpf_dispatch(), scx_bpf_pick_idle_cpu(), scx_bpf_task_running() and scx_bpf_task_cpu() now use the new KF_RCU instead of KF_TRUSTED_ARGS to make it easier for BPF schedulers to call them. - The kfunc helper access control mechanism implemented through sched_ext_entity.kf_mask is improved. Now SCX_CALL_OP*() is always used when invoking scx_ops operations. v2: - balance_scx_on_up() is dropped. Instead, on UP, balance_scx() is called from put_prev_taks_scx() and pick_next_task_scx() as necessary. To determine whether balance_scx() should be called from put_prev_task_scx(), SCX_TASK_DEQD_FOR_SLEEP flag is added. See the comment in put_prev_task_scx() for details. - sched_deq_and_put_task() / sched_enq_and_set_task() sequences replaced with SCHED_CHANGE_BLOCK(). - Unused all_dsqs list removed. This was a left-over from previous iterations. - p->scx.kf_mask is added to track and enforce which kfunc helpers are allowed. Also, init/exit sequences are updated to make some kfuncs always safe to call regardless of the current BPF scheduler state. Combined, this should make all the kfuncs safe. - BPF now supports sleepable struct_ops operations. Hacky workaround removed and operations and kfunc helpers are tagged appropriately. - BPF now supports bitmask / cpumask helpers. scx_bpf_get_idle_cpumask() and friends are added so that BPF schedulers can use the idle masks with the generic helpers. This replaces the hacky kfunc helpers added by a separate patch in V1. - CONFIG_SCHED_CLASS_EXT can no longer be enabled if SCHED_CORE is enabled. This restriction will be removed by a later patch which adds core-sched support. - Add MAINTAINERS entries and other misc changes. Signed-off-by: Tejun Heo Co-authored-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Andrea Righi --- include/linux/sched.h | 5 ++ include/linux/sched/ext.h | 141 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 145 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 90691d99027e..06beb8a6e0ca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -80,6 +80,8 @@ struct task_group; struct task_struct; struct user_event_mm; +#include + /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). @@ -802,6 +804,9 @@ struct task_struct { struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; +#ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; +#endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index a05dfcf533b0..c1530a7992cc 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -1,9 +1,148 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo + * Copyright (c) 2022 David Vernet + */ #ifndef _LINUX_SCHED_EXT_H #define _LINUX_SCHED_EXT_H #ifdef CONFIG_SCHED_CLASS_EXT -#error "NOT IMPLEMENTED YET" + +#include +#include + +enum scx_public_consts { + SCX_OPS_NAME_LEN = 128, + + SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ +}; + +/* + * DSQ (dispatch queue) IDs are 64bit of the format: + * + * Bits: [63] [62 .. 0] + * [ B] [ ID ] + * + * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs + * ID: 63 bit ID + * + * Built-in IDs: + * + * Bits: [63] [62] [61..32] [31 .. 0] + * [ 1] [ L] [ R ] [ V ] + * + * 1: 1 for built-in DSQs. + * L: 1 for LOCAL_ON DSQ IDs, 0 for others + * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. + */ +enum scx_dsq_id_flags { + SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, + SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, + + SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, + SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, + SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, + SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, +}; + +/* + * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the + * scheduler core and the BPF scheduler. See the documentation for more details. + */ +struct scx_dispatch_q { + raw_spinlock_t lock; + struct list_head list; /* tasks in dispatch order */ + u32 nr; + u64 id; + struct rhash_head hash_node; + struct llist_node free_node; + struct rcu_head rcu; +}; + +/* scx_entity.flags */ +enum scx_ent_flags { + SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ + SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ + SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + + SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + SCX_TASK_STATE_BITS = 2, + SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, + + SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ +}; + +/* scx_entity.flags & SCX_TASK_STATE_MASK */ +enum scx_task_state { + SCX_TASK_NONE, /* ops.init_task() not called yet */ + SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ + SCX_TASK_READY, /* fully initialized, but not in sched_ext */ + SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + + SCX_TASK_NR_STATES, +}; + +/* + * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from + * everywhere and the following bits track which kfunc sets are currently + * allowed for %current. This simple per-task tracking works because SCX ops + * nest in a limited way. BPF will likely implement a way to allow and disallow + * kfuncs depending on the calling context which will replace this manual + * mechanism. See scx_kf_allow(). + */ +enum scx_kf_mask { + SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ + /* all non-sleepables may be nested inside SLEEPABLE */ + SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ + /* ops.dequeue (in REST) may be nested inside DISPATCH */ + SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ + SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ + SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ + SCX_KF_REST = 1 << 5, /* other rq-locked operations */ + + __SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH | + SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, +}; + +/* + * The following is embedded in task_struct and contains all fields necessary + * for a task to be scheduled by SCX. + */ +struct sched_ext_entity { + struct scx_dispatch_q *dsq; + struct list_head dsq_node; + u32 flags; /* protected by rq lock */ + u32 weight; + s32 sticky_cpu; + s32 holding_cpu; + u32 kf_mask; /* see scx_kf_mask above */ + atomic_long_t ops_state; + + struct list_head runnable_node; /* rq->scx.runnable_list */ + + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; + + /* BPF scheduler modifiable fields */ + + /* + * Runtime budget in nsecs. This is usually set through + * scx_bpf_dispatch() but can also be modified directly by the BPF + * scheduler. Automatically decreased by SCX as the task executes. On + * depletion, a scheduling event is triggered. + */ + u64 slice; + + /* cold fields */ + /* must be the last field, see init_scx_entity() */ + struct list_head tasks_node; +}; + +void sched_ext_free(struct task_struct *p); + #else /* !CONFIG_SCHED_CLASS_EXT */ static inline void sched_ext_free(struct task_struct *p) {} -- cgit v1.2.3 From 8a010b81b3a50b033fc3cddc613517abda586cbe Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: sched_ext: Implement runnable task stall watchdog The most common and critical way that a BPF scheduler can misbehave is by failing to run runnable tasks for too long. This patch implements a watchdog. * All tasks record when they become runnable. * A watchdog work periodically scans all runnable tasks. If any task has stayed runnable for too long, the BPF scheduler is aborted. * scheduler_tick() monitors whether the watchdog itself is stuck. If so, the BPF scheduler is aborted. Because the watchdog only scans the tasks which are currently runnable and usually very infrequently, the overhead should be negligible. scx_qmap is updated so that it can be told to stall user and/or kernel tasks. A detected task stall looks like the following: sched_ext: BPF scheduler "qmap" errored, disabling sched_ext: runnable task stall (dbus-daemon[953] failed to run for 6.478s) scx_check_timeout_workfn+0x10e/0x1b0 process_one_work+0x287/0x560 worker_thread+0x234/0x420 kthread+0xe9/0x100 ret_from_fork+0x1f/0x30 A detected watchdog stall: sched_ext: BPF scheduler "qmap" errored, disabling sched_ext: runnable task stall (watchdog failed to check in for 5.001s) scheduler_tick+0x2eb/0x340 update_process_times+0x7a/0x90 tick_sched_timer+0xd8/0x130 __hrtimer_run_queues+0x178/0x3b0 hrtimer_interrupt+0xfc/0x390 __sysvec_apic_timer_interrupt+0xb7/0x2b0 sysvec_apic_timer_interrupt+0x90/0xb0 asm_sysvec_apic_timer_interrupt+0x1b/0x20 default_idle+0x14/0x20 arch_cpu_idle+0xf/0x20 default_idle_call+0x50/0x90 do_idle+0xe8/0x240 cpu_startup_entry+0x1d/0x20 kernel_init+0x0/0x190 start_kernel+0x0/0x392 start_kernel+0x324/0x392 x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x104/0x109 secondary_startup_64_no_verify+0xce/0xdb Note that this patch exposes scx_ops_error[_type]() in kernel/sched/ext.h to inline scx_notify_sched_tick(). v4: - While disabling, cancel_delayed_work_sync(&scx_watchdog_work) was being called before forward progress was guaranteed and thus could lead to system lockup. Relocated. - While enabling, it was comparing msecs against jiffies without conversion leading to spurious load failures on lower HZ kernels. Fixed. - runnable list management is now used by core bypass logic and moved to the patch implementing sched_ext core. v3: - bpf_scx_init_member() was incorrectly comparing ops->timeout_ms against SCX_WATCHDOG_MAX_TIMEOUT which is in jiffies without conversion leading to spurious load failures in lower HZ kernels. Fixed. v2: - Julia Lawall noticed that the watchdog code was mixing msecs and jiffies. Fix by using jiffies for everything. Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Julia Lawall --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index c1530a7992cc..96031252436f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -122,6 +122,7 @@ struct sched_ext_entity { atomic_long_t ops_state; struct list_head runnable_node; /* rq->scx.runnable_list */ + unsigned long runnable_at; u64 ddsp_dsq_id; u64 ddsp_enq_flags; -- cgit v1.2.3 From 7bb6f0810ecfb73a9d7a2ca56fb001e0201a6758 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT BPF schedulers might not want to schedule certain tasks - e.g. kernel threads. This patch adds p->scx.disallow which can be set by BPF schedulers in such cases. The field can be changed anytime and setting it in ops.prep_enable() guarantees that the task can never be scheduled by sched_ext. scx_qmap is updated with the -d option to disallow a specific PID: # echo $$ 1092 # grep -E '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 # grep -E '(policy)|(ext\.enabled)' /proc/self/sched policy : 7 ext.enabled : 0 Run "scx_qmap -p -d 1092" in another terminal. # cat /sys/kernel/sched_ext/nr_rejected 1 # grep -E '(policy)|(ext\.enabled)' /proc/self/sched policy : 0 ext.enabled : 0 # ./set-scx 1092 setparam failed for 1092 (Permission denied) - v4: Refreshed on top of tip:sched/core. - v3: Update description to reflect /sys/kernel/sched_ext interface change. - v2: Use atomic_long_t instead of atomic64_t for scx_kick_cpus_pnt_seqs to accommodate 32bit archs. Signed-off-by: Tejun Heo Suggested-by: Barret Rhoden Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 96031252436f..ea7c501ac819 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -137,6 +137,18 @@ struct sched_ext_entity { */ u64 slice; + /* + * If set, reject future sched_setscheduler(2) calls updating the policy + * to %SCHED_EXT with -%EACCES. + * + * If set from ops.init_task() and the task's policy is already + * %SCHED_EXT, which can happen while the BPF scheduler is being loaded + * or by inhering the parent's policy during fork, the task's policy is + * rejected and forcefully reverted to %SCHED_NORMAL. The number of + * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected. + */ + bool disallow; /* reject switching into SCX */ + /* cold fields */ /* must be the last field, see init_scx_entity() */ struct list_head tasks_node; -- cgit v1.2.3 From 1538e33995eaf3f315cbb5506019b9f913ed8555 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: sched_ext: Print sched_ext info when dumping stack It would be useful to see what the sched_ext scheduler state is, and what scheduler is running, when we're dumping a task's stack. This patch therefore adds a new print_scx_info() function that's called in the same context as print_worker_info() and print_stop_info(). An example dump follows. BUG: kernel NULL pointer dereference, address: 0000000000000999 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] PREEMPT SMP CPU: 13 PID: 2047 Comm: insmod Tainted: G O 6.6.0-work-10323-gb58d4cae8e99-dirty #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022 Sched_ext: qmap (enabled+all), task: runnable_at=-17ms RIP: 0010:init_module+0x9/0x1000 [test_module] ... v3: - scx_ops_enable_state_str[] definition moved to an earlier patch as it's now used by core implementation. - Convert jiffy delta to msecs using jiffies_to_msecs() instead of multiplying by (HZ / MSEC_PER_SEC). The conversion is implemented in jiffies_delta_msecs(). v2: - We are now using scx_ops_enable_state_str[] outside CONFIG_SCHED_DEBUG. Move it outside of CONFIG_SCHED_DEBUG and to the top. This was reported by Changwoo and Andrea. Signed-off-by: David Vernet Reported-by: Changwoo Min Reported-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index ea7c501ac819..85fb5dc725ef 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -155,10 +155,12 @@ struct sched_ext_entity { }; void sched_ext_free(struct task_struct *p); +void print_scx_info(const char *log_lvl, struct task_struct *p); #else /* !CONFIG_SCHED_CLASS_EXT */ static inline void sched_ext_free(struct task_struct *p) {} +static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} #endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* _LINUX_SCHED_EXT_H */ -- cgit v1.2.3 From 81aae789181b5850d77dfdf74d4b85c63f0705e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Implement scx_bpf_kick_cpu() and task preemption support It's often useful to wake up and/or trigger reschedule on other CPUs. This patch adds scx_bpf_kick_cpu() kfunc helper that BPF scheduler can call to kick the target CPU into the scheduling path. As a sched_ext task relinquishes its CPU only after its slice is depleted, this patch also adds SCX_KICK_PREEMPT and SCX_ENQ_PREEMPT which clears the slice of the target CPU's current task to guarantee that sched_ext's scheduling path runs on the CPU. If SCX_KICK_IDLE is specified, the target CPU is kicked iff the CPU is idle to guarantee that the target CPU will go through at least one full sched_ext scheduling cycle after the kicking. This can be used to wake up idle CPUs without incurring unnecessary overhead if it isn't currently idle. As a demonstration of how backward compatibility can be supported using BPF CO-RE, tools/sched_ext/include/scx/compat.bpf.h is added. It provides __COMPAT_scx_bpf_kick_cpu_IDLE() which uses SCX_KICK_IDLE if available or becomes a regular kicking otherwise. This allows schedulers to use the new SCX_KICK_IDLE while maintaining support for older kernels. The plan is to temporarily use compat helpers to ease API updates and drop them after a few kernel releases. v5: - SCX_KICK_IDLE added. Note that this also adds a compat mechanism for schedulers so that they can support kernels without SCX_KICK_IDLE. This is useful as a demonstration of how new feature flags can be added in a backward compatible way. - kick_cpus_irq_workfn() reimplemented so that it touches the pending cpumasks only as necessary to reduce kicking overhead on machines with a lot of CPUs. - tools/sched_ext/include/scx/compat.bpf.h added. v4: - Move example scheduler to its own patch. v3: - Make scx_example_central switch all tasks by default. - Convert to BPF inline iterators. v2: - Julia Lawall reported that scx_example_central can overflow the dispatch buffer and malfunction. As scheduling for other CPUs can't be handled by the automatic retry mechanism, fix by implementing an explicit overflow and retry handling. - Updated to use generic BPF cpumask helpers. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 85fb5dc725ef..3b2809b980ac 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -134,6 +134,10 @@ struct sched_ext_entity { * scx_bpf_dispatch() but can also be modified directly by the BPF * scheduler. Automatically decreased by SCX as the task executes. On * depletion, a scheduling event is triggered. + * + * This value is cleared to zero if the task is preempted by + * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the + * task ran. Use p->se.sum_exec_runtime instead. */ u64 slice; -- cgit v1.2.3 From 22a920209ab6aa4f8ec960ed81041643fddeaec6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Implement tickless support Allow BPF schedulers to indicate tickless operation by setting p->scx.slice to SCX_SLICE_INF. A CPU whose current task has infinte slice goes into tickless operation. scx_central is updated to use tickless operations for all tasks and instead use a BPF timer to expire slices. This also uses the SCX_ENQ_PREEMPT and task state tracking added by the previous patches. Currently, there is no way to pin the timer on the central CPU, so it may end up on one of the worker CPUs; however, outside of that, the worker CPUs can go tickless both while running sched_ext tasks and idling. With schbench running, scx_central shows: root@test ~# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 142024 656 664 449 Local timer interrupts LOC: 161663 663 665 449 Local timer interrupts Without it: root@test ~ [SIGINT]# grep ^LOC /proc/interrupts; sleep 10; grep ^LOC /proc/interrupts LOC: 188778 3142 3793 3993 Local timer interrupts LOC: 198993 5314 6323 6438 Local timer interrupts While scx_central itself is too barebone to be useful as a production scheduler, a more featureful central scheduler can be built using the same approach. Google's experience shows that such an approach can have significant benefits for certain applications such as VM hosting. v4: Allow operation even if BPF_F_TIMER_CPU_PIN is not available. v3: Pin the central scheduler's timer on the central_cpu using BPF_F_TIMER_CPU_PIN. v2: Convert to BPF inline iterators. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3b2809b980ac..6f1a4977e9f8 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -16,6 +16,7 @@ enum scx_public_consts { SCX_OPS_NAME_LEN = 128, SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; /* -- cgit v1.2.3 From 36454023f50b2aadd25b7f47c50b5edc0c59e1c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:19 -1000 Subject: sched_ext: Track tasks that are subjects of the in-flight SCX operation When some SCX operations are in flight, it is known that the subject task's rq lock is held throughout which makes it safe to access certain fields of the task - e.g. its current task_group. We want to add SCX kfunc helpers that can make use of this guarantee - e.g. to help determining the currently associated CPU cgroup from the task's current task_group. As it'd be dangerous call such a helper on a task which isn't rq lock protected, the helper should be able to verify the input task and reject accordingly. This patch adds sched_ext_entity.kf_tasks[] that track the tasks which are currently being operated on by a terminal SCX operation. The new SCX_CALL_OP_[2]TASK[_RET]() can be used when invoking SCX operations which take tasks as arguments and the scx_kf_allowed_on_arg_tasks() can be used by kfunc helpers to verify the input task status. Note that as sched_ext_entity.kf_tasks[] can't handle nesting, the tracking is currently only limited to terminal SCX operations. If needed in the future, this restriction can be removed by moving the tracking to the task side with a couple per-task counters. v2: Updated to reflect the addition of SCX_KF_SELECT_CPU. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- include/linux/sched/ext.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 6f1a4977e9f8..74341dbc6a19 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -106,6 +106,7 @@ enum scx_kf_mask { __SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH | SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, + __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; /* @@ -120,6 +121,7 @@ struct sched_ext_entity { s32 sticky_cpu; s32 holding_cpu; u32 kf_mask; /* see scx_kf_mask above */ + struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ atomic_long_t ops_state; struct list_head runnable_node; /* rq->scx.runnable_list */ -- cgit v1.2.3 From 245254f7081dbe1c8da54675d0e4ddbe74cee61b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 18 Jun 2024 10:09:20 -1000 Subject: sched_ext: Implement sched_ext_ops.cpu_acquire/release() Scheduler classes are strictly ordered and when a higher priority class has tasks to run, the lower priority ones lose access to the CPU. Being able to monitor and act on these events are necessary for use cases includling strict core-scheduling and latency management. This patch adds two operations ops.cpu_acquire() and .cpu_release(). The former is invoked when a CPU becomes available to the BPF scheduler and the opposite for the latter. This patch also implements scx_bpf_reenqueue_local() which can be called from .cpu_release() to trigger requeueing of all tasks in the local dsq of the CPU so that the tasks can be reassigned to other available CPUs. scx_pair is updated to use .cpu_acquire/release() along with %SCX_KICK_WAIT to make the pair scheduling guarantee strict even when a CPU is preempted by a higher priority scheduler class. scx_qmap is updated to use .cpu_acquire/release() to empty the local dsq of a preempted CPU. A similar approach can be adopted by BPF schedulers that want to have a tight control over latency. v4: Use the new SCX_KICK_IDLE to wake up a CPU after re-enqueueing. v3: Drop the const qualifier from scx_cpu_release_args.task. BPF enforces access control through the verifier, so the qualifier isn't actually operative and only gets in the way when interacting with various helpers. v2: Add p->scx.kf_mask annotation to allow calling scx_bpf_reenqueue_local() from ops.cpu_release() nested inside ops.init() and other sleepable operations. Signed-off-by: David Vernet Reviewed-by: Tejun Heo Signed-off-by: Tejun Heo Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden --- include/linux/sched/ext.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 74341dbc6a19..21c627337e01 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -98,13 +98,15 @@ enum scx_kf_mask { SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ /* all non-sleepables may be nested inside SLEEPABLE */ SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ + /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ + SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */ /* ops.dequeue (in REST) may be nested inside DISPATCH */ SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ SCX_KF_REST = 1 << 5, /* other rq-locked operations */ - __SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH | + __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; -- cgit v1.2.3 From 7b0888b7cc1924b74ce660e02f6211df8dd46e7b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:20 -1000 Subject: sched_ext: Implement core-sched support The core-sched support is composed of the following parts: - task_struct->scx.core_sched_at is added. This is a timestamp which can be used to order tasks. Depending on whether the BPF scheduler implements custom ordering, it tracks either global FIFO ordering of all tasks or local-DSQ ordering within the dispatched tasks on a CPU. - prio_less() is updated to call scx_prio_less() when comparing SCX tasks. scx_prio_less() calls ops.core_sched_before() if available or uses the core_sched_at timestamp. For global FIFO ordering, the BPF scheduler doesn't need to do anything. Otherwise, it should implement ops.core_sched_before() which reflects the ordering. - When core-sched is enabled, balance_scx() balances all SMT siblings so that they all have tasks dispatched if necessary before pick_task_scx() is called. pick_task_scx() picks between the current task and the first dispatched task on the local DSQ based on availability and the core_sched_at timestamps. Note that FIFO ordering is expected among the already dispatched tasks whether running or on the local DSQ, so this path always compares core_sched_at instead of calling into ops.core_sched_before(). qmap_core_sched_before() is added to scx_qmap. It scales the distances from the heads of the queues to compare the tasks across different priority queues and seems to behave as expected. v3: Fixed build error when !CONFIG_SCHED_SMT reported by Andrea Righi. v2: Sched core added the const qualifiers to prio_less task arguments. Explicitly drop them for ops.core_sched_before() task arguments. BPF enforces access control through the verifier, so the qualifier isn't actually operative and only gets in the way when interacting with various helpers. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Reviewed-by: Josh Don Cc: Andrea Righi --- include/linux/sched/ext.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 21c627337e01..3db7b32b2d1d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -129,6 +129,9 @@ struct sched_ext_entity { struct list_head runnable_node; /* rq->scx.runnable_list */ unsigned long runnable_at; +#ifdef CONFIG_SCHED_CORE + u64 core_sched_at; /* see scx_prio_less() */ +#endif u64 ddsp_dsq_id; u64 ddsp_enq_flags; -- cgit v1.2.3 From 06e51be3d5e7a07aea5c9012773df8d5de01db6c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:21 -1000 Subject: sched_ext: Add vtime-ordered priority queue to dispatch_q's Currently, a dsq is always a FIFO. A task which is dispatched earlier gets consumed or executed earlier. While this is sufficient when dsq's are used for simple staging areas for tasks which are ready to execute, it'd make dsq's a lot more useful if they can implement custom ordering. This patch adds a vtime-ordered priority queue to dsq's. When the BPF scheduler dispatches a task with the new scx_bpf_dispatch_vtime() helper, it can specify the vtime tha the task should be inserted at and the task is inserted into the priority queue in the dsq which is ordered according to time_before64() comparison of the vtime values. A DSQ can either be a FIFO or priority queue and automatically switches between the two depending on whether scx_bpf_dispatch() or scx_bpf_dispatch_vtime() is used. Using the wrong variant while the DSQ already has the other type queued is not allowed and triggers an ops error. Built-in DSQs must always be FIFOs. This makes it very easy for the BPF schedulers to implement proper vtime based scheduling within each dsq very easy and efficient at a negligible cost in terms of code complexity and overhead. scx_simple and scx_example_flatcg are updated to default to weighted vtime scheduling (the latter within each cgroup). FIFO scheduling can be selected with -f option. v4: - As allowing mixing priority queue and FIFO on the same DSQ sometimes led to unexpected starvations, DSQs now error out if both modes are used at the same time and the built-in DSQs are no longer allowed to be priority queues. - Explicit type struct scx_dsq_node added to contain fields needed to be linked on DSQs. This will be used to implement stateful iterator. - Tasks are now always linked on dsq->list whether the DSQ is in FIFO or PRIQ mode. This confines PRIQ related complexities to the enqueue and dequeue paths. Other paths only need to look at dsq->list. This will also ease implementing BPF iterator. - Print p->scx.dsq_flags in debug dump. v3: - SCX_TASK_DSQ_ON_PRIQ flag is moved from p->scx.flags into its own p->scx.dsq_flags. The flag is protected with the dsq lock unlike other flags in p->scx.flags. This led to flag corruption in some cases. - Add comments explaining the interaction between using consumption of p->scx.slice to determine vtime progress and yielding. v2: - p->scx.dsq_vtime was not initialized on load or across cgroup migrations leading to some tasks being stalled for extended period of time depending on how saturated the machine is. Fixed. Signed-off-by: Tejun Heo Reviewed-by: David Vernet --- include/linux/sched/ext.h | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3db7b32b2d1d..9cee193dab19 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -49,12 +49,15 @@ enum scx_dsq_id_flags { }; /* - * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the - * scheduler core and the BPF scheduler. See the documentation for more details. + * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered + * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to + * buffer between the scheduler core and the BPF scheduler. See the + * documentation for more details. */ struct scx_dispatch_q { raw_spinlock_t lock; struct list_head list; /* tasks in dispatch order */ + struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; u64 id; struct rhash_head hash_node; @@ -86,6 +89,11 @@ enum scx_task_state { SCX_TASK_NR_STATES, }; +/* scx_entity.dsq_flags */ +enum scx_ent_dsq_flags { + SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ +}; + /* * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from * everywhere and the following bits track which kfunc sets are currently @@ -111,13 +119,19 @@ enum scx_kf_mask { __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; +struct scx_dsq_node { + struct list_head list; /* dispatch order */ + struct rb_node priq; /* p->scx.dsq_vtime order */ + u32 flags; /* SCX_TASK_DSQ_* flags */ +}; + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. */ struct sched_ext_entity { struct scx_dispatch_q *dsq; - struct list_head dsq_node; + struct scx_dsq_node dsq_node; /* protected by dsq lock */ u32 flags; /* protected by rq lock */ u32 weight; s32 sticky_cpu; @@ -149,6 +163,15 @@ struct sched_ext_entity { */ u64 slice; + /* + * Used to order tasks when dispatching to the vtime-ordered priority + * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() + * but can also be modified directly by the BPF scheduler. Modifying it + * while a task is queued on a dsq may mangle the ordering and is not + * recommended. + */ + u64 dsq_vtime; + /* * If set, reject future sched_setscheduler(2) calls updating the policy * to %SCHED_EXT with -%EACCES. -- cgit v1.2.3 From fa48e8d2c7b58d242c1db3a09c14f4274e055087 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jun 2024 10:09:21 -1000 Subject: sched_ext: Documentation: scheduler: Document extensible scheduler class Add Documentation/scheduler/sched-ext.rst which gives a high-level overview and pointers to the examples. v6: - Add paragraph explaining debug dump. v5: - Updated to reflect /sys/kernel interface change. Kconfig options added. v4: - README improved, reformatted in markdown and renamed to README.md. v3: - Added tools/sched_ext/README. - Dropped _example prefix from scheduler names. v2: - Apply minor edits suggested by Bagas. Caveats section dropped as all of them are addressed. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Bagas Sanjaya --- include/linux/sched/ext.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9cee193dab19..fe9a67ffe6b1 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -1,5 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet -- cgit v1.2.3 From d4af01c3731ff9c6e224d7183f8226a56d72b56c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jul 2024 14:30:55 -1000 Subject: sched_ext: Take out ->priq and ->flags from scx_dsq_node struct scx_dsq_node contains two data structure nodes to link the containing task to a DSQ and a flags field that is protected by the lock of the associated DSQ. One reason why they are grouped into a struct is to use the type independently as a cursor node when iterating tasks on a DSQ. However, when iterating, the cursor only needs to be linked on the FIFO list and the rb_node part ends up inflating the size of the iterator data structure unnecessarily making it potentially too expensive to place it on stack. Take ->priq and ->flags out of scx_dsq_node and put them in sched_ext_entity as ->dsq_priq and ->dsq_flags, respectively. scx_dsq_node is renamed to scx_dsq_list_node and the field names are renamed accordingly. This will help implementing DSQ task iterator that can be allocated on stack. No functional change intended. Signed-off-by: Tejun Heo Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Cc: David Vernet --- include/linux/sched/ext.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index fe9a67ffe6b1..eb9cfd18a923 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -121,10 +121,8 @@ enum scx_kf_mask { __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; -struct scx_dsq_node { - struct list_head list; /* dispatch order */ - struct rb_node priq; /* p->scx.dsq_vtime order */ - u32 flags; /* SCX_TASK_DSQ_* flags */ +struct scx_dsq_list_node { + struct list_head node; }; /* @@ -133,7 +131,9 @@ struct scx_dsq_node { */ struct sched_ext_entity { struct scx_dispatch_q *dsq; - struct scx_dsq_node dsq_node; /* protected by dsq lock */ + struct scx_dsq_list_node dsq_list; /* dispatch order */ + struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ + u32 dsq_flags; /* protected by DSQ lock */ u32 flags; /* protected by rq lock */ u32 weight; s32 sticky_cpu; -- cgit v1.2.3 From 650ba21b131ed1f8ee57826b2c6295a3be221132 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jul 2024 14:30:55 -1000 Subject: sched_ext: Implement DSQ iterator DSQs are very opaque in the consumption path. The BPF scheduler has no way of knowing which tasks are being considered and which is picked. This patch adds BPF DSQ iterator. - Allows iterating tasks queued on a DSQ in the dispatch order or reverse from anywhere using bpf_for_each(scx_dsq) or calling the iterator kfuncs directly. - Has ordering guarantee where only tasks which were already queued when the iteration started are visible and consumable during the iteration. v5: - Add a comment to the naked list_empty(&dsq->list) test in consume_dispatch_q() to explain the reasoning behind the lockless test and by extension why nldsq_next_task() isn't used there. - scx_qmap changes separated into its own patch. v4: - bpf_iter_scx_dsq_new() declaration in common.bpf.h was using the wrong type for the last argument (bool rev instead of u64 flags). Fix it. v3: - Alexei pointed out that the iterator is too big to allocate on stack. Added a prep patch to reduce the size of the cursor. Now bpf_iter_scx_dsq is 48 bytes and bpf_iter_scx_dsq_kern is 40 bytes on 64bit. - u32_before() comparison factored out. v2: - scx_bpf_consume_task() is separated out into a separate patch. - DSQ seq and iter flags don't need to be u64. Use u32. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Alexei Starovoitov Cc: bpf@vger.kernel.org --- include/linux/sched/ext.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index eb9cfd18a923..593d2f4909dd 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -61,6 +61,7 @@ struct scx_dispatch_q { struct list_head list; /* tasks in dispatch order */ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; + u32 seq; /* used by BPF iter */ u64 id; struct rhash_head hash_node; struct llist_node free_node; @@ -123,6 +124,7 @@ enum scx_kf_mask { struct scx_dsq_list_node { struct list_head node; + bool is_bpf_iter_cursor; }; /* @@ -133,6 +135,7 @@ struct sched_ext_entity { struct scx_dispatch_q *dsq; struct scx_dsq_list_node dsq_list; /* dispatch order */ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ + u32 dsq_seq; u32 dsq_flags; /* protected by DSQ lock */ u32 flags; /* protected by rq lock */ u32 weight; -- cgit v1.2.3 From e8858fc07eb8386b9b6436d3e86eeb4b81e4f660 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 13 Jul 2024 23:00:27 -0700 Subject: Input: msc5000_ts - remove the driver MCS-5000 belongs to the 1st generation of Melfas chips, manufactured in 2000-2007. The driver relies on custom platform data (no DT support) and there never were any users of this driver in the mainline kernel. The commit adding the driver mentioned that the driver was tested on S3C6410 NCP board (with Samsung S3C6410 SoC) but the touchscreen device was never added to the board file. This board was removed in v6.3 in commit 743c8fbb90ca ("ARM: s3c: remove most s3c64xx board support"). Remove the driver since there are no users. Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240714060029.1528662-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/mcs.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/mcs.h b/include/linux/platform_data/mcs.h index fcc6f2a1f5c3..f3b0749f1630 100644 --- a/include/linux/platform_data/mcs.h +++ b/include/linux/platform_data/mcs.h @@ -16,10 +16,6 @@ struct mcs_platform_data { void (*poweron)(bool); void (*cfg_pin)(void); - /* touchscreen */ - unsigned int x_size; - unsigned int y_size; - /* touchkey */ const u32 *keymap; unsigned int keymap_size; -- cgit v1.2.3 From dd29eadee1e8f07bbec57dddf4befbcd3e3c0af7 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 13 Jul 2024 23:00:28 -0700 Subject: Input: msc_touchkey - remove the driver MCS-5000/5080 chips belong to the 1st generation of Melfas chips, manufactured in 2000-2007. The driver relies on custom platform data (no DT support) and there never were any users of this driver in the mainline kernel. It is likely that the driver was (like mcs5000_ts driver) was tested on S3C6410 NCP board (with Samsung S3C6410 SoC), but the touchkey device was never added to the board file. This board was removed in v6.3 in commit 743c8fbb90ca ("ARM: s3c: remove most s3c64xx board support"). Remove the driver since there are no users. Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240714060029.1528662-2-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/mcs.h | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 include/linux/platform_data/mcs.h (limited to 'include/linux') diff --git a/include/linux/platform_data/mcs.h b/include/linux/platform_data/mcs.h deleted file mode 100644 index f3b0749f1630..000000000000 --- a/include/linux/platform_data/mcs.h +++ /dev/null @@ -1,26 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (C) 2009 - 2010 Samsung Electronics Co.Ltd - * Author: Joonyoung Shim - * Author: HeungJun Kim - */ - -#ifndef __LINUX_MCS_H -#define __LINUX_MCS_H - -#define MCS_KEY_MAP(v, c) ((((v) & 0xff) << 16) | ((c) & 0xffff)) -#define MCS_KEY_VAL(v) (((v) >> 16) & 0xff) -#define MCS_KEY_CODE(v) ((v) & 0xffff) - -struct mcs_platform_data { - void (*poweron)(bool); - void (*cfg_pin)(void); - - /* touchkey */ - const u32 *keymap; - unsigned int keymap_size; - unsigned int key_maxval; - bool no_autorepeat; -}; - -#endif /* __LINUX_MCS_H */ -- cgit v1.2.3 From e1a261ba599eec97e1c5c7760d5c3698fc24e6a6 Mon Sep 17 00:00:00 2001 From: Jocelyn Falempe Date: Tue, 2 Jul 2024 14:26:04 +0200 Subject: printk: Add a short description string to kmsg_dump() kmsg_dump doesn't forward the panic reason string to the kmsg_dumper callback. This patch adds a new struct kmsg_dump_detail, that will hold the reason and description, and pass it to the dump() callback. To avoid updating all kmsg_dump() call, it adds a kmsg_dump_desc() function and a macro for backward compatibility. I've written this for drm_panic, but it can be useful for other kmsg_dumper. It allows to see the panic reason, like "sysrq triggered crash" or "VFS: Unable to mount root fs on xxxx" on the drm panic screen. v2: * Use a struct kmsg_dump_detail to hold the reason and description pointer, for more flexibility if we want to add other parameters. (Kees Cook) * Fix powerpc/nvram_64 build, as I didn't update the forward declaration of oops_to_nvram() Signed-off-by: Jocelyn Falempe Acked-by: Petr Mladek Acked-by: Michael Ellerman (powerpc) Acked-by: Kees Cook Link: https://patchwork.freedesktop.org/patch/msgid/20240702122639.248110-1-jfalempe@redhat.com --- include/linux/kmsg_dump.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 906521c2329c..6055fc969877 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -39,6 +39,17 @@ struct kmsg_dump_iter { u64 next_seq; }; +/** + * struct kmsg_dump_detail - kernel crash detail + * @reason: reason for the crash, see kmsg_dump_reason. + * @description: optional short string, to provide additional information. + */ + +struct kmsg_dump_detail { + enum kmsg_dump_reason reason; + const char *description; +}; + /** * struct kmsg_dumper - kernel crash message dumper structure * @list: Entry in the dumper list (private) @@ -49,13 +60,13 @@ struct kmsg_dump_iter { */ struct kmsg_dumper { struct list_head list; - void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); + void (*dump)(struct kmsg_dumper *dumper, struct kmsg_dump_detail *detail); enum kmsg_dump_reason max_reason; bool registered; }; #ifdef CONFIG_PRINTK -void kmsg_dump(enum kmsg_dump_reason reason); +void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc); bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len); @@ -71,7 +82,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper); const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason); #else -static inline void kmsg_dump(enum kmsg_dump_reason reason) +static inline void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc) { } @@ -107,4 +118,9 @@ static inline const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason) } #endif +static inline void kmsg_dump(enum kmsg_dump_reason reason) +{ + kmsg_dump_desc(reason, NULL); +} + #endif /* _LINUX_KMSG_DUMP_H */ -- cgit v1.2.3 From d20a9f568f99591886ec73b80ff7283486710643 Mon Sep 17 00:00:00 2001 From: Jocelyn Falempe Date: Wed, 17 Jul 2024 10:48:40 +0200 Subject: fbcon: Add an option to disable fbcon in panic This is required to avoid conflict between DRM_PANIC, and fbcon. If a drm device already handle panic with drm_panic, it should set the skip_panic field in fb_info, so that fbcon will stay quiet, and not overwrite the panic_screen. Signed-off-by: Jocelyn Falempe Reviewed-by: Javier Martinez Canillas Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20240717090102.968152-3-jfalempe@redhat.com --- include/linux/fb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index db7d97b10964..865dad03e73e 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -510,6 +510,7 @@ struct fb_info { void *par; bool skip_vt_switch; /* no VT switch on suspend/resume required */ + bool skip_panic; /* Do not write to the fb after a panic */ }; /* This will go away -- cgit v1.2.3 From a838e5dca63d1dc701e63b2b1176943c57485c45 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 15 Jul 2024 21:05:32 +0800 Subject: quota: remove unneeded return value of register_quota_format The register_quota_format always returns 0, simply remove unneeded return value. Link: https://patch.msgid.link/20240715130534.2112678-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Joseph Qi Signed-off-by: Jan Kara --- include/linux/quota.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/quota.h b/include/linux/quota.h index 07071e64abf3..89a0d83ddad0 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -526,7 +526,7 @@ struct quota_info { const struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ }; -int register_quota_format(struct quota_format_type *fmt); +void register_quota_format(struct quota_format_type *fmt); void unregister_quota_format(struct quota_format_type *fmt); struct quota_module_name { -- cgit v1.2.3 From d5e79eeba3086a52593b295ac4bf6eddd64d4aad Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Sat, 20 Jul 2024 15:15:42 +0800 Subject: dma-buf: heaps: Deduplicate docs and adopt common format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docs for dma_heap_get_name were incorrect, and since they were duplicated in the header they were wrong there too. The docs formatting was inconsistent so I tried to make it more consistent across functions since I'm already in here doing cleanup. Remove multiple unused includes and alphabetize. Signed-off-by: T.J. Mercier Signed-off-by: Yong Wu [Yong: Just add a comment for "priv" to mute build warning] Signed-off-by: Yunfei Dong Link: https://patchwork.freedesktop.org/patch/msgid/20240720071606.27930-5-yunfei.dong@mediatek.com Signed-off-by: Christian König --- include/linux/dma-heap.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-heap.h b/include/linux/dma-heap.h index 064bad725061..27d15f60950a 100644 --- a/include/linux/dma-heap.h +++ b/include/linux/dma-heap.h @@ -9,14 +9,13 @@ #ifndef _DMA_HEAPS_H #define _DMA_HEAPS_H -#include #include struct dma_heap; /** * struct dma_heap_ops - ops to operate on a given heap - * @allocate: allocate dmabuf and return struct dma_buf ptr + * @allocate: allocate dmabuf and return struct dma_buf ptr * * allocate returns dmabuf on success, ERR_PTR(-errno) on error. */ @@ -41,28 +40,10 @@ struct dma_heap_export_info { void *priv; }; -/** - * dma_heap_get_drvdata() - get per-heap driver data - * @heap: DMA-Heap to retrieve private data for - * - * Returns: - * The per-heap data for the heap. - */ void *dma_heap_get_drvdata(struct dma_heap *heap); -/** - * dma_heap_get_name() - get heap name - * @heap: DMA-Heap to retrieve private data for - * - * Returns: - * The char* for the heap name. - */ const char *dma_heap_get_name(struct dma_heap *heap); -/** - * dma_heap_add - adds a heap to dmabuf heaps - * @exp_info: information needed to register this heap - */ struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info); #endif /* _DMA_HEAPS_H */ -- cgit v1.2.3 From 23a55f4492fcf868d068da31a2cd30c15f46207d Mon Sep 17 00:00:00 2001 From: Ofir Gal Date: Thu, 18 Jul 2024 11:45:12 +0300 Subject: net: introduce helper sendpages_ok() Network drivers are using sendpage_ok() to check the first page of an iterator in order to disable MSG_SPLICE_PAGES. The iterator can represent list of contiguous pages. When MSG_SPLICE_PAGES is enabled skb_splice_from_iter() is being used, it requires all pages in the iterator to be sendable. Therefore it needs to check that each page is sendable. The patch introduces a helper sendpages_ok(), it returns true if all the contiguous pages are sendable. Drivers who want to send contiguous pages with MSG_SPLICE_PAGES may use this helper to check whether the page list is OK. If the helper does not return true, the driver should remove MSG_SPLICE_PAGES flag. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Ofir Gal Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20240718084515.3833733-2-ofir.gal@volumez.com Signed-off-by: Jens Axboe --- include/linux/net.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 688320b79fcc..b75bc534c1b3 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -322,6 +322,25 @@ static inline bool sendpage_ok(struct page *page) return !PageSlab(page) && page_count(page) >= 1; } +/* + * Check sendpage_ok on contiguous pages. + */ +static inline bool sendpages_ok(struct page *page, size_t len, size_t offset) +{ + struct page *p = page + (offset >> PAGE_SHIFT); + size_t count = 0; + + while (count < len) { + if (!sendpage_ok(p)) + return false; + + p++; + count += PAGE_SIZE; + } + + return true; +} + int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, -- cgit v1.2.3 From 320f6693097bf89d67f9cabad24a2b911e23073f Mon Sep 17 00:00:00 2001 From: Marcelo Schmitt Date: Fri, 12 Jul 2024 16:21:03 -0300 Subject: spi: bitbang: Implement support for MOSI idle state configuration Some SPI peripherals may require strict MOSI line state when the controller is not clocking out data. Implement support for MOSI idle state configuration (low or high) by setting the data output line level on controller setup and after transfers. Bitbang operations now call controller specific set_mosi_idle() callback to set MOSI to its idle state. The MOSI line is kept at its idle state if no tx buffer is provided. Acked-by: Nuno Sa Reviewed-by: David Lechner Reviewed-by: Jonathan Cameron Signed-off-by: Marcelo Schmitt Link: https://patch.msgid.link/de61a600b56ed9cb714d5ea87afa88948e70041e.1720810545.git.marcelo.schmitt@analog.com Signed-off-by: Mark Brown --- include/linux/spi/spi_bitbang.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/spi/spi_bitbang.h b/include/linux/spi/spi_bitbang.h index d4cb83195f7a..c92cd43a47f4 100644 --- a/include/linux/spi/spi_bitbang.h +++ b/include/linux/spi/spi_bitbang.h @@ -24,6 +24,7 @@ struct spi_bitbang { #define BITBANG_CS_ACTIVE 1 /* normally nCS, active low */ #define BITBANG_CS_INACTIVE 0 + void (*set_mosi_idle)(struct spi_device *spi); /* txrx_bufs() may handle dma mapping for transfers that don't * already have one (transfer.{tx,rx}_dma is zero), or use PIO */ -- cgit v1.2.3 From d65d411c9259a2499081e1e7ed91088232666b57 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 25 Jul 2023 12:08:50 +0100 Subject: treewide: context_tracking: Rename CONTEXT_* into CT_STATE_* Context tracking state related symbols currently use a mix of the CONTEXT_ (e.g. CONTEXT_KERNEL) and CT_SATE_ (e.g. CT_STATE_MASK) prefixes. Clean up the naming and make the ctx_state enum use the CT_STATE_ prefix. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Acked-by: Frederic Weisbecker Acked-by: Thomas Gleixner Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking.h | 16 ++++++++-------- include/linux/context_tracking_state.h | 20 ++++++++++---------- include/linux/entry-common.h | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 6e76b9dba00e..28fcfa184903 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -26,26 +26,26 @@ extern void user_exit_callable(void); static inline void user_enter(void) { if (context_tracking_enabled()) - ct_user_enter(CONTEXT_USER); + ct_user_enter(CT_STATE_USER); } static inline void user_exit(void) { if (context_tracking_enabled()) - ct_user_exit(CONTEXT_USER); + ct_user_exit(CT_STATE_USER); } /* Called with interrupts disabled. */ static __always_inline void user_enter_irqoff(void) { if (context_tracking_enabled()) - __ct_user_enter(CONTEXT_USER); + __ct_user_enter(CT_STATE_USER); } static __always_inline void user_exit_irqoff(void) { if (context_tracking_enabled()) - __ct_user_exit(CONTEXT_USER); + __ct_user_exit(CT_STATE_USER); } static inline enum ctx_state exception_enter(void) @@ -57,7 +57,7 @@ static inline enum ctx_state exception_enter(void) return 0; prev_ctx = __ct_state(); - if (prev_ctx != CONTEXT_KERNEL) + if (prev_ctx != CT_STATE_KERNEL) ct_user_exit(prev_ctx); return prev_ctx; @@ -67,7 +67,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) { if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK) && context_tracking_enabled()) { - if (prev_ctx != CONTEXT_KERNEL) + if (prev_ctx != CT_STATE_KERNEL) ct_user_enter(prev_ctx); } } @@ -75,7 +75,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) static __always_inline bool context_tracking_guest_enter(void) { if (context_tracking_enabled()) - __ct_user_enter(CONTEXT_GUEST); + __ct_user_enter(CT_STATE_GUEST); return context_tracking_enabled_this_cpu(); } @@ -83,7 +83,7 @@ static __always_inline bool context_tracking_guest_enter(void) static __always_inline void context_tracking_guest_exit(void) { if (context_tracking_enabled()) - __ct_user_exit(CONTEXT_GUEST); + __ct_user_exit(CT_STATE_GUEST); } #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index bbff5f7f8803..f1c53125edee 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -10,18 +10,18 @@ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) enum ctx_state { - CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */ - CONTEXT_KERNEL = 0, - CONTEXT_IDLE = 1, - CONTEXT_USER = 2, - CONTEXT_GUEST = 3, - CONTEXT_MAX = 4, + CT_STATE_DISABLED = -1, /* returned by ct_state() if unknown */ + CT_STATE_KERNEL = 0, + CT_STATE_IDLE = 1, + CT_STATE_USER = 2, + CT_STATE_GUEST = 3, + CT_STATE_MAX = 4, }; /* Even value for idle, else odd. */ -#define RCU_DYNTICKS_IDX CONTEXT_MAX +#define RCU_DYNTICKS_IDX CT_STATE_MAX -#define CT_STATE_MASK (CONTEXT_MAX - 1) +#define CT_STATE_MASK (CT_STATE_MAX - 1) #define CT_DYNTICKS_MASK (~CT_STATE_MASK) struct context_tracking { @@ -123,14 +123,14 @@ static inline bool context_tracking_enabled_this_cpu(void) * * Returns the current cpu's context tracking state if context tracking * is enabled. If context tracking is disabled, returns - * CONTEXT_DISABLED. This should be used primarily for debugging. + * CT_STATE_DISABLED. This should be used primarily for debugging. */ static __always_inline int ct_state(void) { int ret; if (!context_tracking_enabled()) - return CONTEXT_DISABLED; + return CT_STATE_DISABLED; preempt_disable(); ret = __ct_state(); diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b0fb775a600d..1e50cdb83ae5 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -108,7 +108,7 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) arch_enter_from_user_mode(regs); lockdep_hardirqs_off(CALLER_ADDR0); - CT_WARN_ON(__ct_state() != CONTEXT_USER); + CT_WARN_ON(__ct_state() != CT_STATE_USER); user_exit_irqoff(); instrumentation_begin(); -- cgit v1.2.3 From 4aa35e0db6d758e8f952ed30d7ac3117c376562c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 25 Jul 2023 12:19:01 +0100 Subject: context_tracking, rcu: Rename RCU_DYNTICKS_IDX into CT_RCU_WATCHING The symbols relating to the CT_STATE part of context_tracking.state are now all prefixed with CT_STATE. The RCU dynticks counter part of that atomic variable still involves symbols with different prefixes, align them all to be prefixed with CT_RCU_WATCHING. Suggested-by: "Paul E. McKenney" Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Acked-by: Thomas Gleixner Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking.h | 6 +++--- include/linux/context_tracking_state.h | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 28fcfa184903..a6c36780cc3b 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -119,7 +119,7 @@ extern void ct_idle_exit(void); */ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) { - return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & RCU_DYNTICKS_IDX); + return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING); } /* @@ -142,7 +142,7 @@ static __always_inline bool warn_rcu_enter(void) preempt_disable_notrace(); if (rcu_dynticks_curr_cpu_in_eqs()) { ret = true; - ct_state_inc(RCU_DYNTICKS_IDX); + ct_state_inc(CT_RCU_WATCHING); } return ret; @@ -151,7 +151,7 @@ static __always_inline bool warn_rcu_enter(void) static __always_inline void warn_rcu_exit(bool rcu) { if (rcu) - ct_state_inc(RCU_DYNTICKS_IDX); + ct_state_inc(CT_RCU_WATCHING); preempt_enable_notrace(); } diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index f1c53125edee..94d6a935af3b 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -18,11 +18,11 @@ enum ctx_state { CT_STATE_MAX = 4, }; -/* Even value for idle, else odd. */ -#define RCU_DYNTICKS_IDX CT_STATE_MAX +/* Odd value for watching, else even. */ +#define CT_RCU_WATCHING CT_STATE_MAX #define CT_STATE_MASK (CT_STATE_MAX - 1) -#define CT_DYNTICKS_MASK (~CT_STATE_MASK) +#define CT_RCU_WATCHING_MASK (~CT_STATE_MASK) struct context_tracking { #ifdef CONFIG_CONTEXT_TRACKING_USER @@ -58,21 +58,21 @@ static __always_inline int __ct_state(void) #ifdef CONFIG_CONTEXT_TRACKING_IDLE static __always_inline int ct_dynticks(void) { - return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_DYNTICKS_MASK; + return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK; } static __always_inline int ct_dynticks_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); - return atomic_read(&ct->state) & CT_DYNTICKS_MASK; + return atomic_read(&ct->state) & CT_RCU_WATCHING_MASK; } static __always_inline int ct_dynticks_cpu_acquire(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); - return atomic_read_acquire(&ct->state) & CT_DYNTICKS_MASK; + return atomic_read_acquire(&ct->state) & CT_RCU_WATCHING_MASK; } static __always_inline long ct_dynticks_nesting(void) -- cgit v1.2.3 From a4a7921ec08d016f9d692752e2f82f66369c7ffa Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 10:45:56 +0200 Subject: context_tracking, rcu: Rename ct_dynticks() into ct_rcu_watching() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 94d6a935af3b..cb90d8c17810 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -56,7 +56,7 @@ static __always_inline int __ct_state(void) #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE -static __always_inline int ct_dynticks(void) +static __always_inline int ct_rcu_watching(void) { return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK; } -- cgit v1.2.3 From a9fde9d1a5dd42edadf61cf4e64aa234b4c5bd3f Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 10:47:14 +0200 Subject: context_tracking, rcu: Rename ct_dynticks_cpu() into ct_rcu_watching_cpu() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index cb90d8c17810..ad5a06a42b4a 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -61,7 +61,7 @@ static __always_inline int ct_rcu_watching(void) return atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING_MASK; } -static __always_inline int ct_dynticks_cpu(int cpu) +static __always_inline int ct_rcu_watching_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); -- cgit v1.2.3 From 125716c393889f9035567d4d78da239483f63ece Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:47:12 +0200 Subject: context_tracking, rcu: Rename ct_dynticks_cpu_acquire() into ct_rcu_watching_cpu_acquire() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index ad5a06a42b4a..ad6570ffeff3 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -68,7 +68,7 @@ static __always_inline int ct_rcu_watching_cpu(int cpu) return atomic_read(&ct->state) & CT_RCU_WATCHING_MASK; } -static __always_inline int ct_dynticks_cpu_acquire(int cpu) +static __always_inline int ct_rcu_watching_cpu_acquire(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); -- cgit v1.2.3 From 7aeba709a048d870c15940af8b620b16281c3b9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:42 +0200 Subject: rcu/nocb: Introduce RCU_NOCB_LOCKDEP_WARN() Checking for races against concurrent (de-)offloading implies the creation of !CONFIG_RCU_NOCB_CPU stubs to check if each relevant lock is held. For now this only implies the nocb_lock but more are to be expected. Create instead a NOCB specific version of RCU_LOCKDEP_WARN() to avoid the proliferation of stubs. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13f6f00aecf9..d48d3c237305 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -144,11 +144,18 @@ void rcu_init_nohz(void); int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); void rcu_nocb_flush_deferred_wakeup(void); + +#define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s) + #else /* #ifdef CONFIG_RCU_NOCB_CPU */ + static inline void rcu_init_nohz(void) { } static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } + +#define RCU_NOCB_LOCKDEP_WARN(c, s) + #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* -- cgit v1.2.3 From bae6076ebbd14cc1f1bd4de65c2db21d3ab109d7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:50 +0200 Subject: rcu/nocb: Remove SEGCBLIST_RCU_CORE RCU core can't be running anymore while in the middle of (de-)offloading since this sort of transition now only applies to offline CPUs. The SEGCBLIST_RCU_CORE state can therefore be removed. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcu_segcblist.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index ba95c06675e1..5469c54cd778 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -185,11 +185,10 @@ struct rcu_cblist { * ---------------------------------------------------------------------------- */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_RCU_CORE BIT(1) -#define SEGCBLIST_LOCKING BIT(2) -#define SEGCBLIST_KTHREAD_CB BIT(3) -#define SEGCBLIST_KTHREAD_GP BIT(4) -#define SEGCBLIST_OFFLOADED BIT(5) +#define SEGCBLIST_LOCKING BIT(1) +#define SEGCBLIST_KTHREAD_CB BIT(2) +#define SEGCBLIST_KTHREAD_GP BIT(3) +#define SEGCBLIST_OFFLOADED BIT(4) struct rcu_segcblist { struct rcu_head *head; -- cgit v1.2.3 From 91e43b9044a4f9b6976dc28b3f1446b733cc68de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 30 May 2024 15:45:51 +0200 Subject: rcu/nocb: Remove SEGCBLIST_KTHREAD_CB This state excerpt from the (de-)offloading state machine was used to implement an ad-hoc kthread parking of rcuo kthreads. This code has been removed and therefore the related state can be erased as well. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcu_segcblist.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 5469c54cd778..1ef1bb54853d 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -186,9 +186,8 @@ struct rcu_cblist { */ #define SEGCBLIST_ENABLED BIT(0) #define SEGCBLIST_LOCKING BIT(1) -#define SEGCBLIST_KTHREAD_CB BIT(2) -#define SEGCBLIST_KTHREAD_GP BIT(3) -#define SEGCBLIST_OFFLOADED BIT(4) +#define SEGCBLIST_KTHREAD_GP BIT(2) +#define SEGCBLIST_OFFLOADED BIT(3) struct rcu_segcblist { struct rcu_head *head; -- cgit v1.2.3 From 59c34008d3bdeef4c8ebc0ed2426109b474334d4 Mon Sep 17 00:00:00 2001 From: Shyam Sundar S K Date: Mon, 22 Jul 2024 14:58:01 +0530 Subject: x86/amd_nb: Add new PCI IDs for AMD family 1Ah model 60h Add new PCI device IDs into the root IDs and miscellaneous IDs lists to provide support for the latest generation of AMD 1Ah family 60h processor models. Signed-off-by: Shyam Sundar S K Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Link: https://lore.kernel.org/r/20240722092801.3480266-1-Shyam-sundar.S-k@amd.com --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index e388c8b1cbc2..91182aa1d2ec 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -580,6 +580,7 @@ #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F3 0x12fb #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb +#define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F3 0x124b #define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3 0x12bb #define PCI_DEVICE_ID_AMD_MI200_DF_F3 0x14d3 #define PCI_DEVICE_ID_AMD_MI300_DF_F3 0x152b -- cgit v1.2.3 From 52c3fb1a0f822fd64529ca64f3792095524de450 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jul 2024 10:21:49 +0200 Subject: perf/x86: Add hw_perf_event::aux_config Start a new section for AUX PMUs in hw_perf_event. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1a8942277dda..6bb0c21d6335 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -168,6 +168,9 @@ struct hw_perf_event { struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; }; + struct { /* aux / Intel-PT */ + u64 aux_config; + }; struct { /* software */ struct hrtimer hrtimer; }; -- cgit v1.2.3 From f23c042ce34ba265cf3129d530702b5d218e3f4b Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 27 May 2024 14:06:47 +0200 Subject: sched/deadline: Comment sched_dl_entity::dl_server variable Add an explanation for the newly added variable. Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Tested-by: Juri Lelli Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/147f7aa8cb8fd925f36aa8059af6a35aad08b45a.1716811044.git.bristot@kernel.org --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f8d150343d42..1c771ea4481d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -639,6 +639,8 @@ struct sched_dl_entity { * * @dl_overrun tells if the task asked to be informed about runtime * overruns. + * + * @dl_server tells if this is a server entity. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; -- cgit v1.2.3 From a110a81c52a9de73e2e57e826dd3bf0fd4c22226 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 27 May 2024 14:06:51 +0200 Subject: sched/deadline: Deferrable dl server Among the motivations for the DL servers is the real-time throttling mechanism. This mechanism works by throttling the rt_rq after running for a long period without leaving space for fair tasks. The base dl server avoids this problem by boosting fair tasks instead of throttling the rt_rq. The point is that it boosts without waiting for potential starvation, causing some non-intuitive cases. For example, an IRQ dispatches two tasks on an idle system, a fair and an RT. The DL server will be activated, running the fair task before the RT one. This problem can be avoided by deferring the dl server activation. By setting the defer option, the dl_server will dispatch an SCHED_DEADLINE reservation with replenished runtime, but throttled. The dl_timer will be set for the defer time at (period - runtime) ns from start time. Thus boosting the fair rq at defer time. If the fair scheduler has the opportunity to run while waiting for defer time, the dl server runtime will be consumed. If the runtime is completely consumed before the defer time, the server will be replenished while still in a throttled state. Then, the dl_timer will be reset to the new defer time If the fair server reaches the defer time without consuming its runtime, the server will start running, following CBS rules (thus without breaking SCHED_DEADLINE). Then the server will continue the running state (without deferring) until it fair tasks are able to execute as regular fair scheduler (end of the starvation). Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Tested-by: Juri Lelli Link: https://lore.kernel.org/r/dd175943c72533cd9f0b87767c6499204879cc38.1716811044.git.bristot@kernel.org --- include/linux/sched.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c771ea4481d..4edd7e2096fb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -641,12 +641,24 @@ struct sched_dl_entity { * overruns. * * @dl_server tells if this is a server entity. + * + * @dl_defer tells if this is a deferred or regular server. For + * now only defer server exists. + * + * @dl_defer_armed tells if the deferrable server is waiting + * for the replenishment timer to activate it. + * + * @dl_defer_running tells if the deferrable server is actually + * running, skipping the defer phase. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; + unsigned int dl_defer : 1; + unsigned int dl_defer_armed : 1; + unsigned int dl_defer_running : 1; /* * Bandwidth enforcement timer. Each -deadline task has its -- cgit v1.2.3 From c8a85394cfdb4696b4e2f8a0f3066a1c921af426 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Mon, 27 May 2024 14:06:54 +0200 Subject: sched/core: Fix picking of tasks for core scheduling with DL server * Use simple CFS pick_task for DL pick_task DL server's pick_task calls CFS's pick_next_task_fair(), this is wrong because core scheduling's pick_task only calls CFS's pick_task() for evaluation / checking of the CFS task (comparing across CPUs), not for actually affirmatively picking the next task. This causes RB tree corruption issues in CFS that were found by syzbot. * Make pick_task_fair clear DL server A DL task pick might set ->dl_server, but it is possible the task will never run (say the other HT has a stop task). If the CFS task is picked in the future directly (say without DL server), ->dl_server will be set. So clear it in pick_task_fair(). This fixes the KASAN issue reported by syzbot in set_next_entity(). (DL refactoring suggestions by Vineeth Pillai). Reported-by: Suleiman Souhlal Signed-off-by: "Joel Fernandes (Google)" Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vineeth Pillai Tested-by: Juri Lelli Link: https://lore.kernel.org/r/b10489ab1f03d23e08e6097acea47442e7d6466f.1716811044.git.bristot@kernel.org --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4edd7e2096fb..2c1b4ee3234f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -686,7 +686,8 @@ struct sched_dl_entity { */ struct rq *rq; dl_server_has_tasks_f server_has_tasks; - dl_server_pick_f server_pick; + dl_server_pick_f server_pick_next; + dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* -- cgit v1.2.3 From 5297bba507dc54045e6efeb9955c1271ca9aafe1 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 23 Jul 2024 15:27:01 +0200 Subject: genirq/msi: Silence 'set affinity failed' warning Various PCI controllers that mux MSIs onto a single IRQ line produce these "IRQ%d: set affinity failed" warnings when entering suspend. This has been discussed before [1] [2] and an example test case is included at the end of this commit message. Controller drivers that create MSI IRQ domain with MSI_FLAG_USE_DEF_CHIP_OPS and do not override the .irq_set_affinity() irqchip callback get assigned the default msi_domain_set_affinity() callback. That is not desired on controllers where it is not possible to set affinity of each MSI IRQ line to a specific CPU core due to hardware limitation. Introduce flag MSI_FLAG_NO_AFFINITY, which keeps .irq_set_affinity() unset if the controller driver did not assign it. This way, migrate_one_irq() can exit right away, without printing the warning. The .irq_set_affinity() implementations which only return -EINVAL can be removed from multiple controller drivers. $ grep 25 /proc/interrupts 25: 0 0 0 0 0 0 0 0 PCIe MSI 0 Edge PCIe PME $ echo core > /sys/power/pm_test ; echo mem > /sys/power/state ... Disabling non-boot CPUs ... IRQ25: set affinity failed(-22). <---------- This is being silenced here psci: CPU7 killed (polled 4 ms) ... [1] https://lore.kernel.org/all/d4a6eea3c5e33a3a4056885419df95a7@kernel.org/ [2] https://lore.kernel.org/all/5f4947b18bf381615a37aa81c2242477@kernel.org/ Link: https://lore.kernel.org/r/20240723132958.41320-2-marek.vasut+renesas@mailbox.org Signed-off-by: Marek Vasut [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Damien Le Moal Reviewed-by: Manivannan Sadhasivam Acked-by: Thomas Gleixner --- include/linux/msi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 944979763825..b10093c4d00e 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -554,6 +554,8 @@ enum { MSI_FLAG_MSIX_CONTIGUOUS = (1 << 19), /* PCI/MSI-X vectors can be dynamically allocated/freed post MSI-X enable */ MSI_FLAG_PCI_MSIX_ALLOC_DYN = (1 << 20), + /* PCI MSIs cannot be steered separately to CPU cores */ + MSI_FLAG_NO_AFFINITY = (1 << 21), }; /** -- cgit v1.2.3 From de79583ffe794663c53b77f97be814522d4edc4f Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Tue, 2 Jul 2024 18:02:33 +0200 Subject: iio: core: add accessors 'masklength' 'masklength' is supposed to be an IIO private member. However, drivers (often in trigger handlers) need to access it to iterate over the enabled channels for example (there are other reasons). Hence, a couple of new accessors are being added: * iio_for_each_active_channel() - Iterates over the active channels; * iio_get_masklength() - Get length of the channels mask. The goal of these new accessors is to annotate 'masklength' as private as soon as all drivers accessing it are converted to use the new helpers. Signed-off-by: Nuno Sa Reviewed-by: Alexandru Ardelean Link: https://patch.msgid.link/20240702-dev-iio-masklength-private-v1-1-98193bf536a6@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 894309294182..dd6bbc468283 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -855,6 +855,24 @@ static inline const struct iio_scan_type return &chan->scan_type; } +/** + * iio_get_masklength - Get length of the channels mask + * @indio_dev: the IIO device to get the masklength for + */ +static inline unsigned int iio_get_masklength(const struct iio_dev *indio_dev) +{ + return indio_dev->masklength; +} + +/** + * iio_for_each_active_channel - Iterated over active channels + * @indio_dev: the IIO device + * @chan: Holds the index of the enabled channel + */ +#define iio_for_each_active_channel(indio_dev, chan) \ + for_each_set_bit((chan), (indio_dev)->active_scan_mask, \ + iio_get_masklength(indio_dev)) + ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals); int iio_str_to_fixpoint(const char *str, int fract_mult, int *integer, -- cgit v1.2.3 From 4bf79f9be434e000c8e12fe83b2f4402480f1460 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 18 Jul 2024 13:23:53 -0700 Subject: bpf: Track equal scalars history on per-instruction level Use bpf_verifier_state->jmp_history to track which registers were updated by find_equal_scalars() (renamed to collect_linked_regs()) when conditional jump was verified. Use recorded information in backtrack_insn() to propagate precision. E.g. for the following program: while verifying instructions 1: r1 = r0 | 2: if r1 < 8 goto ... | push r0,r1 as linked registers in jmp_history 3: if r0 > 16 goto ... | push r0,r1 as linked registers in jmp_history 4: r2 = r10 | 5: r2 += r0 v mark_chain_precision(r0) while doing mark_chain_precision(r0) 5: r2 += r0 | mark r0 precise 4: r2 = r10 | 3: if r0 > 16 goto ... | mark r0,r1 as precise 2: if r1 < 8 goto ... | mark r0,r1 as precise 1: r1 = r0 v Technically, do this as follows: - Use 10 bits to identify each register that gains range because of sync_linked_regs(): - 3 bits for frame number; - 6 bits for register or stack slot number; - 1 bit to indicate if register is spilled. - Use u64 as a vector of 6 such records + 4 bits for vector length. - Augment struct bpf_jmp_history_entry with a field 'linked_regs' representing such vector. - When doing check_cond_jmp_op() remember up to 6 registers that gain range because of sync_linked_regs() in such a vector. - Don't propagate range information and reset IDs for registers that don't fit in 6-value vector. - Push a pair {instruction index, linked registers vector} to bpf_verifier_state->jmp_history. - When doing backtrack_insn() check if any of recorded linked registers is currently marked precise, if so mark all linked registers as precise. This also requires fixes for two test_verifier tests: - precise: test 1 - precise: test 2 Both tests contain the following instruction sequence: 19: (bf) r2 = r9 ; R2=scalar(id=3) R9=scalar(id=3) 20: (a5) if r2 < 0x8 goto pc+1 ; R2=scalar(id=3,umin=8) 21: (95) exit 22: (07) r2 += 1 ; R2_w=scalar(id=3+1,...) 23: (bf) r1 = r10 ; R1_w=fp0 R10=fp0 24: (07) r1 += -8 ; R1_w=fp-8 25: (b7) r3 = 0 ; R3_w=0 26: (85) call bpf_probe_read_kernel#113 The call to bpf_probe_read_kernel() at (26) forces r2 to be precise. Previously, this forced all registers with same id to become precise immediately when mark_chain_precision() is called. After this change, the precision is propagated to registers sharing same id only when 'if' instruction is backtracked. Hence verification log for both tests is changed: regs=r2,r9 -> regs=r2 for instructions 25..20. Fixes: 904e6ddf4133 ("bpf: Use scalar ids in mark_chain_precision()") Reported-by: Hao Sun Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240718202357.1746514-2-eddyz87@gmail.com Closes: https://lore.kernel.org/bpf/CAEf4BzZ0xidVCqB47XnkXcNhkPWF6_nTV7yt+_Lf0kcFEut2Mg@mail.gmail.com/ --- include/linux/bpf_verifier.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6503c85b10a3..731a0a4ac13c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -371,6 +371,10 @@ struct bpf_jmp_history_entry { u32 prev_idx : 22; /* special flags, e.g., whether insn is doing register stack spill/load */ u32 flags : 10; + /* additional registers that need precision tracking when this + * jump is backtracked, vector of six 10-bit records + */ + u64 linked_regs; }; /* Maximum number of register states that can exist at once */ -- cgit v1.2.3 From e42ac14180554fa23a3312d4f921dc4ea7972fb7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 22 Jul 2024 11:30:45 -0700 Subject: bpf: Check unsupported ops from the bpf_struct_ops's cfi_stubs The bpf_tcp_ca struct_ops currently uses a "u32 unsupported_ops[]" array to track which ops is not supported. After cfi_stubs had been added, the function pointer in cfi_stubs is also NULL for the unsupported ops. Thus, the "u32 unsupported_ops[]" becomes redundant. This observation was originally brought up in the bpf/cfi discussion: https://lore.kernel.org/bpf/CAADnVQJoEkdjyCEJRPASjBw1QGsKYrF33QdMGc1RZa9b88bAEA@mail.gmail.com/ The recent bpf qdisc patch (https://lore.kernel.org/bpf/20240714175130.4051012-6-amery.hung@bytedance.com/) also needs to specify quite many unsupported ops. It is a good time to clean it up. This patch removes the need of "u32 unsupported_ops[]" and tests for null-ness in the cfi_stubs instead. Testing the cfi_stubs is done in a new function bpf_struct_ops_supported(). The verifier will call bpf_struct_ops_supported() when loading the struct_ops program. The ".check_member" is removed from the bpf_tcp_ca in this patch. ".check_member" could still be useful for other subsytems to enforce other restrictions (e.g. sched_ext checks for prog->sleepable). To keep the same error return, ENOTSUPP is used. Cc: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240722183049.2254692-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3b94ec161e8c..4c54864316ee 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1795,6 +1795,7 @@ struct bpf_struct_ops_common_value { #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA)) bool bpf_struct_ops_get(const void *kdata); void bpf_struct_ops_put(const void *kdata); +int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff); int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, @@ -1851,6 +1852,10 @@ static inline void bpf_module_put(const void *data, struct module *owner) { module_put(owner); } +static inline int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) +{ + return -ENOTSUPP; +} static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value) -- cgit v1.2.3 From 52dea0a15cc888e89f0144dca7b712fb56d12a28 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:28 +0200 Subject: posix-timers: Convert timer list to hlist No requirement for a real list. Spare a few bytes. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) --- include/linux/posix-timers.h | 2 +- include/linux/sched/signal.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index dc7b738de299..453691710839 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -158,7 +158,7 @@ static inline void posix_cputimers_init_work(void) { } * @rcu: RCU head for freeing the timer. */ struct k_itimer { - struct list_head list; + struct hlist_node list; struct hlist_node t_hash; spinlock_t it_lock; const struct k_clock *kclock; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0a0e23c45406..622fdef78e14 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -137,7 +137,7 @@ struct signal_struct { /* POSIX.1b Interval Timers */ unsigned int next_posix_timer_id; - struct list_head posix_timers; + struct hlist_head posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; -- cgit v1.2.3 From a2b80ce87a87fc18c594e74d13031d5e347b69cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2024 18:42:33 +0200 Subject: signal: Remove task argument from dequeue_signal() The task pointer which is handed to dequeue_signal() is always current. The argument along with the first comment about signalfd in that function is confusing at best. Remove it and use current internally. Update the stale comment for dequeue_signal() while at it. Signed-off-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Reviewed-by: Oleg Nesterov Acked-by: Peter Zijlstra (Intel) --- include/linux/sched/signal.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 622fdef78e14..c8ed09ac29ac 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -276,8 +276,7 @@ static inline void signal_set_stop_flags(struct signal_struct *sig, extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *task, sigset_t *mask, - kernel_siginfo_t *info, enum pid_type *type); +extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { @@ -287,7 +286,7 @@ static inline int kernel_dequeue_signal(void) int ret; spin_lock_irq(&task->sighand->siglock); - ret = dequeue_signal(task, &task->blocked, &__info, &__type); + ret = dequeue_signal(&task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; -- cgit v1.2.3 From 5d99e198be279045e6ecefe220f5c52f8ce9bfd5 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:52 +0800 Subject: bpf, lsm: Add check for BPF LSM return value A bpf prog returning a positive number attached to file_alloc_security hook makes kernel panic. This happens because file system can not filter out the positive number returned by the LSM prog using IS_ERR, and misinterprets this positive number as a file pointer. Given that hook file_alloc_security never returned positive number before the introduction of BPF LSM, and other BPF LSM hooks may encounter similar issues, this patch adds LSM return value check in verifier, to ensure no unexpected value is returned. Fixes: 520b7aa00d8c ("bpf: lsm: Initialize the BPF LSM hooks") Reported-by: Xin Liu Signed-off-by: Xu Kuohai Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240719110059.797546-3-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 1 + include/linux/bpf_lsm.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c54864316ee..5739cc9986f8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -927,6 +927,7 @@ struct bpf_insn_access_aux { }; }; struct bpf_verifier_log *log; /* for verbose logs */ + bool is_retval; /* is accessing function return value ? */ }; static inline void diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 1de7ece5d36d..aefcd6564251 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -9,6 +9,7 @@ #include #include +#include #include #ifdef CONFIG_BPF_LSM @@ -45,6 +46,8 @@ void bpf_inode_storage_free(struct inode *inode); void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, bpf_func_t *bpf_func); +int bpf_lsm_get_retval_range(const struct bpf_prog *prog, + struct bpf_retval_range *range); #else /* !CONFIG_BPF_LSM */ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) @@ -78,6 +81,11 @@ static inline void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog, { } +static inline int bpf_lsm_get_retval_range(const struct bpf_prog *prog, + struct bpf_retval_range *range) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ -- cgit v1.2.3 From 28ead3eaabc16ecc907cfb71876da028080f6356 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Fri, 19 Jul 2024 19:00:53 +0800 Subject: bpf: Prevent tail call between progs attached to different hooks bpf progs can be attached to kernel functions, and the attached functions can take different parameters or return different return values. If prog attached to one kernel function tail calls prog attached to another kernel function, the ctx access or return value verification could be bypassed. For example, if prog1 is attached to func1 which takes only 1 parameter and prog2 is attached to func2 which takes two parameters. Since verifier assumes the bpf ctx passed to prog2 is constructed based on func2's prototype, verifier allows prog2 to access the second parameter from the bpf ctx passed to it. The problem is that verifier does not prevent prog1 from passing its bpf ctx to prog2 via tail call. In this case, the bpf ctx passed to prog2 is constructed from func1 instead of func2, that is, the assumption for ctx access verification is bypassed. Another example, if BPF LSM prog1 is attached to hook file_alloc_security, and BPF LSM prog2 is attached to hook bpf_lsm_audit_rule_known. Verifier knows the return value rules for these two hooks, e.g. it is legal for bpf_lsm_audit_rule_known to return positive number 1, and it is illegal for file_alloc_security to return positive number. So verifier allows prog2 to return positive number 1, but does not allow prog1 to return positive number. The problem is that verifier does not prevent prog1 from calling prog2 via tail call. In this case, prog2's return value 1 will be used as the return value for prog1's hook file_alloc_security. That is, the return value rule is bypassed. This patch adds restriction for tail call to prevent such bypasses. Signed-off-by: Xu Kuohai Link: https://lore.kernel.org/r/20240719110059.797546-4-xukuohai@huaweicloud.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5739cc9986f8..f16d0753f518 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -294,6 +294,7 @@ struct bpf_map { * same prog type, JITed flag and xdp_has_frags flag. */ struct { + const struct btf_type *attach_func_proto; spinlock_t lock; enum bpf_prog_type type; bool jited; -- cgit v1.2.3 From 2aff9d20d50ac45dd13a013ef5231f4fb8912356 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 10 Jul 2024 14:32:25 -0700 Subject: lsm: infrastructure management of the sock security Move management of the sock->sk_security blob out of the individual security modules and into the security infrastructure. Instead of allocating the blobs from within the modules the modules tell the infrastructure how much space is required, and the space is allocated there. Acked-by: Paul Moore Reviewed-by: Kees Cook Reviewed-by: John Johansen Acked-by: Stephen Smalley Signed-off-by: Casey Schaufler [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index a2ade0ffe9e7..efd4a0655159 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -73,6 +73,7 @@ struct lsm_blob_sizes { int lbs_cred; int lbs_file; int lbs_inode; + int lbs_sock; int lbs_superblock; int lbs_ipc; int lbs_msg_msg; -- cgit v1.2.3 From 5f8d28f6d7d568dbbc8c5bce94894474c07afd4f Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 10 Jul 2024 14:32:26 -0700 Subject: lsm: infrastructure management of the key security blob Move management of the key->security blob out of the individual security modules and into the security infrastructure. Instead of allocating the blobs from within the modules the modules tell the infrastructure how much space is required, and the space is allocated there. There are no existing modules that require a key_free hook, so the call to it and the definition for it have been removed. Signed-off-by: Casey Schaufler Reviewed-by: John Johansen [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 - include/linux/lsm_hooks.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 855db460e08b..8cc60644f3bd 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -403,7 +403,6 @@ LSM_HOOK(int, 0, xfrm_decode_session, struct sk_buff *skb, u32 *secid, #ifdef CONFIG_KEYS LSM_HOOK(int, 0, key_alloc, struct key *key, const struct cred *cred, unsigned long flags) -LSM_HOOK(void, LSM_RET_VOID, key_free, struct key *key) LSM_HOOK(int, 0, key_permission, key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) LSM_HOOK(int, 0, key_getsecurity, struct key *key, char **buffer) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index efd4a0655159..7233bc0737be 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -76,6 +76,7 @@ struct lsm_blob_sizes { int lbs_sock; int lbs_superblock; int lbs_ipc; + int lbs_key; int lbs_msg_msg; int lbs_task; int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ -- cgit v1.2.3 From a39c0f77dbbe083f3eec6c3b32d90f168f7575eb Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 10 Jul 2024 14:32:28 -0700 Subject: lsm: infrastructure management of the dev_tun blob Move management of the dev_tun security blob out of the individual security modules and into the LSM infrastructure. The security modules tell the infrastructure how much space they require at initialization. There are no longer any modules that require the dev_tun_free hook. The hook definition has been removed. Signed-off-by: Casey Schaufler Reviewed-by: John Johansen [PM: subject tweak, selinux style fixes] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 +-- include/linux/lsm_hooks.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 8cc60644f3bd..22c475f530ae 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -353,8 +353,7 @@ LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_inc, void) LSM_HOOK(void, LSM_RET_VOID, secmark_refcount_dec, void) LSM_HOOK(void, LSM_RET_VOID, req_classify_flow, const struct request_sock *req, struct flowi_common *flic) -LSM_HOOK(int, 0, tun_dev_alloc_security, void **security) -LSM_HOOK(void, LSM_RET_VOID, tun_dev_free_security, void *security) +LSM_HOOK(int, 0, tun_dev_alloc_security, void *security) LSM_HOOK(int, 0, tun_dev_create, void) LSM_HOOK(int, 0, tun_dev_attach_queue, void *security) LSM_HOOK(int, 0, tun_dev_attach, struct sock *sk, void *security) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 7233bc0737be..0ff14ff128c8 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -80,6 +80,7 @@ struct lsm_blob_sizes { int lbs_msg_msg; int lbs_task; int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ + int lbs_tun_dev; }; /** -- cgit v1.2.3 From 66de33a0bbb59ef3909d2c65dbbb7fc503d573bd Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 10 Jul 2024 14:32:29 -0700 Subject: lsm: infrastructure management of the infiniband blob Move management of the infiniband security blob out of the individual security modules and into the LSM infrastructure. The security modules tell the infrastructure how much space they require at initialization. There are no longer any modules that require the ib_free() hook. The hook definition has been removed. Signed-off-by: Casey Schaufler Reviewed-by: John Johansen [PM: subject tweak, selinux style fixes] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 +-- include/linux/lsm_hooks.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 22c475f530ae..4fcbc5b3f58c 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -373,8 +373,7 @@ LSM_HOOK(int, 0, mptcp_add_subflow, struct sock *sk, struct sock *ssk) LSM_HOOK(int, 0, ib_pkey_access, void *sec, u64 subnet_prefix, u16 pkey) LSM_HOOK(int, 0, ib_endport_manage_subnet, void *sec, const char *dev_name, u8 port_num) -LSM_HOOK(int, 0, ib_alloc_security, void **sec) -LSM_HOOK(void, LSM_RET_VOID, ib_free_security, void *sec) +LSM_HOOK(int, 0, ib_alloc_security, void *sec) #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0ff14ff128c8..b6fc6ac88723 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -72,6 +72,7 @@ struct security_hook_list { struct lsm_blob_sizes { int lbs_cred; int lbs_file; + int lbs_ib; int lbs_inode; int lbs_sock; int lbs_superblock; -- cgit v1.2.3 From 61a1dcdceb44d79e5ab511295791b88ea178c045 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Wed, 10 Jul 2024 14:32:30 -0700 Subject: lsm: infrastructure management of the perf_event security blob Move management of the perf_event->security blob out of the individual security modules and into the security infrastructure. Instead of allocating the blobs from within the modules the modules tell the infrastructure how much space is required, and the space is allocated there. There are no longer any modules that require the perf_event_free() hook. The hook definition has been removed. Signed-off-by: Casey Schaufler Reviewed-by: John Johansen [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 - include/linux/lsm_hooks.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 4fcbc5b3f58c..12c81be78eb9 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -439,7 +439,6 @@ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) #ifdef CONFIG_PERF_EVENTS LSM_HOOK(int, 0, perf_event_open, struct perf_event_attr *attr, int type) LSM_HOOK(int, 0, perf_event_alloc, struct perf_event *event) -LSM_HOOK(void, LSM_RET_VOID, perf_event_free, struct perf_event *event) LSM_HOOK(int, 0, perf_event_read, struct perf_event *event) LSM_HOOK(int, 0, perf_event_write, struct perf_event *event) #endif /* CONFIG_PERF_EVENTS */ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index b6fc6ac88723..f1ca8082075a 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -79,6 +79,7 @@ struct lsm_blob_sizes { int lbs_ipc; int lbs_key; int lbs_msg_msg; + int lbs_perf_event; int lbs_task; int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ int lbs_tun_dev; -- cgit v1.2.3 From 92de36080c93296ef9005690705cba260b9bd68a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 23 Jul 2024 08:34:39 -0700 Subject: bpf: Fail verification for sign-extension of packet data/data_end/data_meta syzbot reported a kernel crash due to commit 1f1e864b6555 ("bpf: Handle sign-extenstin ctx member accesses"). The reason is due to sign-extension of 32-bit load for packet data/data_end/data_meta uapi field. The original code looks like: r2 = *(s32 *)(r1 + 76) /* load __sk_buff->data */ r3 = *(u32 *)(r1 + 80) /* load __sk_buff->data_end */ r0 = r2 r0 += 8 if r3 > r0 goto +1 ... Note that __sk_buff->data load has 32-bit sign extension. After verification and convert_ctx_accesses(), the final asm code looks like: r2 = *(u64 *)(r1 +208) r2 = (s32)r2 r3 = *(u64 *)(r1 +80) r0 = r2 r0 += 8 if r3 > r0 goto pc+1 ... Note that 'r2 = (s32)r2' may make the kernel __sk_buff->data address invalid which may cause runtime failure. Currently, in C code, typically we have void *data = (void *)(long)skb->data; void *data_end = (void *)(long)skb->data_end; ... and it will generate r2 = *(u64 *)(r1 +208) r3 = *(u64 *)(r1 +80) r0 = r2 r0 += 8 if r3 > r0 goto pc+1 If we allow sign-extension, void *data = (void *)(long)(int)skb->data; void *data_end = (void *)(long)skb->data_end; ... the generated code looks like r2 = *(u64 *)(r1 +208) r2 <<= 32 r2 s>>= 32 r3 = *(u64 *)(r1 +80) r0 = r2 r0 += 8 if r3 > r0 goto pc+1 and this will cause verification failure since "r2 <<= 32" is not allowed as "r2" is a packet pointer. To fix this issue for case r2 = *(s32 *)(r1 + 76) /* load __sk_buff->data */ this patch added additional checking in is_valid_access() callback function for packet data/data_end/data_meta access. If those accesses are with sign-extenstion, the verification will fail. [1] https://lore.kernel.org/bpf/000000000000c90eee061d236d37@google.com/ Reported-by: syzbot+ad9ec60c8eaf69e6f99c@syzkaller.appspotmail.com Fixes: 1f1e864b6555 ("bpf: Handle sign-extenstin ctx member accesses") Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20240723153439.2429035-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f16d0753f518..f560ea0c2b36 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -920,6 +920,7 @@ static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT); */ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; + bool is_ldsx; union { int ctx_field_size; struct { -- cgit v1.2.3 From 5b5f51bff1b66cedb62b5ba74a1878341204e057 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Mon, 22 Jul 2024 16:38:36 -0700 Subject: bpf: no_caller_saved_registers attribute for helper calls GCC and LLVM define a no_caller_saved_registers function attribute. This attribute means that function scratches only some of the caller saved registers defined by ABI. For BPF the set of such registers could be defined as follows: - R0 is scratched only if function is non-void; - R1-R5 are scratched only if corresponding parameter type is defined in the function prototype. This commit introduces flag bpf_func_prot->allow_nocsr. If this flag is set for some helper function, verifier assumes that it follows no_caller_saved_registers calling convention. The contract between kernel and clang allows to simultaneously use such functions and maintain backwards compatibility with old kernels that don't understand no_caller_saved_registers calls (nocsr for short): - clang generates a simple pattern for nocsr calls, e.g.: r1 = 1; r2 = 2; *(u64 *)(r10 - 8) = r1; *(u64 *)(r10 - 16) = r2; call %[to_be_inlined] r2 = *(u64 *)(r10 - 16); r1 = *(u64 *)(r10 - 8); r0 = r1; r0 += r2; exit; - kernel removes unnecessary spills and fills, if called function is inlined by verifier or current JIT (with assumption that patch inserted by verifier or JIT honors nocsr contract, e.g. does not scratch r3-r5 for the example above), e.g. the code above would be transformed to: r1 = 1; r2 = 2; call %[to_be_inlined] r0 = r1; r0 += r2; exit; Technically, the transformation is split into the following phases: - function mark_nocsr_patterns(), called from bpf_check() searches and marks potential patterns in instruction auxiliary data; - upon stack read or write access, function check_nocsr_stack_contract() is used to verify if stack offsets, presumably reserved for nocsr patterns, are used only from those patterns; - function remove_nocsr_spills_fills(), called from bpf_check(), applies the rewrite for valid patterns. See comment in mark_nocsr_pattern_for_call() for more details. Suggested-by: Alexei Starovoitov Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240722233844.1406874-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 6 ++++++ include/linux/bpf_verifier.h | 14 ++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f560ea0c2b36..b9425e410bcb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -808,6 +808,12 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; bool might_sleep; + /* set to true if helper follows contract for gcc/llvm + * attribute no_caller_saved_registers: + * - void functions do not scratch r0 + * - functions taking N arguments scratch only registers r1-rN + */ + bool allow_nocsr; enum bpf_return_type ret_type; union { struct { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 731a0a4ac13c..5cea15c81b8a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -576,6 +576,14 @@ struct bpf_insn_aux_data { bool is_iter_next; /* bpf_iter__next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ + /* true if STX or LDX instruction is a part of a spill/fill + * pattern for a no_caller_saved_registers call. + */ + u8 nocsr_pattern:1; + /* for CALL instructions, a number of spill/fill pairs in the + * no_caller_saved_registers pattern. + */ + u8 nocsr_spills_num:3; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ @@ -645,6 +653,10 @@ struct bpf_subprog_info { u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; + /* offsets in range [stack_depth .. nocsr_stack_off) + * are used for no_caller_saved_registers spills and fills. + */ + s16 nocsr_stack_off; bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; @@ -652,6 +664,8 @@ struct bpf_subprog_info { bool is_async_cb: 1; bool is_exception_cb: 1; bool args_cached: 1; + /* true if nocsr stack region is used by functions that can't be inlined */ + bool keep_nocsr_stack: 1; u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; -- cgit v1.2.3 From 7ebd8c5acad5f8ca41f37b36dc62570e1fa13d8b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 18 Jul 2024 16:59:06 +0900 Subject: ata: libata: Use QUIRK instead of HORKAGE According to Wiktionary, the verb "hork" is computing slang defined as "To foul up; to be occupied with difficulty, tangle, or unpleasantness; to be broken" (https://en.wiktionary.org/wiki/hork#Verb). libata uses this with the term "horkage" to refer to broken device features. Given that this term is not widely used and its meaning unknown to many, rename it to the more commonly used term "quirk", similar to many other places in the kernel. The renaming done is: 1) Rename all ATA_HORKAGE_XXX flags to ATA_QUIRK_XXX 2) Rename struct ata_device horkage field to quirks 3) Rename struct ata_blacklist_entry to struct ata_dev_quirks_entry. The array of these structures defining quirks for known devices is renamed __ata_dev_quirks. 4) The functions ata_dev_blacklisted() and ata_force_horkage() are renamed to ata_dev_quirks() and ata_force_quirks() respectively. 5) All the force_horkage_xxx() macros are renamed to force_quirk_xxx() And while at it, make sure that the type "unsigned int" is used consistantly for quirk flags variables and data structure fields. Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Igor Pylypiv --- include/linux/libata.h | 71 +++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 17394098bee9..05dd7038ab30 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -362,40 +362,41 @@ enum { */ ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 8, - /* Horkage types. May be set by libata or controller on drives - (some horkage may be drive/controller pair dependent */ - - ATA_HORKAGE_DIAGNOSTIC = (1 << 0), /* Failed boot diag */ - ATA_HORKAGE_NODMA = (1 << 1), /* DMA problems */ - ATA_HORKAGE_NONCQ = (1 << 2), /* Don't use NCQ */ - ATA_HORKAGE_MAX_SEC_128 = (1 << 3), /* Limit max sects to 128 */ - ATA_HORKAGE_BROKEN_HPA = (1 << 4), /* Broken HPA */ - ATA_HORKAGE_DISABLE = (1 << 5), /* Disable it */ - ATA_HORKAGE_HPA_SIZE = (1 << 6), /* native size off by one */ - ATA_HORKAGE_IVB = (1 << 8), /* cbl det validity bit bugs */ - ATA_HORKAGE_STUCK_ERR = (1 << 9), /* stuck ERR on next PACKET */ - ATA_HORKAGE_BRIDGE_OK = (1 << 10), /* no bridge limits */ - ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands - not multiple of 16 bytes */ - ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */ - ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */ - ATA_HORKAGE_NOSETXFER = (1 << 14), /* skip SETXFER, SATA only */ - ATA_HORKAGE_BROKEN_FPDMA_AA = (1 << 15), /* skip AA */ - ATA_HORKAGE_DUMP_ID = (1 << 16), /* dump IDENTIFY data */ - ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */ - ATA_HORKAGE_ATAPI_DMADIR = (1 << 18), /* device requires dmadir */ - ATA_HORKAGE_NO_NCQ_TRIM = (1 << 19), /* don't use queued TRIM */ - ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */ - ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ - ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ - ATA_HORKAGE_NO_DMA_LOG = (1 << 23), /* don't use DMA for log read */ - ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ - ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ - ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ - ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ - ATA_HORKAGE_NO_ID_DEV_LOG = (1 << 28), /* Identify device log missing */ - ATA_HORKAGE_NO_LOG_DIR = (1 << 29), /* Do not read log directory */ - ATA_HORKAGE_NO_FUA = (1 << 30), /* Do not use FUA */ + /* + * Quirk flags: may be set by libata or controller drivers on drives. + * Some quirks may be drive/controller pair dependent. + */ + ATA_QUIRK_DIAGNOSTIC = (1 << 0), /* Failed boot diag */ + ATA_QUIRK_NODMA = (1 << 1), /* DMA problems */ + ATA_QUIRK_NONCQ = (1 << 2), /* Do not use NCQ */ + ATA_QUIRK_MAX_SEC_128 = (1 << 3), /* Limit max sects to 128 */ + ATA_QUIRK_BROKEN_HPA = (1 << 4), /* Broken HPA */ + ATA_QUIRK_DISABLE = (1 << 5), /* Disable it */ + ATA_QUIRK_HPA_SIZE = (1 << 6), /* Native size off by one */ + ATA_QUIRK_IVB = (1 << 8), /* CBL det validity bit bugs */ + ATA_QUIRK_STUCK_ERR = (1 << 9), /* Stuck ERR on next PACKET */ + ATA_QUIRK_BRIDGE_OK = (1 << 10), /* No bridge limits */ + ATA_QUIRK_ATAPI_MOD16_DMA = (1 << 11), /* Use ATAPI DMA for commands */ + /* not multiple of 16 bytes */ + ATA_QUIRK_FIRMWARE_WARN = (1 << 12), /* Firmware update warning */ + ATA_QUIRK_1_5_GBPS = (1 << 13), /* Force 1.5 Gbps */ + ATA_QUIRK_NOSETXFER = (1 << 14), /* Skip SETXFER, SATA only */ + ATA_QUIRK_BROKEN_FPDMA_AA = (1 << 15), /* Skip AA */ + ATA_QUIRK_DUMP_ID = (1 << 16), /* Dump IDENTIFY data */ + ATA_QUIRK_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */ + ATA_QUIRK_ATAPI_DMADIR = (1 << 18), /* Device requires dmadir */ + ATA_QUIRK_NO_NCQ_TRIM = (1 << 19), /* Do not use queued TRIM */ + ATA_QUIRK_NOLPM = (1 << 20), /* Do not use LPM */ + ATA_QUIRK_WD_BROKEN_LPM = (1 << 21), /* Some WDs have broken LPM */ + ATA_QUIRK_ZERO_AFTER_TRIM = (1 << 22), /* Guarantees zero after trim */ + ATA_QUIRK_NO_DMA_LOG = (1 << 23), /* Do not use DMA for log read */ + ATA_QUIRK_NOTRIM = (1 << 24), /* Do not use TRIM */ + ATA_QUIRK_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ + ATA_QUIRK_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ + ATA_QUIRK_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ + ATA_QUIRK_NO_ID_DEV_LOG = (1 << 28), /* Identify device log missing */ + ATA_QUIRK_NO_LOG_DIR = (1 << 29), /* Do not read log directory */ + ATA_QUIRK_NO_FUA = (1 << 30), /* Do not use FUA */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ @@ -663,7 +664,7 @@ struct ata_cpr_log { struct ata_device { struct ata_link *link; unsigned int devno; /* 0 or 1 */ - unsigned int horkage; /* List of broken features */ + unsigned int quirks; /* List of broken features */ unsigned long flags; /* ATA_DFLAG_xxx */ struct scsi_device *sdev; /* attached SCSI device */ void *private_data; -- cgit v1.2.3 From 58157d607aecb4e05ab793408038b014c84e466f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 18 Jul 2024 16:54:03 +0900 Subject: ata: libata: Print quirks applied to devices Introduce the function ata_dev_print_quirks() to print the quirk flags that will be applied to a scanned device. This new function is called from ata_dev_quirks() when a match on a device model or device model and revision is found for a device in the __ata_dev_quirks array. To implement this function, the ATA_QUIRK_ flags are redefined using the new enum ata_quirk which defines the bit shift for each quirk flag. The array of strings ata_quirk_names is used to define the name of each flag, which are printed by ata_dev_print_quirks(). Example output for a device listed in the __ata_dev_quirks array and which has the ATA_QUIRK_DISABLE flag applied: [10193.461270] ata1: SATA link up 6.0 Gbps (SStatus 133 SControl 300) [10193.469190] ata1.00: Model 'ASMT109x- Config', rev '2143 5', applying quirks: disable [10193.469195] ata1.00: unsupported device, disabling [10193.481564] ata1.00: disable device enum ata_quirk also defines the __ATA_QUIRK_MAX value as one plus the last quirk flag defined. This value is used in ata_dev_quirks() to add a build time check that all quirk flags fit within the unsigned int (32-bits) quirks field of struct ata_device. Signed-off-by: Damien Le Moal Reviewed-by: Igor Pylypiv Reviewed-by: Niklas Cassel --- include/linux/libata.h | 106 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 72 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 05dd7038ab30..d598ef690e50 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -55,6 +55,46 @@ /* defines only for the constants which don't work well as enums */ #define ATA_TAG_POISON 0xfafbfcfdU +/* + * Quirk flags bits. + * ata_device->quirks is an unsigned int, so __ATA_QUIRK_MAX must not exceed 32. + */ +enum ata_quirks { + __ATA_QUIRK_DIAGNOSTIC, /* Failed boot diag */ + __ATA_QUIRK_NODMA, /* DMA problems */ + __ATA_QUIRK_NONCQ, /* Don't use NCQ */ + __ATA_QUIRK_MAX_SEC_128, /* Limit max sects to 128 */ + __ATA_QUIRK_BROKEN_HPA, /* Broken HPA */ + __ATA_QUIRK_DISABLE, /* Disable it */ + __ATA_QUIRK_HPA_SIZE, /* Native size off by one */ + __ATA_QUIRK_IVB, /* cbl det validity bit bugs */ + __ATA_QUIRK_STUCK_ERR, /* Stuck ERR on next PACKET */ + __ATA_QUIRK_BRIDGE_OK, /* No bridge limits */ + __ATA_QUIRK_ATAPI_MOD16_DMA, /* Use ATAPI DMA for commands that */ + /* are not a multiple of 16 bytes */ + __ATA_QUIRK_FIRMWARE_WARN, /* Firmware update warning */ + __ATA_QUIRK_1_5_GBPS, /* Force 1.5 Gbps */ + __ATA_QUIRK_NOSETXFER, /* Skip SETXFER, SATA only */ + __ATA_QUIRK_BROKEN_FPDMA_AA, /* Skip AA */ + __ATA_QUIRK_DUMP_ID, /* Dump IDENTIFY data */ + __ATA_QUIRK_MAX_SEC_LBA48, /* Set max sects to 65535 */ + __ATA_QUIRK_ATAPI_DMADIR, /* Device requires dmadir */ + __ATA_QUIRK_NO_NCQ_TRIM, /* Do not use queued TRIM */ + __ATA_QUIRK_NOLPM, /* Do not use LPM */ + __ATA_QUIRK_WD_BROKEN_LPM, /* Some WDs have broken LPM */ + __ATA_QUIRK_ZERO_AFTER_TRIM, /* Guarantees zero after trim */ + __ATA_QUIRK_NO_DMA_LOG, /* Do not use DMA for log read */ + __ATA_QUIRK_NOTRIM, /* Do not use TRIM */ + __ATA_QUIRK_MAX_SEC_1024, /* Limit max sects to 1024 */ + __ATA_QUIRK_MAX_TRIM_128M, /* Limit max trim size to 128M */ + __ATA_QUIRK_NO_NCQ_ON_ATI, /* Disable NCQ on ATI chipset */ + __ATA_QUIRK_NO_ID_DEV_LOG, /* Identify device log missing */ + __ATA_QUIRK_NO_LOG_DIR, /* Do not read log directory */ + __ATA_QUIRK_NO_FUA, /* Do not use FUA */ + + __ATA_QUIRK_MAX, +}; + enum { /* various global constants */ LIBATA_MAX_PRD = ATA_MAX_PRD / 2, @@ -366,40 +406,38 @@ enum { * Quirk flags: may be set by libata or controller drivers on drives. * Some quirks may be drive/controller pair dependent. */ - ATA_QUIRK_DIAGNOSTIC = (1 << 0), /* Failed boot diag */ - ATA_QUIRK_NODMA = (1 << 1), /* DMA problems */ - ATA_QUIRK_NONCQ = (1 << 2), /* Do not use NCQ */ - ATA_QUIRK_MAX_SEC_128 = (1 << 3), /* Limit max sects to 128 */ - ATA_QUIRK_BROKEN_HPA = (1 << 4), /* Broken HPA */ - ATA_QUIRK_DISABLE = (1 << 5), /* Disable it */ - ATA_QUIRK_HPA_SIZE = (1 << 6), /* Native size off by one */ - ATA_QUIRK_IVB = (1 << 8), /* CBL det validity bit bugs */ - ATA_QUIRK_STUCK_ERR = (1 << 9), /* Stuck ERR on next PACKET */ - ATA_QUIRK_BRIDGE_OK = (1 << 10), /* No bridge limits */ - ATA_QUIRK_ATAPI_MOD16_DMA = (1 << 11), /* Use ATAPI DMA for commands */ - /* not multiple of 16 bytes */ - ATA_QUIRK_FIRMWARE_WARN = (1 << 12), /* Firmware update warning */ - ATA_QUIRK_1_5_GBPS = (1 << 13), /* Force 1.5 Gbps */ - ATA_QUIRK_NOSETXFER = (1 << 14), /* Skip SETXFER, SATA only */ - ATA_QUIRK_BROKEN_FPDMA_AA = (1 << 15), /* Skip AA */ - ATA_QUIRK_DUMP_ID = (1 << 16), /* Dump IDENTIFY data */ - ATA_QUIRK_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */ - ATA_QUIRK_ATAPI_DMADIR = (1 << 18), /* Device requires dmadir */ - ATA_QUIRK_NO_NCQ_TRIM = (1 << 19), /* Do not use queued TRIM */ - ATA_QUIRK_NOLPM = (1 << 20), /* Do not use LPM */ - ATA_QUIRK_WD_BROKEN_LPM = (1 << 21), /* Some WDs have broken LPM */ - ATA_QUIRK_ZERO_AFTER_TRIM = (1 << 22), /* Guarantees zero after trim */ - ATA_QUIRK_NO_DMA_LOG = (1 << 23), /* Do not use DMA for log read */ - ATA_QUIRK_NOTRIM = (1 << 24), /* Do not use TRIM */ - ATA_QUIRK_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ - ATA_QUIRK_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ - ATA_QUIRK_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ - ATA_QUIRK_NO_ID_DEV_LOG = (1 << 28), /* Identify device log missing */ - ATA_QUIRK_NO_LOG_DIR = (1 << 29), /* Do not read log directory */ - ATA_QUIRK_NO_FUA = (1 << 30), /* Do not use FUA */ - - /* DMA mask for user DMA control: User visible values; DO NOT - renumber */ + ATA_QUIRK_DIAGNOSTIC = (1U << __ATA_QUIRK_DIAGNOSTIC), + ATA_QUIRK_NODMA = (1U << __ATA_QUIRK_NODMA), + ATA_QUIRK_NONCQ = (1U << __ATA_QUIRK_NONCQ), + ATA_QUIRK_MAX_SEC_128 = (1U << __ATA_QUIRK_MAX_SEC_128), + ATA_QUIRK_BROKEN_HPA = (1U << __ATA_QUIRK_BROKEN_HPA), + ATA_QUIRK_DISABLE = (1U << __ATA_QUIRK_DISABLE), + ATA_QUIRK_HPA_SIZE = (1U << __ATA_QUIRK_HPA_SIZE), + ATA_QUIRK_IVB = (1U << __ATA_QUIRK_IVB), + ATA_QUIRK_STUCK_ERR = (1U << __ATA_QUIRK_STUCK_ERR), + ATA_QUIRK_BRIDGE_OK = (1U << __ATA_QUIRK_BRIDGE_OK), + ATA_QUIRK_ATAPI_MOD16_DMA = (1U << __ATA_QUIRK_ATAPI_MOD16_DMA), + ATA_QUIRK_FIRMWARE_WARN = (1U << __ATA_QUIRK_FIRMWARE_WARN), + ATA_QUIRK_1_5_GBPS = (1U << __ATA_QUIRK_1_5_GBPS), + ATA_QUIRK_NOSETXFER = (1U << __ATA_QUIRK_NOSETXFER), + ATA_QUIRK_BROKEN_FPDMA_AA = (1U << __ATA_QUIRK_BROKEN_FPDMA_AA), + ATA_QUIRK_DUMP_ID = (1U << __ATA_QUIRK_DUMP_ID), + ATA_QUIRK_MAX_SEC_LBA48 = (1U << __ATA_QUIRK_MAX_SEC_LBA48), + ATA_QUIRK_ATAPI_DMADIR = (1U << __ATA_QUIRK_ATAPI_DMADIR), + ATA_QUIRK_NO_NCQ_TRIM = (1U << __ATA_QUIRK_NO_NCQ_TRIM), + ATA_QUIRK_NOLPM = (1U << __ATA_QUIRK_NOLPM), + ATA_QUIRK_WD_BROKEN_LPM = (1U << __ATA_QUIRK_WD_BROKEN_LPM), + ATA_QUIRK_ZERO_AFTER_TRIM = (1U << __ATA_QUIRK_ZERO_AFTER_TRIM), + ATA_QUIRK_NO_DMA_LOG = (1U << __ATA_QUIRK_NO_DMA_LOG), + ATA_QUIRK_NOTRIM = (1U << __ATA_QUIRK_NOTRIM), + ATA_QUIRK_MAX_SEC_1024 = (1U << __ATA_QUIRK_MAX_SEC_1024), + ATA_QUIRK_MAX_TRIM_128M = (1U << __ATA_QUIRK_MAX_TRIM_128M), + ATA_QUIRK_NO_NCQ_ON_ATI = (1U << __ATA_QUIRK_NO_NCQ_ON_ATI), + ATA_QUIRK_NO_ID_DEV_LOG = (1U << __ATA_QUIRK_NO_ID_DEV_LOG), + ATA_QUIRK_NO_LOG_DIR = (1U << __ATA_QUIRK_NO_LOG_DIR), + ATA_QUIRK_NO_FUA = (1U << __ATA_QUIRK_NO_FUA), + + /* User visible DMA mask for DMA control. DO NOT renumber. */ ATA_DMA_MASK_ATA = (1 << 0), /* DMA on ATA Disk */ ATA_DMA_MASK_ATAPI = (1 << 1), /* DMA on ATAPI */ ATA_DMA_MASK_CFA = (1 << 2), /* DMA on CF Card */ -- cgit v1.2.3 From 1cbe8143fd2f588031a5f157d15ae15ce12215e2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 30 Jul 2024 13:37:33 +0800 Subject: bpf: kprobe: Remove unused declaring of bpf_kprobe_override After the commit 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one"), "bpf_kprobe_override" is not used anywhere anymore, and we can remove it now. Fixes: 66665ad2f102 ("tracing/kprobe: bpf: Compare instruction pointer with original one") Signed-off-by: Menglong Dong Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240730053733.885785-1-dongml2@chinatelecom.cn --- include/linux/trace_events.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 9df3e2973626..9435185c10ef 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -880,7 +880,6 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); -DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); -- cgit v1.2.3 From c149c4a48b19afbf0c383614e57b452d39b154de Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 13 Jul 2024 08:59:16 +0000 Subject: cgroup/cpuset: Remove cpuset_slab_spread_rotor Since the SLAB implementation was removed in v6.8, so the cpuset_slab_spread_rotor is no longer used and can be removed. Signed-off-by: Xiu Jianfeng Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 6 ------ include/linux/sched.h | 1 - 2 files changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index de4cf0ee96f7..2a6981eeebf8 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -113,7 +113,6 @@ extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); extern int cpuset_mem_spread_node(void); -extern int cpuset_slab_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { @@ -246,11 +245,6 @@ static inline int cpuset_mem_spread_node(void) return 0; } -static inline int cpuset_slab_spread_node(void) -{ - return 0; -} - static inline int cpuset_do_page_mem_spread(void) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index f8d150343d42..3773c1c8f099 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1243,7 +1243,6 @@ struct task_struct { /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; - int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ -- cgit v1.2.3 From 93c8332c8373fee415bd79f08d5ba4ba7ca5ad15 Mon Sep 17 00:00:00 2001 From: Xavier Date: Thu, 4 Jul 2024 14:24:43 +0800 Subject: Union-Find: add a new module in kernel library This patch implements a union-find data structure in the kernel library, which includes operations for allocating nodes, freeing nodes, finding the root of a node, and merging two nodes. Signed-off-by: Xavier Signed-off-by: Tejun Heo --- include/linux/union_find.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 include/linux/union_find.h (limited to 'include/linux') diff --git a/include/linux/union_find.h b/include/linux/union_find.h new file mode 100644 index 000000000000..cfd49263c138 --- /dev/null +++ b/include/linux/union_find.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_UNION_FIND_H +#define __LINUX_UNION_FIND_H +/** + * union_find.h - union-find data structure implementation + * + * This header provides functions and structures to implement the union-find + * data structure. The union-find data structure is used to manage disjoint + * sets and supports efficient union and find operations. + * + * See Documentation/core-api/union_find.rst for documentation and samples. + */ + +struct uf_node { + struct uf_node *parent; + unsigned int rank; +}; + +/* This macro is used for static initialization of a union-find node. */ +#define UF_INIT_NODE(node) {.parent = &node, .rank = 0} + +/** + * uf_node_init - Initialize a union-find node + * @node: pointer to the union-find node to be initialized + * + * This function sets the parent of the node to itself and + * initializes its rank to 0. + */ +static inline void uf_node_init(struct uf_node *node) +{ + node->parent = node; + node->rank = 0; +} + +/* find the root of a node */ +struct uf_node *uf_find(struct uf_node *node); + +/* Merge two intersecting nodes */ +void uf_union(struct uf_node *node1, struct uf_node *node2); + +#endif /* __LINUX_UNION_FIND_H */ -- cgit v1.2.3 From 6dfbafd8a1d51a52a623d912a2219e9982020854 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 12 Jul 2024 16:08:00 +0200 Subject: soundwire: bus: drop unused driver name field The soundwire driver name field is not currently used by any driver (and even appears to never have been used) so drop it. Signed-off-by: Johan Hovold Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20240712140801.24267-3-johan+linaro@kernel.org Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index 94fc1b57c57b..5e0dd47a0412 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -704,8 +704,6 @@ struct sdw_master_device { container_of(d, struct sdw_master_device, dev) struct sdw_driver { - const char *name; - int (*probe)(struct sdw_slave *sdw, const struct sdw_device_id *id); int (*remove)(struct sdw_slave *sdw); -- cgit v1.2.3 From 990c304930138dcd7a49763417e6e5313b81293e Mon Sep 17 00:00:00 2001 From: Patrick Rohr Date: Mon, 29 Jul 2024 15:00:59 -0700 Subject: Add support for PIO p flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit draft-ietf-6man-pio-pflag is adding a new flag to the Prefix Information Option to signal that the network can allocate a unique IPv6 prefix per client via DHCPv6-PD (see draft-ietf-v6ops-dhcp-pd-per-device). When ra_honor_pio_pflag is enabled, the presence of a P-flag causes SLAAC autoconfiguration to be disabled for that particular PIO. An automated test has been added in Android (r.android.com/3195335) to go along with this change. Cc: Maciej Żenczykowski Cc: Lorenzo Colitti Cc: David Lamparter Cc: Simon Horman Signed-off-by: Patrick Rohr Reviewed-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 383a0ea2ab91..a6e2aadbb91b 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -89,6 +89,7 @@ struct ipv6_devconf { __u8 ioam6_enabled; __u8 ndisc_evict_nocarrier; __u8 ra_honor_pio_life; + __u8 ra_honor_pio_pflag; struct ctl_table_header *sysctl_header; }; -- cgit v1.2.3 From 9aed3b51fd6186582a95abd9fa67782982540749 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 2 Jul 2024 18:49:35 -0700 Subject: rcu: Better define "atomic" for list replacement The kernel-doc headers for list_replace_rcu() and hlist_replace_rcu() claim that the replacement is atomic, which it is, but only for readers. Avoid confusion by making it clear that the atomic nature of these functions applies only to readers, not to concurrent updaters. Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rculist.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 3dc1e58865f7..14dfa6008467 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -191,7 +191,10 @@ static inline void hlist_del_init_rcu(struct hlist_node *n) * @old : the element to be replaced * @new : the new element to insert * - * The @old entry will be replaced with the @new entry atomically. + * The @old entry will be replaced with the @new entry atomically from + * the perspective of concurrent readers. It is the caller's responsibility + * to synchronize with concurrent updaters, if any. + * * Note: @old should not be empty. */ static inline void list_replace_rcu(struct list_head *old, @@ -519,7 +522,9 @@ static inline void hlist_del_rcu(struct hlist_node *n) * @old : the element to be replaced * @new : the new element to insert * - * The @old entry will be replaced with the @new entry atomically. + * The @old entry will be replaced with the @new entry atomically from + * the perspective of concurrent readers. It is the caller's responsibility + * to synchronize with concurrent updaters, if any. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) -- cgit v1.2.3 From ab03125268679e058e1e7b6612f6d12610761769 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 Jul 2024 11:00:34 -0400 Subject: cgroup: Show # of subsystem CSSes in cgroup.stat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cgroup subsystem state (CSS) is an abstraction in the cgroup layer to help manage different structures in various cgroup subsystems by being an embedded element inside a larger structure like cpuset or mem_cgroup. The /proc/cgroups file shows the number of cgroups for each of the subsystems. With cgroup v1, the number of CSSes is the same as the number of cgroups. That is not the case anymore with cgroup v2. The /proc/cgroups file cannot show the actual number of CSSes for the subsystems that are bound to cgroup v2. So if a v2 cgroup subsystem is leaking cgroups (usually memory cgroup), we can't tell by looking at /proc/cgroups which cgroup subsystems may be responsible. As cgroup v2 had deprecated the use of /proc/cgroups, the hierarchical cgroup.stat file is now being extended to show the number of live and dying CSSes associated with all the non-inhibited cgroup subsystems that have been bound to cgroup v2. The number includes CSSes in the current cgroup as well as in all the descendants underneath it. This will help us pinpoint which subsystems are responsible for the increasing number of dying (nr_dying_descendants) cgroups. The CSSes dying counts are stored in the cgroup structure itself instead of inside the CSS as suggested by Johannes. This will allow us to accurately track dying counts of cgroup subsystems that have recently been disabled in a cgroup. It is now possible that a zero subsystem number is coupled with a non-zero dying subsystem number. The cgroup-v2.rst file is updated to discuss this new behavior. With this patch applied, a sample output from root cgroup.stat file was shown below. nr_descendants 56 nr_subsys_cpuset 1 nr_subsys_cpu 43 nr_subsys_io 43 nr_subsys_memory 56 nr_subsys_perf_event 57 nr_subsys_hugetlb 1 nr_subsys_pids 56 nr_subsys_rdma 1 nr_subsys_misc 1 nr_dying_descendants 30 nr_dying_subsys_cpuset 0 nr_dying_subsys_cpu 0 nr_dying_subsys_io 0 nr_dying_subsys_memory 30 nr_dying_subsys_perf_event 0 nr_dying_subsys_hugetlb 0 nr_dying_subsys_pids 0 nr_dying_subsys_rdma 0 nr_dying_subsys_misc 0 Another sample output from system.slice/cgroup.stat was: nr_descendants 34 nr_subsys_cpuset 0 nr_subsys_cpu 32 nr_subsys_io 32 nr_subsys_memory 34 nr_subsys_perf_event 35 nr_subsys_hugetlb 0 nr_subsys_pids 34 nr_subsys_rdma 0 nr_subsys_misc 0 nr_dying_descendants 30 nr_dying_subsys_cpuset 0 nr_dying_subsys_cpu 0 nr_dying_subsys_io 0 nr_dying_subsys_memory 30 nr_dying_subsys_perf_event 0 nr_dying_subsys_hugetlb 0 nr_dying_subsys_pids 0 nr_dying_subsys_rdma 0 nr_dying_subsys_misc 0 Note that 'debug' controller wasn't used to provide this information because the controller is not recommended in productions kernels, also many of them won't enable CONFIG_CGROUP_DEBUG by default. Similar information could be retrieved with debuggers like drgn but that's also not always available (e.g. lockdown) and the additional cost of runtime tracking here is deemed marginal. tj: Added Michal's paragraphs on why this is not added the debug controller to the commit message. Signed-off-by: Waiman Long Acked-by: Johannes Weiner Acked-by: Roman Gushchin Reviewed-by: Kamalesh Babulal Cc: Michal Koutný Link: http://lkml.kernel.org/r/20240715150034.2583772-1-longman@redhat.com Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ae04035b6cbe..eb0f6f349496 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -210,6 +210,14 @@ struct cgroup_subsys_state { * fields of the containing structure. */ struct cgroup_subsys_state *parent; + + /* + * Keep track of total numbers of visible descendant CSSes. + * The total number of dying CSSes is tracked in + * css->cgroup->nr_dying_subsys[ssid]. + * Protected by cgroup_mutex. + */ + int nr_descendants; }; /* @@ -470,6 +478,12 @@ struct cgroup { /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; + /* + * Keep track of total number of dying CSSes at and below this cgroup. + * Protected by cgroup_mutex. + */ + int nr_dying_subsys[CGROUP_SUBSYS_COUNT]; + struct cgroup_root *root; /* -- cgit v1.2.3 From f81489a136ac2c18a7b9bb22cf37c0a6f6d58545 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 12 Jul 2024 13:57:13 -0700 Subject: hwmon: (max6697) Drop platform data support Platform data is not used anywhere in the upstram kernel. Drop support for it to simplify code maintenance. Reviewed-by: Tzung-Bi Shih Signed-off-by: Guenter Roeck --- include/linux/platform_data/max6697.h | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 include/linux/platform_data/max6697.h (limited to 'include/linux') diff --git a/include/linux/platform_data/max6697.h b/include/linux/platform_data/max6697.h deleted file mode 100644 index 6fbb70005541..000000000000 --- a/include/linux/platform_data/max6697.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * max6697.h - * Copyright (c) 2012 Guenter Roeck - */ - -#ifndef MAX6697_H -#define MAX6697_H - -#include - -/* - * For all bit masks: - * bit 0: local temperature - * bit 1..7: remote temperatures - */ -struct max6697_platform_data { - bool smbus_timeout_disable; /* set to disable SMBus timeouts */ - bool extended_range_enable; /* set to enable extended temp range */ - bool beta_compensation; /* set to enable beta compensation */ - u8 alert_mask; /* set bit to 1 to disable alert */ - u8 over_temperature_mask; /* set bit to 1 to disable */ - u8 resistance_cancellation; /* set bit to 0 to disable - * bit mask for MAX6581, - * boolean for other chips - */ - u8 ideality_mask; /* set bit to 0 to disable */ - u8 ideality_value; /* transistor ideality as per - * MAX6581 datasheet - */ -}; - -#endif /* MAX6697_H */ -- cgit v1.2.3 From 298dec19bdeb6e33ac220502504d969272b50cf6 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 31 Jul 2024 00:14:36 -0500 Subject: scx: Allow calling sleepable kfuncs from BPF_PROG_TYPE_SYSCALL We currently only allow calling sleepable scx kfuncs (i.e. scx_bpf_create_dsq()) from BPF_PROG_TYPE_STRUCT_OPS progs. The idea here was that we'd never have to call scx_bpf_create_dsq() outside of a sched_ext struct_ops callback, but that might not actually be true. For example, a scheduler could do something like the following: 1. Open and load (not yet attach) a scheduler skel 2. Synchronously call into a BPF_PROG_TYPE_SYSCALL prog from user space. For example, to initialize an LLC domain, or some other global, read-only state. 3. Attach the skel, which actually enables the scheduler The advantage of doing this is that it can preclude having to do pretty ugly boilerplate like initializing a read-only, statically sized array of u64[]'s which the kernel consumes literally once at init time to then create struct bpf_cpumask objects which are actually queried at runtime. Doing the above is already possible given that we can invoke core BPF kfuncs, such as bpf_cpumask_create(), from BPF_PROG_TYPE_SYSCALL progs. We already allow many scx kfuncs to be called from BPF_PROG_TYPE_SYSCALL progs (e.g. scx_bpf_kick_cpu()). Let's allow the sleepable kfuncs as well. Signed-off-by: David Vernet Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 593d2f4909dd..26e1c33bc844 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -106,16 +106,14 @@ enum scx_ent_dsq_flags { * mechanism. See scx_kf_allow(). */ enum scx_kf_mask { - SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ - /* all non-sleepables may be nested inside SLEEPABLE */ - SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ + SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ - SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */ + SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ /* ops.dequeue (in REST) may be nested inside DISPATCH */ - SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ - SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ - SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ - SCX_KF_REST = 1 << 5, /* other rq-locked operations */ + SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ + SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ + SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ + SCX_KF_REST = 1 << 4, /* other rq-locked operations */ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, -- cgit v1.2.3 From be72a57527fde6c80061c5f9d0e28762eb817b03 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Wed, 24 Jul 2024 10:06:58 +0800 Subject: lsm: Refactor return value of LSM hook vm_enough_memory To be consistent with most LSM hooks, convert the return value of hook vm_enough_memory to 0 or a negative error code. Before: - Hook vm_enough_memory returns 1 if permission is granted, 0 if not. - LSM_RET_DEFAULT(vm_enough_memory_mm) is 1. After: - Hook vm_enough_memory reutrns 0 if permission is granted, negative error code if not. - LSM_RET_DEFAULT(vm_enough_memory_mm) is 0. Signed-off-by: Xu Kuohai Reviewed-by: Casey Schaufler Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 12c81be78eb9..63e2656d1d56 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -48,7 +48,7 @@ LSM_HOOK(int, 0, quota_on, struct dentry *dentry) LSM_HOOK(int, 0, syslog, int type) LSM_HOOK(int, 0, settime, const struct timespec64 *ts, const struct timezone *tz) -LSM_HOOK(int, 1, vm_enough_memory, struct mm_struct *mm, long pages) +LSM_HOOK(int, 0, vm_enough_memory, struct mm_struct *mm, long pages) LSM_HOOK(int, 0, bprm_creds_for_exec, struct linux_binprm *bprm) LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, const struct file *file) LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm) diff --git a/include/linux/security.h b/include/linux/security.h index 1390f1efb4f0..62233fec8ead 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -634,7 +634,7 @@ static inline int security_settime64(const struct timespec64 *ts, static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { - return __vm_enough_memory(mm, pages, cap_vm_enough_memory(mm, pages)); + return __vm_enough_memory(mm, pages, !cap_vm_enough_memory(mm, pages)); } static inline int security_bprm_creds_for_exec(struct linux_binprm *bprm) -- cgit v1.2.3 From c9c0ee5f20c593faf289fa8850c3ed84031dd12a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 29 Jul 2024 03:47:40 -0700 Subject: net: skbuff: Skip early return in skb_unref when debugging This patch modifies the skb_unref function to skip the early return optimization when CONFIG_DEBUG_NET is enabled. The change ensures that the reference count decrement always occurs in debug builds, allowing for more thorough checking of SKB reference counting. Previously, when the SKB's reference count was 1 and CONFIG_DEBUG_NET was not set, the function would return early after a memory barrier (smp_rmb()) without decrementing the reference count. This optimization assumes it's safe to proceed with freeing the SKB without the overhead of an atomic decrement from 1 to 0. With this change: - In non-debug builds (CONFIG_DEBUG_NET not set), behavior remains unchanged, preserving the performance optimization. - In debug builds (CONFIG_DEBUG_NET set), the reference count is always decremented, even when it's 1, allowing for consistent behavior and potentially catching subtle SKB management bugs. This modification enhances debugging capabilities for networking code without impacting performance in production kernels. It helps kernel developers identify and diagnose issues related to SKB management and reference counting in the network stack. Cc: Chris Mason Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240729104741.370327-1-leitao@debian.org Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 29c3ea5b6e93..cf8f6ce06742 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1225,7 +1225,7 @@ static inline bool skb_unref(struct sk_buff *skb) { if (unlikely(!skb)) return false; - if (likely(refcount_read(&skb->users) == 1)) + if (!IS_ENABLED(CONFIG_DEBUG_NET) && likely(refcount_read(&skb->users) == 1)) smp_rmb(); else if (likely(!refcount_dec_and_test(&skb->users))) return false; -- cgit v1.2.3 From f0fcdd2cb0db62605d85f3b97a1b443e7c91f886 Mon Sep 17 00:00:00 2001 From: John Allen Date: Tue, 30 Jul 2024 15:17:30 +0000 Subject: ACPI: PRM: Add PRM handler direct call support Platform Runtime Mechanism (PRM) handlers can be invoked from either the AML interpreter or directly by an OS driver. Implement the latter. [ bp: Massage commit message. ] Signed-off-by: John Allen Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Yazen Ghannam Reviewed-by: Ard Biesheuvel Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20240730151731.15363-2-john.allen@amd.com --- include/linux/prmt.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/prmt.h b/include/linux/prmt.h index 24da8364b919..9c094294403f 100644 --- a/include/linux/prmt.h +++ b/include/linux/prmt.h @@ -2,6 +2,11 @@ #ifdef CONFIG_ACPI_PRMT void init_prmt(void); +int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer); #else static inline void init_prmt(void) { } +static inline int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer) +{ + return -EOPNOTSUPP; +} #endif -- cgit v1.2.3 From 6b08d07cac64c0dcf1dbb4584bdf95d0f789a520 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Wed, 10 Jul 2024 12:06:51 +0200 Subject: leds: trigger: netdev: Add support for tx_err and rx_err notification with LEDs This patch provides support for enabling blinking of LEDs when RX or TX errors are detected. Approach taken in this patch is similar to one for TX or RX data transmission indication (i.e. TRIGGER_NETDEV_TX/RX attribute). One can inspect transmission errors with: ip -s link show eth0 Example LED configuration: cd /sys/devices/platform/amba_pl@0/a001a000.leds/leds/ echo netdev > mode:blue/trigger && \ echo eth0 > mode:blue/device_name && \ echo 1 > mode:blue/tx_err Signed-off-by: Lukasz Majewski Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20240710100651.4059887-1-lukma@denx.de Signed-off-by: Lee Jones --- include/linux/leds.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 6885603f211b..e5968c3ed4ae 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -611,6 +611,8 @@ enum led_trigger_netdev_modes { TRIGGER_NETDEV_FULL_DUPLEX, TRIGGER_NETDEV_TX, TRIGGER_NETDEV_RX, + TRIGGER_NETDEV_TX_ERR, + TRIGGER_NETDEV_RX_ERR, /* Keep last */ __TRIGGER_NETDEV_MAX, -- cgit v1.2.3 From 480a8ad683d7a54e8d38c9c7778fb6b15aac241a Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Mon, 22 Jul 2024 15:10:58 +0300 Subject: mfd: adp5585: Add Analog Devices ADP5585 core support The ADP5585 is a 10/11 input/output port expander with a built in keypad matrix decoder, programmable logic, reset generator, and PWM generator. This driver supports the chip by modelling it as an MFD device, with two child devices for the GPIO and PWM functions. The driver is derived from an initial implementation from NXP, available in commit 8059835bee19 ("MLK-25917-1 mfd: adp5585: add ADI adp5585 core support") in their BSP kernel tree. It has been extensively rewritten. Signed-off-by: Haibo Chen Signed-off-by: Laurent Pinchart Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20240722121100.2855-3-laurent.pinchart@ideasonboard.com Signed-off-by: Lee Jones --- include/linux/mfd/adp5585.h | 126 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 include/linux/mfd/adp5585.h (limited to 'include/linux') diff --git a/include/linux/mfd/adp5585.h b/include/linux/mfd/adp5585.h new file mode 100644 index 000000000000..016033cd68e4 --- /dev/null +++ b/include/linux/mfd/adp5585.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Analog Devices ADP5585 I/O expander, PWM controller and keypad controller + * + * Copyright 2022 NXP + * Copyright 2024 Ideas on Board Oy + */ + +#ifndef __MFD_ADP5585_H_ +#define __MFD_ADP5585_H_ + +#include + +#define ADP5585_ID 0x00 +#define ADP5585_MAN_ID_VALUE 0x20 +#define ADP5585_MAN_ID_MASK GENMASK(7, 4) +#define ADP5585_INT_STATUS 0x01 +#define ADP5585_STATUS 0x02 +#define ADP5585_FIFO_1 0x03 +#define ADP5585_FIFO_2 0x04 +#define ADP5585_FIFO_3 0x05 +#define ADP5585_FIFO_4 0x06 +#define ADP5585_FIFO_5 0x07 +#define ADP5585_FIFO_6 0x08 +#define ADP5585_FIFO_7 0x09 +#define ADP5585_FIFO_8 0x0a +#define ADP5585_FIFO_9 0x0b +#define ADP5585_FIFO_10 0x0c +#define ADP5585_FIFO_11 0x0d +#define ADP5585_FIFO_12 0x0e +#define ADP5585_FIFO_13 0x0f +#define ADP5585_FIFO_14 0x10 +#define ADP5585_FIFO_15 0x11 +#define ADP5585_FIFO_16 0x12 +#define ADP5585_GPI_INT_STAT_A 0x13 +#define ADP5585_GPI_INT_STAT_B 0x14 +#define ADP5585_GPI_STATUS_A 0x15 +#define ADP5585_GPI_STATUS_B 0x16 +#define ADP5585_RPULL_CONFIG_A 0x17 +#define ADP5585_RPULL_CONFIG_B 0x18 +#define ADP5585_RPULL_CONFIG_C 0x19 +#define ADP5585_RPULL_CONFIG_D 0x1a +#define ADP5585_Rx_PULL_CFG_PU_300K 0 +#define ADP5585_Rx_PULL_CFG_PD_300K 1 +#define ADP5585_Rx_PULL_CFG_PU_100K 2 +#define ADP5585_Rx_PULL_CFG_DISABLE 3 +#define ADP5585_Rx_PULL_CFG_MASK 3 +#define ADP5585_GPI_INT_LEVEL_A 0x1b +#define ADP5585_GPI_INT_LEVEL_B 0x1c +#define ADP5585_GPI_EVENT_EN_A 0x1d +#define ADP5585_GPI_EVENT_EN_B 0x1e +#define ADP5585_GPI_INTERRUPT_EN_A 0x1f +#define ADP5585_GPI_INTERRUPT_EN_B 0x20 +#define ADP5585_DEBOUNCE_DIS_A 0x21 +#define ADP5585_DEBOUNCE_DIS_B 0x22 +#define ADP5585_GPO_DATA_OUT_A 0x23 +#define ADP5585_GPO_DATA_OUT_B 0x24 +#define ADP5585_GPO_OUT_MODE_A 0x25 +#define ADP5585_GPO_OUT_MODE_B 0x26 +#define ADP5585_GPIO_DIRECTION_A 0x27 +#define ADP5585_GPIO_DIRECTION_B 0x28 +#define ADP5585_RESET1_EVENT_A 0x29 +#define ADP5585_RESET1_EVENT_B 0x2a +#define ADP5585_RESET1_EVENT_C 0x2b +#define ADP5585_RESET2_EVENT_A 0x2c +#define ADP5585_RESET2_EVENT_B 0x2d +#define ADP5585_RESET_CFG 0x2e +#define ADP5585_PWM_OFFT_LOW 0x2f +#define ADP5585_PWM_OFFT_HIGH 0x30 +#define ADP5585_PWM_ONT_LOW 0x31 +#define ADP5585_PWM_ONT_HIGH 0x32 +#define ADP5585_PWM_CFG 0x33 +#define ADP5585_PWM_IN_AND BIT(2) +#define ADP5585_PWM_MODE BIT(1) +#define ADP5585_PWM_EN BIT(0) +#define ADP5585_LOGIC_CFG 0x34 +#define ADP5585_LOGIC_FF_CFG 0x35 +#define ADP5585_LOGIC_INT_EVENT_EN 0x36 +#define ADP5585_POLL_PTIME_CFG 0x37 +#define ADP5585_PIN_CONFIG_A 0x38 +#define ADP5585_PIN_CONFIG_B 0x39 +#define ADP5585_PIN_CONFIG_C 0x3a +#define ADP5585_PULL_SELECT BIT(7) +#define ADP5585_C4_EXTEND_CFG_GPIO11 (0U << 6) +#define ADP5585_C4_EXTEND_CFG_RESET2 (1U << 6) +#define ADP5585_C4_EXTEND_CFG_MASK GENMASK(6, 6) +#define ADP5585_R4_EXTEND_CFG_GPIO5 (0U << 5) +#define ADP5585_R4_EXTEND_CFG_RESET1 (1U << 5) +#define ADP5585_R4_EXTEND_CFG_MASK GENMASK(5, 5) +#define ADP5585_R3_EXTEND_CFG_GPIO4 (0U << 2) +#define ADP5585_R3_EXTEND_CFG_LC (1U << 2) +#define ADP5585_R3_EXTEND_CFG_PWM_OUT (2U << 2) +#define ADP5585_R3_EXTEND_CFG_MASK GENMASK(3, 2) +#define ADP5585_R0_EXTEND_CFG_GPIO1 (0U << 0) +#define ADP5585_R0_EXTEND_CFG_LY (1U << 0) +#define ADP5585_R0_EXTEND_CFG_MASK GENMASK(0, 0) +#define ADP5585_GENERAL_CFG 0x3b +#define ADP5585_OSC_EN BIT(7) +#define ADP5585_OSC_FREQ_50KHZ (0U << 5) +#define ADP5585_OSC_FREQ_100KHZ (1U << 5) +#define ADP5585_OSC_FREQ_200KHZ (2U << 5) +#define ADP5585_OSC_FREQ_500KHZ (3U << 5) +#define ADP5585_OSC_FREQ_MASK GENMASK(6, 5) +#define ADP5585_INT_CFG BIT(1) +#define ADP5585_RST_CFG BIT(0) +#define ADP5585_INT_EN 0x3c + +#define ADP5585_MAX_REG ADP5585_INT_EN + +/* + * Bank 0 covers pins "GPIO 1/R0" to "GPIO 6/R5", numbered 0 to 5 by the + * driver, and bank 1 covers pins "GPIO 7/C0" to "GPIO 11/C4", numbered 6 to + * 10. Some variants of the ADP5585 don't support "GPIO 6/R5". As the driver + * uses identical GPIO numbering for all variants to avoid confusion, GPIO 5 is + * marked as reserved in the device tree for variants that don't support it. + */ +#define ADP5585_BANK(n) ((n) >= 6 ? 1 : 0) +#define ADP5585_BIT(n) ((n) >= 6 ? BIT((n) - 6) : BIT(n)) + +struct regmap; + +struct adp5585_dev { + struct regmap *regmap; +}; + +#endif -- cgit v1.2.3 From bcbfcebda2cbc6a10a347d726e4a4f69e43a864e Mon Sep 17 00:00:00 2001 From: Mohamed Ghanmi Date: Sun, 9 Jun 2024 15:48:49 +0100 Subject: platform/x86: asus-wmi: add support for vivobook fan profiles Add support for vivobook fan profiles wmi call on the ASUS VIVOBOOK to adjust power limits. These fan profiles have a different device id than the ROG series and different order. This reorders the existing modes. As part of keeping the patch clean the throttle_thermal_policy_available boolean stored in the driver struct is removed and throttle_thermal_policy_dev is used in place (as on init it is zeroed). Co-developed-by: Luke D. Jones Signed-off-by: Luke D. Jones Signed-off-by: Mohamed Ghanmi Reviewed-by: Luke D. Jones Link: https://lore.kernel.org/r/20240609144849.2532-2-mohamed.ghanmi@supcom.tn Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 0aeeae1c1943..5e6f407b2def 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -68,6 +68,7 @@ #define ASUS_WMI_DEVID_SCREENPAD_LIGHT 0x00050032 #define ASUS_WMI_DEVID_FAN_BOOST_MODE 0x00110018 #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY 0x00120075 +#define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO 0x00110019 /* Misc */ #define ASUS_WMI_DEVID_PANEL_OD 0x00050019 -- cgit v1.2.3 From b40824500eaa77668026b6d1ade6924901a680f9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 30 Jul 2024 14:38:07 +0900 Subject: ata: libata: Remove ata_noop_qc_prep() The function ata_noop_qc_prep(), as its name implies, does nothing and simply returns AC_ERR_OK. For drivers that do not need any special preparations of queued commands, we can avoid having to define struct ata_port qc_prep operation by simply testing if that operation is defined or not in ata_qc_issue(). Make this change and remove ata_noop_qc_prep(). Signed-off-by: Damien Le Moal Reviewed-by: John Garry Reviewed-by: Sergey Shtylyov --- include/linux/libata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index d598ef690e50..d5446e18d9df 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1168,7 +1168,6 @@ extern int ata_xfer_mode2shift(u8 xfer_mode); extern const char *ata_mode_string(unsigned int xfer_mask); extern unsigned int ata_id_xfermask(const u16 *id); extern int ata_std_qc_defer(struct ata_queued_cmd *qc); -extern enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc); extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg, unsigned int n_elem); extern unsigned int ata_dev_classify(const struct ata_taskfile *tf); -- cgit v1.2.3 From bf1807c6ee1f66608c7835f6f9d9139c9c477942 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 1 Aug 2024 18:04:22 +0900 Subject: ata: libata: Print device quirks only once In ata_dev_print_quirks(), return early if ata_dev_print_info() returns false or if we already printed quirk information. This is to avoid printing a device quirks multiple times (that is, each time ata_dev_revalidate() is called). To remember if ata_dev_print_quirks() was already executed, define the EH context flag ATA_EHI_DID_PRINT_QUIRKS and set this flag in ata_dev_print_quirks(). Reported-by: Geert Uytterhoeven Fixes: 58157d607aec ("ata: libata: Print quirks applied to devices") Signed-off-by: Damien Le Moal Tested-by: Geert Uytterhoeven --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index d5446e18d9df..0279c0a6302f 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -378,6 +378,7 @@ enum { ATA_EHI_PRINTINFO = (1 << 18), /* print configuration info */ ATA_EHI_SETMODE = (1 << 19), /* configure transfer mode */ ATA_EHI_POST_SETMODE = (1 << 20), /* revalidating after setmode */ + ATA_EHI_DID_PRINT_QUIRKS = (1 << 21), /* already printed quirks info */ ATA_EHI_DID_RESET = ATA_EHI_DID_SOFTRESET | ATA_EHI_DID_HARDRESET, -- cgit v1.2.3 From e6ab45005772014ce49a693a63f928203c0cbbb0 Mon Sep 17 00:00:00 2001 From: Luigi Leonardi Date: Tue, 30 Jul 2024 21:43:07 +0200 Subject: vsock/virtio: add SIOCOUTQ support for all virtio based transports Introduce support for virtio_transport_unsent_bytes ioctl for virtio_transport, vhost_vsock and vsock_loopback. For all transports the unsent bytes counter is incremented in virtio_transport_get_credit. In virtio_transport (G2H) and in vhost-vsock (H2G) the counter is decremented when the skbuff is consumed. In vsock_loopback the same skbuff is passed from the transmitter to the receiver, so the counter is decremented before queuing the skbuff to the receiver. Signed-off-by: Luigi Leonardi Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index c82089dee0c8..0387d64e2c66 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -133,6 +133,7 @@ struct virtio_vsock_sock { u32 tx_cnt; u32 peer_fwd_cnt; u32 peer_buf_alloc; + size_t bytes_unsent; /* Protected by rx_lock */ u32 fwd_cnt; @@ -193,6 +194,11 @@ s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk); +ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk); + +void virtio_transport_consume_skb_sent(struct sk_buff *skb, + bool consume); + int virtio_transport_do_socket_init(struct vsock_sock *vsk, struct vsock_sock *psk); int -- cgit v1.2.3 From 79bd233010859463e46a0a4b3926eaaba25a6110 Mon Sep 17 00:00:00 2001 From: Ben Gainey Date: Tue, 30 Jul 2024 09:44:14 +0100 Subject: perf: Rename perf_event_context.nr_pending to nr_no_switch_fast. nr_pending counts the number of events in the context that either pending_sigtrap or pending_work, but it is used to prevent taking the fast path in perf_event_context_sched_out. Renamed to reflect what it is used for, rather than what it counts. This change allows using the field to track other event properties that also require skipping the fast path without possible confusion over the name. Signed-off-by: Ben Gainey Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240730084417.7693-2-ben.gainey@arm.com --- include/linux/perf_event.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6bb0c21d6335..655f66b18418 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -966,12 +966,13 @@ struct perf_event_context { struct rcu_head rcu_head; /* - * Sum (event->pending_work + event->pending_work) + * The count of events for which using the switch-out fast path + * should be avoided. * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ - local_t nr_pending; + local_t nr_no_switch_fast; }; struct perf_cpu_pmu_context { -- cgit v1.2.3 From 7e8b255650fcfa1d05a57e4093d8405e6d8dd488 Mon Sep 17 00:00:00 2001 From: Ben Gainey Date: Tue, 30 Jul 2024 09:44:15 +0100 Subject: perf: Support PERF_SAMPLE_READ with inherit This change allows events to use PERF_SAMPLE_READ with inherit so long as PERF_SAMPLE_TID is also set. This enables sample based profiling of a group of counters over a hierarchy of processes or threads. This is useful, for example, for collecting per-thread counters/metrics, event based sampling of multiple counters as a unit, access to the enabled and running time when using multiplexing and so on. Prior to this, users were restricted to either collecting aggregate statistics for a multi-threaded/-process application (e.g. with "perf stat"), or to sample individual threads, or to profile the entire system (which requires root or CAP_PERFMON, and may produce much more data than is required). Theoretically a tool could poll for or otherwise monitor thread/process creation and construct whatever events the user is interested in using perf_event_open, for each new thread or process, but this is racy, can lead to file-descriptor exhaustion, and ultimately just replicates the behaviour of inherit, but in userspace. This configuration differs from inherit without PERF_SAMPLE_READ in that the accumulated event count, and consequently any sample (such as if triggered by overflow of sample_period) will be on a per-thread rather than on an aggregate basis. The meaning of read_format::value field of both PERF_RECORD_READ and PERF_RECORD_SAMPLE is changed such that if the sampled event uses this new configuration then the values reported will be per-thread rather than the global aggregate value. This is a change from the existing semantics of read_format (where PERF_SAMPLE_READ is used without inherit), but it is necessary to expose the per-thread counter values, and it avoids reinventing a separate "read_format_thread" field that otherwise replicates the same behaviour. This change should not break existing tools, since this configuration was not previously valid and was rejected by the kernel. Tools that opt into this new mode will need to account for this when calculating the counter delta for a given sample. Tools that wish to have both the per-thread and aggregate value can perform the global aggregation themselves from the per-thread values. The change to read_format::value does not affect existing valid perf_event_attr configurations, nor does it change the behaviour of calls to "read" on an event descriptor. Both continue to report the aggregate value for the entire thread/process hierarchy. The difference between the results reported by "read" and PERF_RECORD_SAMPLE in this new configuration is justified on the basis that it is not (easily) possible for "read" to target a specific thread (the caller only has the fd for the original parent event). Signed-off-by: Ben Gainey Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240730084417.7693-3-ben.gainey@arm.com --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 655f66b18418..701549967c18 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -969,6 +969,9 @@ struct perf_event_context { * The count of events for which using the switch-out fast path * should be avoided. * + * Sum (event->pending_work + events with + * (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))) + * * The SIGTRAP is targeted at ctx->task, as such it won't do changing * that until the signal is delivered. */ -- cgit v1.2.3 From cfa7f3d2c526c224a6271cc78a4a27a0de06f4f0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 29 Jul 2024 10:52:23 -0700 Subject: perf,x86: avoid missing caller address in stack traces captured in uprobe When tracing user functions with uprobe functionality, it's common to install the probe (e.g., a BPF program) at the first instruction of the function. This is often going to be `push %rbp` instruction in function preamble, which means that within that function frame pointer hasn't been established yet. This leads to consistently missing an actual caller of the traced function, because perf_callchain_user() only records current IP (capturing traced function) and then following frame pointer chain (which would be caller's frame, containing the address of caller's caller). So when we have target_1 -> target_2 -> target_3 call chain and we are tracing an entry to target_3, captured stack trace will report target_1 -> target_3 call chain, which is wrong and confusing. This patch proposes a x86-64-specific heuristic to detect `push %rbp` (`push %ebp` on 32-bit architecture) instruction being traced. Given entire kernel implementation of user space stack trace capturing works under assumption that user space code was compiled with frame pointer register (%rbp/%ebp) preservation, it seems pretty reasonable to use this instruction as a strong indicator that this is the entry to the function. In that case, return address is still pointed to by %rsp/%esp, so we fetch it and add to stack trace before proceeding to unwind the rest using frame pointer-based logic. We also check for `endbr64` (for 64-bit modes) as another common pattern for function entry, as suggested by Josh Poimboeuf. Even if we get this wrong sometimes for uprobes attached not at the function entry, it's OK because stack trace will still be overall meaningful, just with one extra bogus entry. If we don't detect this, we end up with guaranteed to be missing caller function entry in the stack trace, which is worse overall. Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20240729175223.23914-1-andrii@kernel.org --- include/linux/uprobes.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b503fafb7fb3..a270a5892ab4 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -76,6 +76,8 @@ struct uprobe_task { struct uprobe *active_uprobe; unsigned long xol_vaddr; + struct arch_uprobe *auprobe; + struct return_instance *return_instances; unsigned int depth; }; -- cgit v1.2.3 From e04332ebc8ac128fa551e83f1161ab1c094d13a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:28 +0200 Subject: uprobes: kill uprobe_register_refctr() It doesn't make any sense to have 2 versions of _register(). Note that trace_uprobe_enable(), the only user of uprobe_register(), doesn't need to check tu->ref_ctr_offset to decide which one should be used, it could safely pass ref_ctr_offset == 0 to uprobe_register_refctr(). Add this argument to uprobe_register(), update the callers, and kill uprobe_register_refctr(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240801132728.GA8800@redhat.com --- include/linux/uprobes.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index a270a5892ab4..788813c0b8fc 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -112,8 +112,7 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); -extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); @@ -154,11 +153,7 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) static inline int -uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) -{ - return -ENOSYS; -} -static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) +uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { return -ENOSYS; } -- cgit v1.2.3 From 3c83a9ad0295eb63bdeb81d821b8c3b9417fbcac Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2024 15:27:34 +0200 Subject: uprobes: make uprobe_register() return struct uprobe * This way uprobe_unregister() and uprobe_apply() can use "struct uprobe *" rather than inode + offset. This simplifies the code and allows to avoid the unnecessary find_uprobe() + put_uprobe() in these functions. TODO: uprobe_unregister() still needs get_uprobe/put_uprobe to ensure that this uprobe can't be freed before up_write(&uprobe->register_rwsem). Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiri Olsa Link: https://lore.kernel.org/r/20240801132734.GA8803@redhat.com --- include/linux/uprobes.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 788813c0b8fc..f50df1fa93e7 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -16,6 +16,7 @@ #include #include +struct uprobe; struct vm_area_struct; struct mm_struct; struct inode; @@ -112,9 +113,9 @@ extern bool is_trap_insn(uprobe_opcode_t *insn); extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); -extern int uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); -extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); +extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); +extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); +extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -152,18 +153,18 @@ static inline void uprobes_init(void) #define uprobe_get_trap_addr(regs) instruction_pointer(regs) -static inline int +static inline struct uprobe * uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc) { - return -ENOSYS; + return ERR_PTR(-ENOSYS); } static inline int -uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add) +uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) { return -ENOSYS; } static inline void -uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc) +uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) { } static inline int uprobe_mmap(struct vm_area_struct *vma) -- cgit v1.2.3 From 6edb8cd87cca5dd4389d4ea2d84b66ea94341bbd Mon Sep 17 00:00:00 2001 From: Kerem Karabay Date: Fri, 5 Jul 2024 11:17:42 +0000 Subject: HID: core: add helper for finding a field with a certain usage This helper will allow HID drivers to easily determine if they should bind to a hid_device by checking for the prescence of a certain field when its ID is not enough, which can be the case on USB devices with multiple interfaces and/or configurations. Convert google-hammer driver to use it, and remove now superfluous hammer_has_usage(). [jkosina@suse.com: expand changelog with the information about google-hammer being added as user of this API ] Signed-off-by: Kerem Karabay Signed-off-by: Aditya Garg Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 1533c9dcd3a6..2deff79f39a1 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -940,6 +940,8 @@ extern void hidinput_report_event(struct hid_device *hid, struct hid_report *rep extern int hidinput_connect(struct hid_device *hid, unsigned int force); extern void hidinput_disconnect(struct hid_device *); +struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type, + unsigned int application, unsigned int usage); int hid_set_field(struct hid_field *, unsigned, __s32); int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size, int interrupt); -- cgit v1.2.3 From 79f194dd54cef3a0212914cd79dda1f91b7bf7f6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jul 2024 18:11:29 +0200 Subject: thermal: trip: Get rid of thermal_zone_get_num_trips() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only existing caller of thermal_zone_get_num_trips(), which is rcar_gen3_thermal_probe(), uses this function to put the number of trip points into a kernel log message, but this information is also available from the thermal sysfs interface. For this reason, remove the thermal_zone_get_num_trips() call from rcar_gen3_thermal_probe() and drop the former altogether. Signed-off-by: Rafael J. Wysocki Reviewed-by: Niklas Söderlund Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/2636988.Lt9SDvczpP@rjwysocki.net --- include/linux/thermal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 25fbf960b474..4fdfd5abd2c4 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -210,7 +210,6 @@ int for_each_thermal_trip(struct thermal_zone_device *tz, int thermal_zone_for_each_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data); -int thermal_zone_get_num_trips(struct thermal_zone_device *tz); void thermal_zone_set_trip_temp(struct thermal_zone_device *tz, struct thermal_trip *trip, int temp); -- cgit v1.2.3 From 8ecd953ca5850d2f83f548d4bbcf672d2d2cd0ca Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Jul 2024 18:12:45 +0200 Subject: thermal: trip: Drop thermal_zone_get_trip() There are no more callers of thermal_zone_get_trip() in the tree, so drop it. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/2220301.Mh6RI2rZIc@rjwysocki.net --- include/linux/thermal.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 4fdfd5abd2c4..db44a57a7474 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -202,8 +202,6 @@ static inline void devm_thermal_of_zone_unregister(struct device *dev, } #endif -int thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id, - struct thermal_trip *trip); int for_each_thermal_trip(struct thermal_zone_device *tz, int (*cb)(struct thermal_trip *, void *), void *data); -- cgit v1.2.3 From 37c6dccd683797a5a4ec85d2e94939bf829d3499 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 28 Jul 2024 20:26:59 +0100 Subject: cpufreq: Remove LATENCY_MULTIPLIER The current LATENCY_MULTIPLIER which has been around for nearly 20 years causes rate_limit_us to be always in ms range. On M1 mac mini I get 50 and 56us transition latency, but due to the 1000 multiplier we end up setting rate_limit_us to 50 and 56ms, which gets capped into 2ms and was 10ms before e13aa799c2a6 ("cpufreq: Change default transition delay to 2ms") On Intel I5 system transition latency is 20us but due to the multiplier we end up with 20ms that again is capped to 2ms. Given how good modern hardware and how modern workloads require systems to be more responsive to cater for sudden changes in workload (tasks sleeping/wakeup/migrating, uclamp causing a sudden boost or cap) and that 2ms is quarter of the time of 120Hz refresh rate system, drop the old logic in favour of providing 50% headroom. rate_limit_us = 1.5 * latency. I considered not adding any headroom which could mean that we can end up with infinite back-to-back requests. I also considered providing a constant headroom (e.g: 100us) assuming that any h/w or f/w dealing with the request shouldn't require a large headroom when transition_latency is actually high. But for both cases I wasn't sure if h/w or f/w can end up being overwhelmed dealing with the freq requests in a potentially busy system. So I opted for providing 50% breathing room. This is expected to impact schedutil only as the other user, dbs_governor, takes the max(2*tick, transition_delay_us) and the former was at least 2ms on 1ms TICK, which is equivalent to the max_delay_us before applying this patch. For systems with TICK of 4ms, this value would have almost always ended up with 8ms sampling rate. For systems that report 0 transition latency, we still default to returning 1ms as transition delay. This helps in eliminating a source of latency for applying requests as mentioned in [1]. For example if we have a 1ms tick, most systems will miss sending an update at tick when updating the util_avg for a task/CPU (rate_limit_us will be 2ms for most systems). Link: https://lore.kernel.org/lkml/20240724212255.mfr2ybiv2j2uqek7@airbuntu/ # [1] Link: https://lore.kernel.org/lkml/20240205022500.2232124-1-qyousef@layalina.io/ Signed-off-by: Qais Yousef Link: https://patch.msgid.link/20240728192659.58115-1-qyousef@layalina.io [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d4d2f4d1d7cb..e0e19d9c1323 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, #define CPUFREQ_POLICY_POWERSAVE (1) #define CPUFREQ_POLICY_PERFORMANCE (2) -/* - * The polling frequency depends on the capability of the processor. Default - * polling frequency is 1000 times the transition latency of the processor. - */ -#define LATENCY_MULTIPLIER (1000) - struct cpufreq_governor { char name[CPUFREQ_NAME_LEN]; int (*init)(struct cpufreq_policy *policy); -- cgit v1.2.3 From 70e6b7d9ae3c63df90a7bba7700e8d5c300c3c60 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 2 Aug 2024 14:55:54 +0100 Subject: x86/i8253: Disable PIT timer 0 when not in use Leaving the PIT interrupt running can cause noticeable steal time for virtual guests. The VMM generally has a timer which toggles the IRQ input to the PIC and I/O APIC, which takes CPU time away from the guest. Even on real hardware, running the counter may use power needlessly (albeit not much). Make sure it's turned off if it isn't going to be used. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Link: https://lore.kernel.org/all/20240802135555.564941-1-dwmw2@infradead.org --- include/linux/i8253.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/i8253.h b/include/linux/i8253.h index 8336b2f6f834..bf169cfef7f1 100644 --- a/include/linux/i8253.h +++ b/include/linux/i8253.h @@ -24,6 +24,7 @@ extern raw_spinlock_t i8253_lock; extern bool i8253_clear_counter_on_shutdown; extern struct clock_event_device i8253_clockevent; extern void clockevent_i8253_init(bool oneshot); +extern void clockevent_i8253_disable(void); extern void setup_pit_timer(void); -- cgit v1.2.3 From 531b2ca0a940ac9db03f246c8b77c4201de72b00 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 2 Aug 2024 14:55:55 +0100 Subject: clockevents/drivers/i8253: Fix stop sequence for timer 0 According to the data sheet, writing the MODE register should stop the counter (and thus the interrupts). This appears to work on real hardware, at least modern Intel and AMD systems. It should also work on Hyper-V. However, on some buggy virtual machines the mode change doesn't have any effect until the counter is subsequently loaded (or perhaps when the IRQ next fires). So, set MODE 0 and then load the counter, to ensure that those buggy VMs do the right thing and the interrupts stop. And then write MODE 0 *again* to stop the counter on compliant implementations too. Apparently, Hyper-V keeps firing the IRQ *repeatedly* even in mode zero when it should only happen once, but the second MODE write stops that too. Userspace test program (mostly written by tglx): ===== #include #include #include #include #include static __always_inline void __out##bwl(type value, uint16_t port) \ { \ asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ static __always_inline type __in##bwl(uint16_t port) \ { \ type value; \ asm volatile("in" #bwl " %w1, %" #bw "0" \ : "=a"(value) : "Nd"(port)); \ return value; \ } BUILDIO(b, b, uint8_t) #define inb __inb #define outb __outb #define PIT_MODE 0x43 #define PIT_CH0 0x40 #define PIT_CH2 0x42 static int is8254; static void dump_pit(void) { if (is8254) { // Latch and output counter and status outb(0xC2, PIT_MODE); printf("%02x %02x %02x\n", inb(PIT_CH0), inb(PIT_CH0), inb(PIT_CH0)); } else { // Latch and output counter outb(0x0, PIT_MODE); printf("%02x %02x\n", inb(PIT_CH0), inb(PIT_CH0)); } } int main(int argc, char* argv[]) { int nr_counts = 2; if (argc > 1) nr_counts = atoi(argv[1]); if (argc > 2) is8254 = 1; if (ioperm(0x40, 4, 1) != 0) return 1; dump_pit(); printf("Set oneshot\n"); outb(0x38, PIT_MODE); outb(0x00, PIT_CH0); outb(0x0F, PIT_CH0); dump_pit(); usleep(1000); dump_pit(); printf("Set periodic\n"); outb(0x34, PIT_MODE); outb(0x00, PIT_CH0); outb(0x0F, PIT_CH0); dump_pit(); usleep(1000); dump_pit(); dump_pit(); usleep(100000); dump_pit(); usleep(100000); dump_pit(); printf("Set stop (%d counter writes)\n", nr_counts); outb(0x30, PIT_MODE); while (nr_counts--) outb(0xFF, PIT_CH0); dump_pit(); usleep(100000); dump_pit(); usleep(100000); dump_pit(); printf("Set MODE 0\n"); outb(0x30, PIT_MODE); dump_pit(); usleep(100000); dump_pit(); usleep(100000); dump_pit(); return 0; } ===== Suggested-by: Sean Christopherson Co-developed-by: Li RongQing Signed-off-by: Li RongQing Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Link: https://lore.kernel.org/all/20240802135555.564941-2-dwmw2@infradead.org --- include/linux/i8253.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i8253.h b/include/linux/i8253.h index bf169cfef7f1..56c280eb2d4f 100644 --- a/include/linux/i8253.h +++ b/include/linux/i8253.h @@ -21,7 +21,6 @@ #define PIT_LATCH ((PIT_TICK_RATE + HZ/2) / HZ) extern raw_spinlock_t i8253_lock; -extern bool i8253_clear_counter_on_shutdown; extern struct clock_event_device i8253_clockevent; extern void clockevent_i8253_init(bool oneshot); extern void clockevent_i8253_disable(void); -- cgit v1.2.3 From e99129e5dbf7ca87233d31ad19348f6ce8627b38 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 1 Aug 2024 13:32:59 -1000 Subject: sched_ext: Allow p->scx.disallow only while loading From 1232da7eced620537a78f19c8cf3d4a3508e2419 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 31 Jul 2024 09:14:52 -1000 p->scx.disallow provides a way for the BPF scheduler to reject certain tasks from attaching. It's currently allowed for both the load and fork paths; however, the latter doesn't actually work as p->sched_class is already set by the time scx_ops_init_task() is called during fork. This is a convenience feature which is mostly useful from the load path anyway. Allow it only from the load path. v2: Trigger scx_ops_error() iff @p->policy == SCHED_EXT to make it a bit easier for the BPF scheduler (David). Signed-off-by: Tejun Heo Reported-by: "Zhangqiao (2012 lab)" Link: http://lkml.kernel.org/r/20240711110720.1285-1-zhangqiao22@huawei.com Fixes: 7bb6f0810ecf ("sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT") Acked-by: David Vernet Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 26e1c33bc844..69f68e2121a8 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -179,11 +179,12 @@ struct sched_ext_entity { * If set, reject future sched_setscheduler(2) calls updating the policy * to %SCHED_EXT with -%EACCES. * - * If set from ops.init_task() and the task's policy is already - * %SCHED_EXT, which can happen while the BPF scheduler is being loaded - * or by inhering the parent's policy during fork, the task's policy is - * rejected and forcefully reverted to %SCHED_NORMAL. The number of - * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected. + * Can be set from ops.init_task() while the BPF scheduler is being + * loaded (!scx_init_task_args->fork). If set and the task's policy is + * already %SCHED_EXT, the task's policy is rejected and forcefully + * reverted to %SCHED_NORMAL. The number of such events are reported + * through /sys/kernel/debug/sched_ext::nr_rejected. Setting this flag + * during fork is not allowed. */ bool disallow; /* reject switching into SCX */ -- cgit v1.2.3 From 49675f5bdf9ae2624b430dafda4cb29024521625 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 1 Aug 2024 09:34:01 -0700 Subject: net: remove IFF_* re-definition We re-define values of enum netdev_priv_flags as preprocessor macros with the same name. I guess this was done to avoid breaking out of tree modules which may use #ifdef X for kernel compatibility? Commit 7aa98047df95 ("net: move net_device priv_flags out from UAPI") which added the enum doesn't say. In any case, the flags with defines are quite old now, and defines for new flags don't get added. OOT drivers have to resort to code greps for compat detection, anyway. Let's delete these defines, save LoC, help LXR link to the right place. Reviewed-by: Simon Horman Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20240801163401.378723-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 32 -------------------------------- 1 file changed, 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 607009150b5f..0ef3eaa23f4b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1689,38 +1689,6 @@ enum netdev_priv_flags { IFF_SEE_ALL_HWTSTAMP_REQUESTS = BIT_ULL(33), }; -#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN -#define IFF_EBRIDGE IFF_EBRIDGE -#define IFF_BONDING IFF_BONDING -#define IFF_ISATAP IFF_ISATAP -#define IFF_WAN_HDLC IFF_WAN_HDLC -#define IFF_XMIT_DST_RELEASE IFF_XMIT_DST_RELEASE -#define IFF_DONT_BRIDGE IFF_DONT_BRIDGE -#define IFF_DISABLE_NETPOLL IFF_DISABLE_NETPOLL -#define IFF_MACVLAN_PORT IFF_MACVLAN_PORT -#define IFF_BRIDGE_PORT IFF_BRIDGE_PORT -#define IFF_OVS_DATAPATH IFF_OVS_DATAPATH -#define IFF_TX_SKB_SHARING IFF_TX_SKB_SHARING -#define IFF_UNICAST_FLT IFF_UNICAST_FLT -#define IFF_TEAM_PORT IFF_TEAM_PORT -#define IFF_SUPP_NOFCS IFF_SUPP_NOFCS -#define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE -#define IFF_MACVLAN IFF_MACVLAN -#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM -#define IFF_L3MDEV_MASTER IFF_L3MDEV_MASTER -#define IFF_NO_QUEUE IFF_NO_QUEUE -#define IFF_OPENVSWITCH IFF_OPENVSWITCH -#define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE -#define IFF_TEAM IFF_TEAM -#define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED -#define IFF_PHONY_HEADROOM IFF_PHONY_HEADROOM -#define IFF_MACSEC IFF_MACSEC -#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER -#define IFF_FAILOVER IFF_FAILOVER -#define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE -#define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER -#define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR - /* Specifies the type of the struct net_device::ml_priv pointer */ enum netdev_ml_priv_type { ML_PRIV_NONE, -- cgit v1.2.3 From 7e1d512dab5042aa1c2224ed362be79e3f22a15e Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 1 Aug 2024 21:00:03 +0100 Subject: linkmode: Change return type of linkmode_andnot to bool linkmode_andnot() simply returns the result of bitmap_andnot(). And the return type of bitmap_andnot() is bool. So it makes sense for the return type of linkmode_andnot() to also be bool. I checked all call-sites and they either ignore the return value or treat it as a bool. Compile tested only. Link: https://lore.kernel.org/netdev/68088998-4486-4930-90a4-96a32f08c490@lunn.ch/ Signed-off-by: Simon Horman Link: https://patch.msgid.link/20240801-linkfield-bowl-v1-1-d58f68967802@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/linkmode.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index d94bfd9ac8cc..3b9de09871f6 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -37,8 +37,9 @@ static inline bool linkmode_empty(const unsigned long *src) return bitmap_empty(src, __ETHTOOL_LINK_MODE_MASK_NBITS); } -static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2) +static inline bool linkmode_andnot(unsigned long *dst, + const unsigned long *src1, + const unsigned long *src2) { return bitmap_andnot(dst, src1, src2, __ETHTOOL_LINK_MODE_MASK_NBITS); } -- cgit v1.2.3 From 8585632a8600d26d4e6b7246d375a9c3a344f0d2 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Tue, 9 Jul 2024 13:14:28 +0200 Subject: iio: backend: remove unused parameter Indio_dev was not being used in iio_backend_extend_chan_spec() so remove it. Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240709-dev-iio-backend-add-debugfs-v1-1-fb4b8f2373c7@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 8099759d7242..4e81931703ab 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -142,8 +142,7 @@ ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private, ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, const struct iio_chan_spec *chan, char *buf); -int iio_backend_extend_chan_spec(struct iio_dev *indio_dev, - struct iio_backend *back, +int iio_backend_extend_chan_spec(struct iio_backend *back, struct iio_chan_spec *chan); void *iio_backend_get_priv(const struct iio_backend *conv); struct iio_backend *devm_iio_backend_get(struct device *dev, const char *name); -- cgit v1.2.3 From 2477d7b179d3295c9978420027750f047976bd04 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Fri, 26 Jul 2024 15:18:40 -0500 Subject: iio: backend: spelling: continuous -> continuous This fixes the spelling in IIO_BACKEND_INTERNAL_CONTINUOUS_WAVE. Signed-off-by: David Lechner Link: https://patch.msgid.link/20240726-iio-backend-spelling-continuous-v1-1-467c6e3f78ff@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 4e81931703ab..29c4cf0bd761 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -17,7 +17,7 @@ enum iio_backend_data_type { }; enum iio_backend_data_source { - IIO_BACKEND_INTERNAL_CONTINUOS_WAVE, + IIO_BACKEND_INTERNAL_CONTINUOUS_WAVE, IIO_BACKEND_EXTERNAL, IIO_BACKEND_DATA_SOURCE_MAX }; -- cgit v1.2.3 From f44e94afbb340be90100f8a172ebb14a9e717c8b Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Fri, 26 Jul 2024 10:23:15 +0200 Subject: iio: core: annotate masklength as __private Now that all users are using the proper accessors, we can mark masklength as __private so that no one tries to write. We also get help from checkers in warning us in case someone does it. To access the private field from IIO core code, we need to use the ACCESS_PRIVATE() macro. Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240726-dev-iio-masklength-private3-v1-23-82913fc0fb87@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index dd6bbc468283..f6c0499853bb 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -609,7 +609,7 @@ struct iio_dev { int scan_bytes; const unsigned long *available_scan_masks; - unsigned masklength; + unsigned __private masklength; const unsigned long *active_scan_mask; bool scan_timestamp; struct iio_trigger *trig; @@ -861,7 +861,7 @@ static inline const struct iio_scan_type */ static inline unsigned int iio_get_masklength(const struct iio_dev *indio_dev) { - return indio_dev->masklength; + return ACCESS_PRIVATE(indio_dev, masklength); } /** -- cgit v1.2.3 From d092b6869817208326005df0a656e3bb9724d89f Mon Sep 17 00:00:00 2001 From: Julien Stephan Date: Wed, 31 Jul 2024 09:05:43 +0200 Subject: iio: core: add function to retrieve active_scan_mask index Add a function to retrieve the index of the active scan mask inside the available scan masks array. As in iio_scan_mask_match and iio_sanity_check_avail_scan_masks, this function does not handle multi-long masks correctly. It only checks the first long to be zero, and will use such mask as a terminator even if there was bits set after the first long. This should be fine since the available_scan_mask has already been sanity tested using iio_sanity_check_avail_scan_masks. See iio_scan_mask_match and iio_sanity_check_avail_scan_masks for more details Signed-off-by: Julien Stephan Reviewed-by: David Lechner Link: https://patch.msgid.link/20240731-ad7380-add-single-ended-chips-v2-2-cd63bf05744c@baylibre.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index f6c0499853bb..3a735a9a9eae 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -864,6 +864,8 @@ static inline unsigned int iio_get_masklength(const struct iio_dev *indio_dev) return ACCESS_PRIVATE(indio_dev, masklength); } +int iio_active_scan_mask_index(struct iio_dev *indio_dev); + /** * iio_for_each_active_channel - Iterated over active channels * @indio_dev: the IIO device -- cgit v1.2.3 From 2256f37e24b1979f9a97de0b76cabbf8544a8aff Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Fri, 2 Aug 2024 16:26:59 +0200 Subject: iio: backend: introduce struct iio_backend_info Instead of only passing the backend ops when calling devm_iio_backend_register(), pass an info like structure that will contains the ops and additional information. Fow now, the backend name is being added as that will be used by the debugFS interface introduced in a later patch. It also opens the door for further customizations passed by backends. All users of devm_iio_backend_register() were updated accordingly. Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240802-dev-iio-backend-add-debugfs-v2-1-4cb62852f0d0@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 29c4cf0bd761..f120fa2e0a43 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -115,6 +115,16 @@ struct iio_backend_ops { const struct iio_chan_spec *chan, char *buf); }; +/** + * struct iio_backend_info - info structure for an iio_backend + * @name: Backend name. + * @ops: Backend operations. + */ +struct iio_backend_info { + const char *name; + const struct iio_backend_ops *ops; +}; + int iio_backend_chan_enable(struct iio_backend *back, unsigned int chan); int iio_backend_chan_disable(struct iio_backend *back, unsigned int chan); int devm_iio_backend_enable(struct device *dev, struct iio_backend *back); @@ -151,6 +161,6 @@ __devm_iio_backend_get_from_fwnode_lookup(struct device *dev, struct fwnode_handle *fwnode); int devm_iio_backend_register(struct device *dev, - const struct iio_backend_ops *ops, void *priv); + const struct iio_backend_info *info, void *priv); #endif -- cgit v1.2.3 From cdf01e0809a4c6c7877ea52401c2a6679df7aed6 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Fri, 2 Aug 2024 16:27:00 +0200 Subject: iio: backend: add debugFs interface This adds a basic debugfs interface for backends. Two new ops are being added: * debugfs_reg_access: Analogous to the core IIO one but for backend devices. * debugfs_print_chan_status: One useful usecase for this one is for testing test tones in a digital interface and "ask" the backend to dump more details on why a test tone might have errors. Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240802-dev-iio-backend-add-debugfs-v2-2-4cb62852f0d0@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index f120fa2e0a43..9d0dba7ab9e7 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -22,6 +22,8 @@ enum iio_backend_data_source { IIO_BACKEND_DATA_SOURCE_MAX }; +#define iio_backend_debugfs_ptr(ptr) PTR_IF(IS_ENABLED(CONFIG_DEBUG_FS), ptr) + /** * IIO_BACKEND_EX_INFO - Helper for an IIO extended channel attribute * @_name: Attribute name @@ -81,6 +83,8 @@ enum iio_backend_sample_trigger { * @extend_chan_spec: Extend an IIO channel. * @ext_info_set: Extended info setter. * @ext_info_get: Extended info getter. + * @debugfs_print_chan_status: Print channel status into a buffer. + * @debugfs_reg_access: Read or write register value of backend. **/ struct iio_backend_ops { int (*enable)(struct iio_backend *back); @@ -113,6 +117,11 @@ struct iio_backend_ops { const char *buf, size_t len); int (*ext_info_get)(struct iio_backend *back, uintptr_t private, const struct iio_chan_spec *chan, char *buf); + int (*debugfs_print_chan_status)(struct iio_backend *back, + unsigned int chan, char *buf, + size_t len); + int (*debugfs_reg_access)(struct iio_backend *back, unsigned int reg, + unsigned int writeval, unsigned int *readval); }; /** @@ -163,4 +172,9 @@ __devm_iio_backend_get_from_fwnode_lookup(struct device *dev, int devm_iio_backend_register(struct device *dev, const struct iio_backend_info *info, void *priv); +ssize_t iio_backend_debugfs_print_chan_status(struct iio_backend *back, + unsigned int chan, char *buf, + size_t len); +void iio_backend_debugfs_add(struct iio_backend *back, + struct iio_dev *indio_dev); #endif -- cgit v1.2.3 From b001635a7c3014fe7745a507840c5d8f58235c04 Mon Sep 17 00:00:00 2001 From: Nuno Sa Date: Fri, 2 Aug 2024 16:27:01 +0200 Subject: iio: backend: add a modified prbs23 support Support ADI specific prb23 sequence that can be used both for calibrating or debugging digital interfaces. Signed-off-by: Nuno Sa Link: https://patch.msgid.link/20240802-dev-iio-backend-add-debugfs-v2-3-4cb62852f0d0@analog.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 9d0dba7ab9e7..2b9d1aa86552 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -56,6 +56,8 @@ enum iio_backend_test_pattern { IIO_BACKEND_NO_TEST_PATTERN, /* modified prbs9 */ IIO_BACKEND_ADI_PRBS_9A = 32, + /* modified prbs23 */ + IIO_BACKEND_ADI_PRBS_23A, IIO_BACKEND_TEST_PATTERN_MAX }; -- cgit v1.2.3 From d21fe1e9a6a44e752e227d3a43f41aed790dad95 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Thu, 18 Jul 2024 10:23:54 +0800 Subject: pinctrl: pinconf-generic: Add support for "input-schmitt-microvolt" property Add "input-schmitt-microvolt" property to generic options used for DT parsing files. This enables drivers, which use generic pin configurations, to get the value passed to this property. Signed-off-by: Inochi Amaoto Link: https://lore.kernel.org/IA1PR20MB4953806785BA04E075DC4F03BBAC2@IA1PR20MB4953.namprd20.prod.outlook.com Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index a65d3d078e58..53cfde98433d 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -81,6 +81,8 @@ struct pinctrl_map; * @PIN_CONFIG_INPUT_SCHMITT_ENABLE: control schmitt-trigger mode on the pin. * If the argument != 0, schmitt-trigger mode is enabled. If it's 0, * schmitt-trigger mode is disabled. + * @PIN_CONFIG_INPUT_SCHMITT_UV: this will configure an input pin to run in + * schmitt-trigger mode. The argument is in uV. * @PIN_CONFIG_MODE_LOW_POWER: this will configure the pin for low power * operation, if several modes of operation are supported these can be * passed in the argument on a custom form, else just use argument 1 @@ -132,6 +134,7 @@ enum pin_config_param { PIN_CONFIG_INPUT_ENABLE, PIN_CONFIG_INPUT_SCHMITT, PIN_CONFIG_INPUT_SCHMITT_ENABLE, + PIN_CONFIG_INPUT_SCHMITT_UV, PIN_CONFIG_MODE_LOW_POWER, PIN_CONFIG_MODE_PWM, PIN_CONFIG_OUTPUT, -- cgit v1.2.3 From d7bdb8e6aabe218fd980768a1486434e42761539 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Mon, 27 May 2024 16:25:51 +0200 Subject: pmdomain: core: Enable s2idle for CPU PM domains on PREEMPT_RT To allow a genpd provider for a CPU PM domain to enter a domain-idle-state during s2idle on a PREEMPT_RT based configuration, we can't use the regular spinlock, as they are turned into sleepable locks on PREEMPT_RT. To address this problem, let's convert into using the raw spinlock, but only for genpd providers that have the GENPD_FLAG_CPU_DOMAIN bit set. In this way, the lock can still be acquired/released in atomic context, which is needed in the idle-path for PREEMPT_RT. Do note that the genpd power-on/off notifiers may also be fired during s2idle, but these are already prepared for PREEMPT_RT as they are based on the raw notifiers. However, consumers of them may need to adopt accordingly to work properly on PREEMPT_RT. Signed-off-by: Ulf Hansson Tested-by: Raghavendra Kakarla # qcm6490 with PREEMPT_RT set Acked-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20240527142557.321610-2-ulf.hansson@linaro.org --- include/linux/pm_domain.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 858c8e7851fb..b86bb52858ac 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -198,8 +198,11 @@ struct generic_pm_domain { spinlock_t slock; unsigned long lock_flags; }; + struct { + raw_spinlock_t raw_slock; + unsigned long raw_lock_flags; + }; }; - }; static inline struct generic_pm_domain *pd_to_genpd(struct dev_pm_domain *pd) -- cgit v1.2.3 From d5934e76316e84eced836b6b2bafae1837d1cd58 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 29 Mar 2024 16:48:48 -0700 Subject: cleanup: Add usage and style documentation When proposing that PCI grow some new cleanup helpers for pci_dev_put() and pci_dev_{lock,unlock} [1], Bjorn had some fundamental questions about expectations and best practices. Upon reviewing an updated changelog with those details he recommended adding them to documentation in the header file itself. Add that documentation and link it into the rendering for Documentation/core-api/. Signed-off-by: Dan Williams Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jonathan Cameron Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/171175585714.2192972.12661675876300167762.stgit@dwillia2-xfh.jf.intel.com --- include/linux/cleanup.h | 136 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index d9e613803df1..9c6b4f2c0176 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -4,6 +4,142 @@ #include +/** + * DOC: scope-based cleanup helpers + * + * The "goto error" pattern is notorious for introducing subtle resource + * leaks. It is tedious and error prone to add new resource acquisition + * constraints into code paths that already have several unwind + * conditions. The "cleanup" helpers enable the compiler to help with + * this tedium and can aid in maintaining LIFO (last in first out) + * unwind ordering to avoid unintentional leaks. + * + * As drivers make up the majority of the kernel code base, here is an + * example of using these helpers to clean up PCI drivers. The target of + * the cleanups are occasions where a goto is used to unwind a device + * reference (pci_dev_put()), or unlock the device (pci_dev_unlock()) + * before returning. + * + * The DEFINE_FREE() macro can arrange for PCI device references to be + * dropped when the associated variable goes out of scope:: + * + * DEFINE_FREE(pci_dev_put, struct pci_dev *, if (_T) pci_dev_put(_T)) + * ... + * struct pci_dev *dev __free(pci_dev_put) = + * pci_get_slot(parent, PCI_DEVFN(0, 0)); + * + * The above will automatically call pci_dev_put() if @dev is non-NULL + * when @dev goes out of scope (automatic variable scope). If a function + * wants to invoke pci_dev_put() on error, but return @dev (i.e. without + * freeing it) on success, it can do:: + * + * return no_free_ptr(dev); + * + * ...or:: + * + * return_ptr(dev); + * + * The DEFINE_GUARD() macro can arrange for the PCI device lock to be + * dropped when the scope where guard() is invoked ends:: + * + * DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) + * ... + * guard(pci_dev)(dev); + * + * The lifetime of the lock obtained by the guard() helper follows the + * scope of automatic variable declaration. Take the following example:: + * + * func(...) + * { + * if (...) { + * ... + * guard(pci_dev)(dev); // pci_dev_lock() invoked here + * ... + * } // <- implied pci_dev_unlock() triggered here + * } + * + * Observe the lock is held for the remainder of the "if ()" block not + * the remainder of "func()". + * + * Now, when a function uses both __free() and guard(), or multiple + * instances of __free(), the LIFO order of variable definition order + * matters. GCC documentation says: + * + * "When multiple variables in the same scope have cleanup attributes, + * at exit from the scope their associated cleanup functions are run in + * reverse order of definition (last defined, first cleanup)." + * + * When the unwind order matters it requires that variables be defined + * mid-function scope rather than at the top of the file. Take the + * following example and notice the bug highlighted by "!!":: + * + * LIST_HEAD(list); + * DEFINE_MUTEX(lock); + * + * struct object { + * struct list_head node; + * }; + * + * static struct object *alloc_add(void) + * { + * struct object *obj; + * + * lockdep_assert_held(&lock); + * obj = kzalloc(sizeof(*obj), GFP_KERNEL); + * if (obj) { + * LIST_HEAD_INIT(&obj->node); + * list_add(obj->node, &list): + * } + * return obj; + * } + * + * static void remove_free(struct object *obj) + * { + * lockdep_assert_held(&lock); + * list_del(&obj->node); + * kfree(obj); + * } + * + * DEFINE_FREE(remove_free, struct object *, if (_T) remove_free(_T)) + * static int init(void) + * { + * struct object *obj __free(remove_free) = NULL; + * int err; + * + * guard(mutex)(&lock); + * obj = alloc_add(); + * + * if (!obj) + * return -ENOMEM; + * + * err = other_init(obj); + * if (err) + * return err; // remove_free() called without the lock!! + * + * no_free_ptr(obj); + * return 0; + * } + * + * That bug is fixed by changing init() to call guard() and define + + * initialize @obj in this order:: + * + * guard(mutex)(&lock); + * struct object *obj __free(remove_free) = alloc_add(); + * + * Given that the "__free(...) = NULL" pattern for variables defined at + * the top of the function poses this potential interdependency problem + * the recommendation is to always define and assign variables in one + * statement and not group variable definitions at the top of the + * function when __free() is used. + * + * Lastly, given that the benefit of cleanup helpers is removal of + * "goto", and that the "goto" statement can jump between scopes, the + * expectation is that usage of "goto" and cleanup helpers is never + * mixed in the same function. I.e. for a given routine, convert all + * resources that need a "goto" cleanup to scope-based cleanup, or + * convert none of them. + */ + /* * DEFINE_FREE(name, type, free): * simple helper macro that defines the required wrapper for a __free() -- cgit v1.2.3 From a720dee5e039238a44c0142dfccdc0e35c1125f7 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sat, 13 Jul 2024 19:47:33 +1200 Subject: hid-asus: use hid for brightness control on keyboard On almost all ASUS ROG series laptops the MCU used for the USB keyboard also has a HID packet used for setting the brightness. This is usually the same as the WMI method. But in some laptops the WMI method either is missing or doesn't work, so we should default to the HID control. Signed-off-by: Luke D. Jones Acked-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240713074733.77334-2-luke@ljones.dev Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/asus-wmi.h | 36 ++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 5e6f407b2def..b601b245a035 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -4,6 +4,7 @@ #include #include +#include /* WMI Methods */ #define ASUS_WMI_METHODID_SPEC 0x43455053 /* BIOS SPECification */ @@ -165,4 +166,39 @@ static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, } #endif +/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */ +static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA403U"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605M"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC71L"), + }, + }, + { }, +}; + #endif /* __PLATFORM_DATA_X86_ASUS_WMI_H */ -- cgit v1.2.3 From 25162a4f64f8ba0065f300977589fe1f6af332f0 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 4 Aug 2024 17:16:25 -0700 Subject: Input: cyttsp4 - remove driver The cyttsp4 touchscreen driver was contributed in 2013 and since then has seen no updates. The driver uses platform data (no device tree support) and there are no users of it in the mainline kernel. There were occasional fixes to it for issues either found by static code analysis tools or via visual inspection, but otherwise the driver is completely untested. Remove the driver. Reviewed-by: Linus Walleij Reviewed-by: Javier Martinez Canillas Link: https://lore.kernel.org/r/ZrAZ2cUow_z838tp@google.com Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/cyttsp4.h | 62 ----------------------------------- 1 file changed, 62 deletions(-) delete mode 100644 include/linux/platform_data/cyttsp4.h (limited to 'include/linux') diff --git a/include/linux/platform_data/cyttsp4.h b/include/linux/platform_data/cyttsp4.h deleted file mode 100644 index 5dc9d2be384b..000000000000 --- a/include/linux/platform_data/cyttsp4.h +++ /dev/null @@ -1,62 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Header file for: - * Cypress TrueTouch(TM) Standard Product (TTSP) touchscreen drivers. - * For use with Cypress Txx3xx parts. - * Supported parts include: - * CY8CTST341 - * CY8CTMA340 - * - * Copyright (C) 2009, 2010, 2011 Cypress Semiconductor, Inc. - * Copyright (C) 2012 Javier Martinez Canillas - * - * Contact Cypress Semiconductor at www.cypress.com (kev@cypress.com) - */ -#ifndef _CYTTSP4_H_ -#define _CYTTSP4_H_ - -#define CYTTSP4_MT_NAME "cyttsp4_mt" -#define CYTTSP4_I2C_NAME "cyttsp4_i2c_adapter" -#define CYTTSP4_SPI_NAME "cyttsp4_spi_adapter" - -#define CY_TOUCH_SETTINGS_MAX 32 - -struct touch_framework { - const uint16_t *abs; - uint8_t size; - uint8_t enable_vkeys; -} __packed; - -struct cyttsp4_mt_platform_data { - struct touch_framework *frmwrk; - unsigned short flags; - char const *inp_dev_name; -}; - -struct touch_settings { - const uint8_t *data; - uint32_t size; - uint8_t tag; -} __packed; - -struct cyttsp4_core_platform_data { - int irq_gpio; - int rst_gpio; - int level_irq_udelay; - int (*xres)(struct cyttsp4_core_platform_data *pdata, - struct device *dev); - int (*init)(struct cyttsp4_core_platform_data *pdata, - int on, struct device *dev); - int (*power)(struct cyttsp4_core_platform_data *pdata, - int on, struct device *dev, atomic_t *ignore_irq); - int (*irq_stat)(struct cyttsp4_core_platform_data *pdata, - struct device *dev); - struct touch_settings *sett[CY_TOUCH_SETTINGS_MAX]; -}; - -struct cyttsp4_platform_data { - struct cyttsp4_core_platform_data *core_pdata; - struct cyttsp4_mt_platform_data *mt_pdata; -}; - -#endif /* _CYTTSP4_H_ */ -- cgit v1.2.3 From d4433e8b405a882fa2ef29601c4ad262ba6e5526 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Aug 2024 13:40:26 +0000 Subject: inet: constify 'struct net' parameter of various lookup helpers Following helpers do not touch their struct net argument: - bpf_sk_lookup_run_v4() - inet_lookup_reuseport() - inet_lhash2_lookup() - inet_lookup_run_sk_lookup() - __inet_lookup_listener() - __inet_lookup_established() Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240802134029.3748005-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index b6672ff61407..4acd1da4dac6 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1616,7 +1616,7 @@ extern struct static_key_false bpf_sk_lookup_enabled; _all_pass || _selected_sk ? SK_PASS : SK_DROP; \ }) -static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, +static inline bool bpf_sk_lookup_run_v4(const struct net *net, int protocol, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 dport, const int ifindex, struct sock **psk) -- cgit v1.2.3 From 10b2a44ccb0cdf85480b6f73a23530aa3ac722de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Aug 2024 13:40:28 +0000 Subject: inet6: constify 'struct net' parameter of various lookup helpers Following helpers do not touch their struct net argument: - bpf_sk_lookup_run_v6() - __inet6_lookup_established() - inet6_lookup_reuseport() - inet6_lookup_listener() - inet6_lookup_run_sk_lookup() - __inet6_lookup() - inet6_lookup() Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240802134029.3748005-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 4acd1da4dac6..64e1506fefb8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1653,7 +1653,7 @@ static inline bool bpf_sk_lookup_run_v4(const struct net *net, int protocol, } #if IS_ENABLED(CONFIG_IPV6) -static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, +static inline bool bpf_sk_lookup_run_v6(const struct net *net, int protocol, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, -- cgit v1.2.3 From 7e45c1e9edc0004b914e2d5a33245908a06f072c Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Tue, 30 Jul 2024 16:40:52 +0300 Subject: net/mlx5: Add support for MTPTM and MTCTR registers Make Management Precision Time Measurement (MTPTM) register and Management Cross Timestamp (MTCTR) register usable in mlx5 driver. Signed-off-by: Rahul Rameshbabu Signed-off-by: Tariq Toukan Reviewed-by: Wojciech Drewek Tested-by: Vadim Fedorenko Link: https://patch.msgid.link/20240730134055.1835261-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/device.h | 7 ++++++- include/linux/mlx5/driver.h | 2 ++ include/linux/mlx5/mlx5_ifc.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ba875a619b97..a94bc9e3af96 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1243,7 +1243,8 @@ enum mlx5_pcam_feature_groups { enum mlx5_mcam_reg_groups { MLX5_MCAM_REGS_FIRST_128 = 0x0, MLX5_MCAM_REGS_0x9100_0x917F = 0x2, - MLX5_MCAM_REGS_NUM = 0x3, + MLX5_MCAM_REGS_0x9180_0x91FF = 0x3, + MLX5_MCAM_REGS_NUM = 0x4, }; enum mlx5_mcam_feature_groups { @@ -1392,6 +1393,10 @@ enum mlx5_qcam_feature_groups { MLX5_GET(mcam_reg, (mdev)->caps.mcam[MLX5_MCAM_REGS_0x9100_0x917F], \ mng_access_reg_cap_mask.access_regs2.reg) +#define MLX5_CAP_MCAM_REG3(mdev, reg) \ + MLX5_GET(mcam_reg, (mdev)->caps.mcam[MLX5_MCAM_REGS_0x9180_0x91FF], \ + mng_access_reg_cap_mask.access_regs3.reg) + #define MLX5_CAP_MCAM_FEATURE(mdev, fld) \ MLX5_GET(mcam_reg, (mdev)->caps.mcam, mng_feature_cap_mask.enhanced_features.fld) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a96438ded15f..9f42834f57c5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -159,6 +159,8 @@ enum { MLX5_REG_MSECQ = 0x9155, MLX5_REG_MSEES = 0x9156, MLX5_REG_MIRC = 0x9162, + MLX5_REG_MTPTM = 0x9180, + MLX5_REG_MTCTR = 0x9181, MLX5_REG_SBCAM = 0xB01F, MLX5_REG_RESOURCE_DUMP = 0xC000, MLX5_REG_DTOR = 0xC00E, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cab228cf51c6..234ad6f16e92 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10401,6 +10401,18 @@ struct mlx5_ifc_mcam_access_reg_bits2 { u8 regs_31_to_0[0x20]; }; +struct mlx5_ifc_mcam_access_reg_bits3 { + u8 regs_127_to_96[0x20]; + + u8 regs_95_to_64[0x20]; + + u8 regs_63_to_32[0x20]; + + u8 regs_31_to_2[0x1e]; + u8 mtctr[0x1]; + u8 mtptm[0x1]; +}; + struct mlx5_ifc_mcam_reg_bits { u8 reserved_at_0[0x8]; u8 feature_group[0x8]; @@ -10413,6 +10425,7 @@ struct mlx5_ifc_mcam_reg_bits { struct mlx5_ifc_mcam_access_reg_bits access_regs; struct mlx5_ifc_mcam_access_reg_bits1 access_regs1; struct mlx5_ifc_mcam_access_reg_bits2 access_regs2; + struct mlx5_ifc_mcam_access_reg_bits3 access_regs3; u8 reserved_at_0[0x80]; } mng_access_reg_cap_mask; @@ -11166,6 +11179,34 @@ struct mlx5_ifc_mtmp_reg_bits { u8 sensor_name_lo[0x20]; }; +struct mlx5_ifc_mtptm_reg_bits { + u8 reserved_at_0[0x10]; + u8 psta[0x1]; + u8 reserved_at_11[0xf]; + + u8 reserved_at_20[0x60]; +}; + +enum { + MLX5_MTCTR_REQUEST_NOP = 0x0, + MLX5_MTCTR_REQUEST_PTM_ROOT_CLOCK = 0x1, + MLX5_MTCTR_REQUEST_FREE_RUNNING_COUNTER = 0x2, + MLX5_MTCTR_REQUEST_REAL_TIME_CLOCK = 0x3, +}; + +struct mlx5_ifc_mtctr_reg_bits { + u8 first_clock_timestamp_request[0x8]; + u8 second_clock_timestamp_request[0x8]; + u8 reserved_at_10[0x10]; + + u8 first_clock_valid[0x1]; + u8 second_clock_valid[0x1]; + u8 reserved_at_22[0x1e]; + + u8 first_clock_timestamp[0x40]; + u8 second_clock_timestamp[0x40]; +}; + union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_bufferx_reg_bits bufferx_reg; struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; @@ -11230,6 +11271,8 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mrtc_reg_bits mrtc_reg; struct mlx5_ifc_mtcap_reg_bits mtcap_reg; struct mlx5_ifc_mtmp_reg_bits mtmp_reg; + struct mlx5_ifc_mtptm_reg_bits mtptm_reg; + struct mlx5_ifc_mtctr_reg_bits mtctr_reg; u8 reserved_at_0[0x60e0]; }; -- cgit v1.2.3 From c114e9948c2b6a0b400266e59cc656b59e795bca Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Thu, 18 Jul 2024 11:27:24 -0700 Subject: coredump: Standartize and fix logging The coredump code does not log the process ID and the comm consistently, logs unescaped comm when it does log it, and does not always use the ratelimited logging. That makes it harder to analyze logs and puts the system at the risk of spamming the system log incase something crashes many times over and over again. Fix that by logging TGID and comm (escaped) consistently and using the ratelimited logging always. Signed-off-by: Roman Kisel Tested-by: Allen Pais Link: https://lore.kernel.org/r/20240718182743.1959160-2-romank@linux.microsoft.com Signed-off-by: Kees Cook --- include/linux/coredump.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 0904ba010341..45e598fe3476 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -43,8 +43,30 @@ extern int dump_align(struct coredump_params *cprm, int align); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); extern void do_coredump(const kernel_siginfo_t *siginfo); + +/* + * Logging for the coredump code, ratelimited. + * The TGID and comm fields are added to the message. + */ + +#define __COREDUMP_PRINTK(Level, Format, ...) \ + do { \ + char comm[TASK_COMM_LEN]; \ + \ + get_task_comm(comm, current); \ + printk_ratelimited(Level "coredump: %d(%*pE): " Format "\n", \ + task_tgid_vnr(current), (int)strlen(comm), comm, ##__VA_ARGS__); \ + } while (0) \ + +#define coredump_report(fmt, ...) __COREDUMP_PRINTK(KERN_INFO, fmt, ##__VA_ARGS__) +#define coredump_report_failure(fmt, ...) __COREDUMP_PRINTK(KERN_WARNING, fmt, ##__VA_ARGS__) + #else static inline void do_coredump(const kernel_siginfo_t *siginfo) {} + +#define coredump_report(...) +#define coredump_report_failure(...) + #endif #if defined(CONFIG_COREDUMP) && defined(CONFIG_SYSCTL) -- cgit v1.2.3 From fb97d2eb542faf19a8725afbd75cbc2518903210 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Thu, 18 Jul 2024 11:27:25 -0700 Subject: binfmt_elf, coredump: Log the reason of the failed core dumps Missing, failed, or corrupted core dumps might impede crash investigations. To improve reliability of that process and consequently the programs themselves, one needs to trace the path from producing a core dumpfile to analyzing it. That path starts from the core dump file written to the disk by the kernel or to the standard input of a user mode helper program to which the kernel streams the coredump contents. There are cases where the kernel will interrupt writing the core out or produce a truncated/not-well-formed core dump without leaving a note. Add logging for the core dump collection failure paths to be able to reason what has gone wrong when the core dump is malformed or missing. Report the size of the data written to aid in diagnosing the user mode helper. Signed-off-by: Roman Kisel Link: https://lore.kernel.org/r/20240718182743.1959160-3-romank@linux.microsoft.com Signed-off-by: Kees Cook --- include/linux/coredump.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 45e598fe3476..edeb8532ce0f 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -42,7 +42,7 @@ extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); -extern void do_coredump(const kernel_siginfo_t *siginfo); +extern int do_coredump(const kernel_siginfo_t *siginfo); /* * Logging for the coredump code, ratelimited. @@ -62,7 +62,11 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); #define coredump_report_failure(fmt, ...) __COREDUMP_PRINTK(KERN_WARNING, fmt, ##__VA_ARGS__) #else -static inline void do_coredump(const kernel_siginfo_t *siginfo) {} +static inline int do_coredump(const kernel_siginfo_t *siginfo) +{ + /* Coredump support is not available, can't fail. */ + return 0; +} #define coredump_report(...) #define coredump_report_failure(...) -- cgit v1.2.3 From 7424fc6b86c8980a87169e005f5cd4438d18efe6 Mon Sep 17 00:00:00 2001 From: Gatlin Newhouse Date: Wed, 24 Jul 2024 00:01:55 +0000 Subject: x86/traps: Enable UBSAN traps on x86 Currently ARM64 extracts which specific sanitizer has caused a trap via encoded data in the trap instruction. Clang on x86 currently encodes the same data in the UD1 instruction but x86 handle_bug() and is_valid_bugaddr() currently only look at UD2. Bring x86 to parity with ARM64, similar to commit 25b84002afb9 ("arm64: Support Clang UBSAN trap codes for better reporting"). See the llvm links for information about the code generation. Enable the reporting of UBSAN sanitizer details on x86 compiled with clang when CONFIG_UBSAN_TRAP=y by analysing UD1 and retrieving the type immediate which is encoded by the compiler after the UD1. [ tglx: Simplified it by moving the printk() into handle_bug() ] Signed-off-by: Gatlin Newhouse Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Kees Cook Link: https://lore.kernel.org/all/20240724000206.451425-1-gatlin.newhouse@gmail.com Link: https://github.com/llvm/llvm-project/commit/c5978f42ec8e9#diff-bb68d7cd885f41cfc35843998b0f9f534adb60b415f647109e597ce448e92d9f Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/X86/X86InstrSystem.td#L27 --- include/linux/ubsan.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ubsan.h b/include/linux/ubsan.h index bff7445498de..d8219cbe09ff 100644 --- a/include/linux/ubsan.h +++ b/include/linux/ubsan.h @@ -4,6 +4,11 @@ #ifdef CONFIG_UBSAN_TRAP const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type); +#else +static inline const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) +{ + return NULL; +} #endif #endif -- cgit v1.2.3 From 97edbc02b2efdb0cd0f507b6a45fc509acc861bb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 14:51:11 -0400 Subject: buffer: Convert block_write_end() to take a folio All callers now have a folio, so pass it in instead of converting from a folio to a page and back to a folio again. Saves a call to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 14acf1bbe0ce..3a3fec154536 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -262,8 +262,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(struct file *, struct address_space *, - loff_t, unsigned, unsigned, - struct page *, void *); + loff_t, unsigned len, unsigned copied, + struct folio *, void *); int generic_write_end(struct file *, struct address_space *, loff_t, unsigned, unsigned, struct page *, void *); -- cgit v1.2.3 From a225800f322a3d6cc8b8b6c7dc4d5281f2f5375b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 15:45:32 -0400 Subject: fs: Convert aops->write_end to take a folio Most callers have a folio, and most implementations operate on a folio, so remove the conversion from folio->page->folio to fit through this interface. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 4 ++-- include/linux/fs.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3a3fec154536..165e859664a5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -265,8 +265,8 @@ int block_write_end(struct file *, struct address_space *, loff_t, unsigned len, unsigned copied, struct folio *, void *); int generic_write_end(struct file *, struct address_space *, - loff_t, unsigned, unsigned, - struct page *, void *); + loff_t, unsigned len, unsigned copied, + struct folio *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..1af3373c7cd2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -411,7 +411,7 @@ struct address_space_operations { struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + struct folio *folio, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); -- cgit v1.2.3 From 1da86618bdce301d23e89ecce92161f9d3b3c5e7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 15 Jul 2024 14:24:01 -0400 Subject: fs: Convert aops->write_begin to take a folio Convert all callers from working on a page to working on one page of a folio (support for working on an entire folio can come later). Removes a lot of folio->page->folio conversions. Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 4 ++-- include/linux/fs.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 165e859664a5..254563a2a9de 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -258,7 +258,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, get_block_t *get_block); + struct folio **foliop, get_block_t *get_block); int __block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(struct file *, struct address_space *, @@ -269,7 +269,7 @@ int generic_write_end(struct file *, struct address_space *, struct folio *, void *); void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to); int cont_write_begin(struct file *, struct address_space *, loff_t, - unsigned, struct page **, void **, + unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); void block_commit_write(struct page *page, unsigned int from, unsigned int to); diff --git a/include/linux/fs.h b/include/linux/fs.h index 1af3373c7cd2..7a79b88a8f5a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -408,7 +408,7 @@ struct address_space_operations { int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); + struct folio **foliop, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); @@ -3331,7 +3331,7 @@ extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata); + struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); -- cgit v1.2.3 From 9f04609f74ec7a439e1ac42da5db9e6ddf4f7b13 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Jul 2024 23:09:04 -0400 Subject: buffer: Convert __block_write_begin() to take a folio Almost all callers have a folio now, so change __block_write_begin() to take a folio and remove a call to compound_head(). Reviewed-by: Josef Bacik Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 254563a2a9de..c85e65fd4cdc 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -259,7 +259,7 @@ int block_read_full_folio(struct folio *, get_block_t *); bool block_is_partially_uptodate(struct folio *, size_t from, size_t count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, get_block_t *get_block); -int __block_write_begin(struct page *page, loff_t pos, unsigned len, +int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); int block_write_end(struct file *, struct address_space *, loff_t, unsigned len, unsigned copied, -- cgit v1.2.3 From 07a83512fa5be3f3d46369eee6352102fd9fb505 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 10 Jul 2024 11:36:08 +0100 Subject: usb: typec: tcpci: fix a comment typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit autonously -> autonomously Signed-off-by: André Draszik Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-1-0ec1f41f4263@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 0ab39b6ea205..d27fe0c22a8b 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -190,7 +190,7 @@ struct tcpci; * Optional; Callback to perform chip specific operations when FRS * is sourcing vbus. * @auto_discharge_disconnect: - * Optional; Enables TCPC to autonously discharge vbus on disconnect. + * Optional; Enables TCPC to autonomously discharge vbus on disconnect. * @vbus_vsafe0v: * optional; Set when TCPC can detect whether vbus is at VSAFE0V. * @set_partner_usb_comm_capable: -- cgit v1.2.3 From f523aa6d72598bcd2946c7e02d29f33f94806a15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 10 Jul 2024 11:36:10 +0100 Subject: usb: typec: tcpci: use GENMASK() for TCPC_CC_STATUS_CC[12] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing code here, particularly in maxim_contaminant.c, is arguably quite hard to read due to the open-coded masking and shifting spanning multiple lines. Use GENMASK() and FIELD_GET() instead, which arguably make the code much easier to follow. While at it, use the symbolic name TCPC_CC_STATE_SRC_OPEN for one instance of open-coded 0x0. Signed-off-by: André Draszik Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-3-0ec1f41f4263@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index d27fe0c22a8b..31d21ccf662b 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -92,11 +92,9 @@ #define TCPC_CC_STATUS_TERM BIT(4) #define TCPC_CC_STATUS_TERM_RP 0 #define TCPC_CC_STATUS_TERM_RD 1 +#define TCPC_CC_STATUS_CC2 GENMASK(3, 2) +#define TCPC_CC_STATUS_CC1 GENMASK(1, 0) #define TCPC_CC_STATE_SRC_OPEN 0 -#define TCPC_CC_STATUS_CC2_SHIFT 2 -#define TCPC_CC_STATUS_CC2_MASK 0x3 -#define TCPC_CC_STATUS_CC1_SHIFT 0 -#define TCPC_CC_STATUS_CC1_MASK 0x3 #define TCPC_POWER_STATUS 0x1e #define TCPC_POWER_STATUS_DBG_ACC_CON BIT(7) @@ -256,7 +254,7 @@ static inline enum typec_cc_status tcpci_to_typec_cc(unsigned int cc, bool sink) if (sink) return TYPEC_CC_RP_3_0; fallthrough; - case 0x0: + case TCPC_CC_STATE_SRC_OPEN: default: return TYPEC_CC_OPEN; } -- cgit v1.2.3 From 83b254c13ac093810e1058d9cb1bbb4b7ef9c246 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 10 Jul 2024 11:36:11 +0100 Subject: usb: typec: tcpci: use GENMASK() for TCPC_ROLE_CTRL_CC[12] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All this open-coded shifting and masking is quite hard to read, in particular the if-statement in tcpci_apply_rc(). Declare TCPC_ROLE_CTRL_CC[12] using GENMASK() which allows using FIELD_GET() and FIELD_PREP() to arguably make the code more legible. Signed-off-by: André Draszik Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-4-0ec1f41f4263@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 31d21ccf662b..552d074429f0 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -68,10 +68,8 @@ #define TCPC_ROLE_CTRL_RP_VAL_DEF 0x0 #define TCPC_ROLE_CTRL_RP_VAL_1_5 0x1 #define TCPC_ROLE_CTRL_RP_VAL_3_0 0x2 -#define TCPC_ROLE_CTRL_CC2_SHIFT 2 -#define TCPC_ROLE_CTRL_CC2_MASK 0x3 -#define TCPC_ROLE_CTRL_CC1_SHIFT 0 -#define TCPC_ROLE_CTRL_CC1_MASK 0x3 +#define TCPC_ROLE_CTRL_CC2 GENMASK(3, 2) +#define TCPC_ROLE_CTRL_CC1 GENMASK(1, 0) #define TCPC_ROLE_CTRL_CC_RA 0x0 #define TCPC_ROLE_CTRL_CC_RP 0x1 #define TCPC_ROLE_CTRL_CC_RD 0x2 @@ -176,8 +174,7 @@ #define tcpc_presenting_rd(reg, cc) \ (!(TCPC_ROLE_CTRL_DRP & (reg)) && \ - (((reg) & (TCPC_ROLE_CTRL_## cc ##_MASK << TCPC_ROLE_CTRL_## cc ##_SHIFT)) == \ - (TCPC_ROLE_CTRL_CC_RD << TCPC_ROLE_CTRL_## cc ##_SHIFT))) + FIELD_GET(TCPC_ROLE_CTRL_## cc, reg) == TCPC_ROLE_CTRL_CC_RD) struct tcpci; -- cgit v1.2.3 From 46b1e0f87ba89f0a06ffbde5fe8d13c5af93f94e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 10 Jul 2024 11:36:12 +0100 Subject: usb: typec: tcpci: use GENMASK() for TCPC_ROLE_CTRL_RP_VAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align the last remaining field TCPC_ROLE_CTRL_RP_VAL with the other fields in the TCPC_ROLE_CTRL register by using GENMASK() and FIELD_PREP(). Signed-off-by: André Draszik Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-5-0ec1f41f4263@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 552d074429f0..80652d4f722e 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -63,8 +63,7 @@ #define TCPC_ROLE_CTRL 0x1a #define TCPC_ROLE_CTRL_DRP BIT(6) -#define TCPC_ROLE_CTRL_RP_VAL_SHIFT 4 -#define TCPC_ROLE_CTRL_RP_VAL_MASK 0x3 +#define TCPC_ROLE_CTRL_RP_VAL GENMASK(5, 4) #define TCPC_ROLE_CTRL_RP_VAL_DEF 0x0 #define TCPC_ROLE_CTRL_RP_VAL_1_5 0x1 #define TCPC_ROLE_CTRL_RP_VAL_3_0 0x2 -- cgit v1.2.3 From aee4568f42e0d7526207a10944bb89bb45522b59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 10 Jul 2024 11:36:13 +0100 Subject: usb: typec: tcpci: use GENMASK() for TCPC_MSG_HDR_INFO_REV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert field TCPC_MSG_HDR_INFO_REV from register TCPC_MSG_HDR_INFO to using GENMASK() and FIELD_PREP() so as to keep using a similar approach for all fields. Signed-off-by: André Draszik Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-6-0ec1f41f4263@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 80652d4f722e..3cd61e9f73b3 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -129,9 +129,8 @@ #define TCPC_MSG_HDR_INFO 0x2e #define TCPC_MSG_HDR_INFO_DATA_ROLE BIT(3) +#define TCPC_MSG_HDR_INFO_REV GENMASK(2, 1) #define TCPC_MSG_HDR_INFO_PWR_ROLE BIT(0) -#define TCPC_MSG_HDR_INFO_REV_SHIFT 1 -#define TCPC_MSG_HDR_INFO_REV_MASK 0x3 #define TCPC_RX_DETECT 0x2f #define TCPC_RX_DETECT_HARD_RESET BIT(5) -- cgit v1.2.3 From 7cd41974d2c81055f61ba9f69e31c02e866716e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 10 Jul 2024 11:36:14 +0100 Subject: usb: typec: tcpci: use GENMASK() for TCPC_TRANSMIT register fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert all fields from register TCPC_TRANSMIT to using GENMASK() and FIELD_PREP() so as to keep using a similar approach throughout the code base and make it arguably easier to read. Signed-off-by: André Draszik Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20240710-tcpc-cleanup-v1-7-0ec1f41f4263@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/tcpci.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/tcpci.h b/include/linux/usb/tcpci.h index 3cd61e9f73b3..f7f5cfbdef12 100644 --- a/include/linux/usb/tcpci.h +++ b/include/linux/usb/tcpci.h @@ -148,10 +148,8 @@ #define TCPC_RX_DATA 0x34 /* through 0x4f */ #define TCPC_TRANSMIT 0x50 -#define TCPC_TRANSMIT_RETRY_SHIFT 4 -#define TCPC_TRANSMIT_RETRY_MASK 0x3 -#define TCPC_TRANSMIT_TYPE_SHIFT 0 -#define TCPC_TRANSMIT_TYPE_MASK 0x7 +#define TCPC_TRANSMIT_RETRY GENMASK(5, 4) +#define TCPC_TRANSMIT_TYPE GENMASK(2, 0) #define TCPC_TX_BYTE_CNT 0x51 #define TCPC_TX_HDR 0x52 -- cgit v1.2.3 From 08d6fd691bdd5d6e7881ce42ca6774ed92b10896 Mon Sep 17 00:00:00 2001 From: Akash Kumar Date: Thu, 1 Aug 2024 11:00:03 +0530 Subject: usb: gadget: Increase max configuration interface to 32 Currently, max configuration interfaces are limited to 16, which fails for compositions containing 10 UVC configurations with interrupt ep disabled along with other configurations , and we see bind failures while allocating interface ID in uvc bind. Increase max configuration interface to 32 to support any large compositions limited to same size as usb device endpoints as interfaces cannot be more than endpoints. Signed-off-by: Akash Kumar Link: https://lore.kernel.org/r/20240801053003.15153-1-quic_akakum@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/composite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index af3cd2aae4bc..6e38fb9d2117 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -256,7 +256,7 @@ int config_ep_by_speed(struct usb_gadget *g, struct usb_function *f, struct usb_ep *_ep); int usb_func_wakeup(struct usb_function *func); -#define MAX_CONFIG_INTERFACES 16 /* arbitrary; max 255 */ +#define MAX_CONFIG_INTERFACES 32 /** * struct usb_configuration - represents one gadget configuration -- cgit v1.2.3 From 259b46204885431f2853afa2a2bffa63f3705e67 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 5 Aug 2024 12:20:37 +0200 Subject: serial: remove quot_frac from serial8250_do_set_divisor() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit quot_frac is unused in serial8250_do_set_divisor() since commit b2b4b8ed3c06 (serial: 8250_exar: Move custom divisor support out from 8250_port). So no point to pass it. Signed-off-by: Jiri Slaby (SUSE) Cc: Ilpo Järvinen Cc: Andy Shevchenko Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240805102046.307511-5-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_8250.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index fd59ed2cca53..e0717c8393d7 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -193,7 +193,7 @@ void serial8250_do_pm(struct uart_port *port, unsigned int state, unsigned int oldstate); void serial8250_do_set_mctrl(struct uart_port *port, unsigned int mctrl); void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud, - unsigned int quot, unsigned int quot_frac); + unsigned int quot); int fsl8250_handle_irq(struct uart_port *port); int serial8250_handle_irq(struct uart_port *port, unsigned int iir); u16 serial8250_rx_chars(struct uart_8250_port *up, u16 lsr); -- cgit v1.2.3 From 130fd056dd82b02db9a661c013071af35309be1a Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Mon, 10 Jun 2024 20:20:16 +0100 Subject: sched/rt: Clean up usage of rt_task() rt_task() checks if a task has RT priority. But depends on your dictionary, this could mean it belongs to RT class, or is a 'realtime' task, which includes RT and DL classes. Since this has caused some confusion already on discussion [1], it seemed a clean up is due. I define the usage of rt_task() to be tasks that belong to RT class. Make sure that it returns true only for RT class and audit the users and replace the ones required the old behavior with the new realtime_task() which returns true for RT and DL classes. Introduce similar realtime_prio() to create similar distinction to rt_prio() and update the users that required the old behavior to use the new function. Move MAX_DL_PRIO to prio.h so it can be used in the new definitions. Document the functions to make it more obvious what is the difference between them. PI-boosted tasks is a factor that must be taken into account when choosing which function to use. Rename task_is_realtime() to realtime_task_policy() as the old name is confusing against the new realtime_task(). No functional changes were intended. [1] https://lore.kernel.org/lkml/20240506100509.GL40213@noisy.programming.kicks-ass.net/ Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: "Steven Rostedt (Google)" Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20240610192018.1567075-2-qyousef@layalina.io --- include/linux/ioprio.h | 2 +- include/linux/sched/deadline.h | 6 ++++-- include/linux/sched/prio.h | 1 + include/linux/sched/rt.h | 27 ++++++++++++++++++++++++++- 4 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index db1249cd9692..75859b78d540 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task) { if (task->policy == SCHED_IDLE) return IOPRIO_CLASS_IDLE; - else if (task_is_realtime(task)) + else if (realtime_task_policy(task)) return IOPRIO_CLASS_RT; else return IOPRIO_CLASS_BE; diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index df3aca89d4f5..5cb88b748ad6 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -10,8 +10,6 @@ #include -#define MAX_DL_PRIO 0 - static inline int dl_prio(int prio) { if (unlikely(prio < MAX_DL_PRIO)) @@ -19,6 +17,10 @@ static inline int dl_prio(int prio) return 0; } +/* + * Returns true if a task has a priority that belongs to DL class. PI-boosted + * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. + */ static inline int dl_task(struct task_struct *p) { return dl_prio(p->prio); diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index ab83d85e1183..6ab43b4f72f9 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -14,6 +14,7 @@ */ #define MAX_RT_PRIO 100 +#define MAX_DL_PRIO 0 #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index b2b9e6eb9683..a055dd68a77c 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -7,18 +7,43 @@ struct task_struct; static inline int rt_prio(int prio) +{ + if (unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO)) + return 1; + return 0; +} + +static inline int realtime_prio(int prio) { if (unlikely(prio < MAX_RT_PRIO)) return 1; return 0; } +/* + * Returns true if a task has a priority that belongs to RT class. PI-boosted + * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. + */ static inline int rt_task(struct task_struct *p) { return rt_prio(p->prio); } -static inline bool task_is_realtime(struct task_struct *tsk) +/* + * Returns true if a task has a priority that belongs to RT or DL classes. + * PI-boosted tasks will return true. Use realtime_task_policy() to ignore + * PI-boosted tasks. + */ +static inline int realtime_task(struct task_struct *p) +{ + return realtime_prio(p->prio); +} + +/* + * Returns true if a task has a policy that belongs to RT or DL classes. + * PI-boosted tasks will return false. + */ +static inline bool realtime_task_policy(struct task_struct *tsk) { int policy = tsk->policy; -- cgit v1.2.3 From b166af3db70fdcecf125662a2360471bb20be203 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Mon, 10 Jun 2024 20:20:17 +0100 Subject: sched/rt, dl: Convert functions to return bool {rt, realtime, dl}_{task, prio}() functions' return value is actually a bool. Convert their return type to reflect that. Suggested-by: "Steven Rostedt (Google)" Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: "Steven Rostedt (Google)" Reviewed-by: Metin Kaya Link: https://lore.kernel.org/r/20240610192018.1567075-3-qyousef@layalina.io --- include/linux/sched/deadline.h | 8 +++----- include/linux/sched/rt.h | 16 ++++++---------- 2 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 5cb88b748ad6..3a912ab42bb5 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -10,18 +10,16 @@ #include -static inline int dl_prio(int prio) +static inline bool dl_prio(int prio) { - if (unlikely(prio < MAX_DL_PRIO)) - return 1; - return 0; + return unlikely(prio < MAX_DL_PRIO); } /* * Returns true if a task has a priority that belongs to DL class. PI-boosted * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. */ -static inline int dl_task(struct task_struct *p) +static inline bool dl_task(struct task_struct *p) { return dl_prio(p->prio); } diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index a055dd68a77c..91ef1ef2019f 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -6,25 +6,21 @@ struct task_struct; -static inline int rt_prio(int prio) +static inline bool rt_prio(int prio) { - if (unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO)) - return 1; - return 0; + return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); } -static inline int realtime_prio(int prio) +static inline bool realtime_prio(int prio) { - if (unlikely(prio < MAX_RT_PRIO)) - return 1; - return 0; + return unlikely(prio < MAX_RT_PRIO); } /* * Returns true if a task has a priority that belongs to RT class. PI-boosted * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. */ -static inline int rt_task(struct task_struct *p) +static inline bool rt_task(struct task_struct *p) { return rt_prio(p->prio); } @@ -34,7 +30,7 @@ static inline int rt_task(struct task_struct *p) * PI-boosted tasks will return true. Use realtime_task_policy() to ignore * PI-boosted tasks. */ -static inline int realtime_task(struct task_struct *p) +static inline bool realtime_task(struct task_struct *p) { return realtime_prio(p->prio); } -- cgit v1.2.3 From ae04f69de0bef93c7086cf2983dbc8e8fd624ebe Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Mon, 10 Jun 2024 20:20:18 +0100 Subject: sched/rt: Rename realtime_{prio, task}() to rt_or_dl_{prio, task}() Some find the name realtime overloaded. Use rt_or_dl() as an alternative, hopefully better, name. Suggested-by: Daniel Bristot de Oliveira Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240610192018.1567075-4-qyousef@layalina.io --- include/linux/ioprio.h | 2 +- include/linux/sched/rt.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 75859b78d540..b25377b6ea98 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task) { if (task->policy == SCHED_IDLE) return IOPRIO_CLASS_IDLE; - else if (realtime_task_policy(task)) + else if (rt_or_dl_task_policy(task)) return IOPRIO_CLASS_RT; else return IOPRIO_CLASS_BE; diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index 91ef1ef2019f..4e3338103654 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -11,7 +11,7 @@ static inline bool rt_prio(int prio) return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); } -static inline bool realtime_prio(int prio) +static inline bool rt_or_dl_prio(int prio) { return unlikely(prio < MAX_RT_PRIO); } @@ -27,19 +27,19 @@ static inline bool rt_task(struct task_struct *p) /* * Returns true if a task has a priority that belongs to RT or DL classes. - * PI-boosted tasks will return true. Use realtime_task_policy() to ignore + * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore * PI-boosted tasks. */ -static inline bool realtime_task(struct task_struct *p) +static inline bool rt_or_dl_task(struct task_struct *p) { - return realtime_prio(p->prio); + return rt_or_dl_prio(p->prio); } /* * Returns true if a task has a policy that belongs to RT or DL classes. * PI-boosted tasks will return false. */ -static inline bool realtime_task_policy(struct task_struct *tsk) +static inline bool rt_or_dl_task_policy(struct task_struct *tsk) { int policy = tsk->policy; -- cgit v1.2.3 From c772a2c690182410642ead740f7a84b3a7544b2b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:10 +0300 Subject: net/mlx5: Add IFC related stuff for data direct Add IFC related stuff for data direct. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/82da7f578a567909bb5858a64ba844fe4cc298fa.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 51 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cab228cf51c6..970c9d8473ef 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -313,6 +313,7 @@ enum { MLX5_CMD_OP_MODIFY_VHCA_STATE = 0xb0e, MLX5_CMD_OP_SYNC_CRYPTO = 0xb12, MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS = 0xb16, + MLX5_CMD_OPCODE_QUERY_VUID = 0xb22, MLX5_CMD_OP_MAX }; @@ -1885,7 +1886,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_5a0[0x10]; u8 enhanced_cqe_compression[0x1]; - u8 reserved_at_5b1[0x2]; + u8 reserved_at_5b1[0x1]; + u8 crossing_vhca_mkey[0x1]; u8 log_max_dek[0x5]; u8 reserved_at_5b8[0x4]; u8 mini_cqe_resp_stride_index[0x1]; @@ -1954,7 +1956,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 dynamic_msix_table_size[0xc]; u8 reserved_at_740[0xc]; u8 min_dynamic_vf_msix_table_size[0x4]; - u8 reserved_at_750[0x4]; + u8 reserved_at_750[0x2]; + u8 data_direct[0x1]; + u8 reserved_at_753[0x1]; u8 max_dynamic_vf_msix_table_size[0xc]; u8 reserved_at_760[0x3]; @@ -1982,7 +1986,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_0[0x80]; u8 migratable[0x1]; - u8 reserved_at_81[0x1f]; + u8 reserved_at_81[0x11]; + u8 query_vuid[0x1]; + u8 reserved_at_93[0xd]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; @@ -4154,6 +4160,7 @@ enum { MLX5_MKC_ACCESS_MODE_KSM = 0x3, MLX5_MKC_ACCESS_MODE_SW_ICM = 0x4, MLX5_MKC_ACCESS_MODE_MEMIC = 0x5, + MLX5_MKC_ACCESS_MODE_CROSSING = 0x6, }; struct mlx5_ifc_mkc_bits { @@ -4196,7 +4203,10 @@ struct mlx5_ifc_mkc_bits { u8 bsf_octword_size[0x20]; - u8 reserved_at_120[0x80]; + u8 reserved_at_120[0x60]; + + u8 crossing_target_vhca_id[0x10]; + u8 reserved_at_190[0x10]; u8 translations_octword_size[0x20]; @@ -5124,6 +5134,36 @@ struct mlx5_ifc_query_vport_state_out_bits { u8 state[0x4]; }; +struct mlx5_ifc_array1024_auto_bits { + u8 array1024_auto[32][0x20]; +}; + +struct mlx5_ifc_query_vuid_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x40]; + + u8 query_vfs_vuid[0x1]; + u8 data_direct[0x1]; + u8 reserved_at_62[0xe]; + u8 vhca_id[0x10]; +}; + +struct mlx5_ifc_query_vuid_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x1a0]; + + u8 reserved_at_1e0[0x10]; + u8 num_of_entries[0x10]; + + struct mlx5_ifc_array1024_auto_bits vuid[]; +}; + enum { MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT = 0x0, MLX5_VPORT_STATE_OP_MOD_ESW_VPORT = 0x1, @@ -8989,7 +9029,8 @@ struct mlx5_ifc_create_mkey_in_bits { u8 pg_access[0x1]; u8 mkey_umem_valid[0x1]; - u8 reserved_at_62[0x1e]; + u8 data_direct[0x1]; + u8 reserved_at_63[0x1d]; struct mlx5_ifc_mkc_bits memory_key_mkey_entry; -- cgit v1.2.3 From a09cdb8f564613769142a60400bb5160864c3269 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 8 Aug 2024 12:41:17 +0200 Subject: genirq: Remove unused irq_chip_generic:: {type,polarity}_cache The type_cache and polarity_cache members of struct irq_chip_generic are unused. Remove them both along with their kernel-doc. Found by https://github.com/jirislaby/clang-struct. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240808104118.430670-2-jirislaby@kernel.org --- include/linux/irq.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 1f5dbf1f92c9..00490d6ead65 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1040,8 +1040,6 @@ struct irq_chip_type { * @irq_base: Interrupt base nr for this chip * @irq_cnt: Number of interrupts handled by this chip * @mask_cache: Cached mask register shared between all chip types - * @type_cache: Cached type register - * @polarity_cache: Cached polarity register * @wake_enabled: Interrupt can wakeup from suspend * @wake_active: Interrupt is marked as an wakeup from suspend source * @num_ct: Number of available irq_chip_type instances (usually 1) @@ -1068,8 +1066,6 @@ struct irq_chip_generic { unsigned int irq_base; unsigned int irq_cnt; u32 mask_cache; - u32 type_cache; - u32 polarity_cache; u32 wake_enabled; u32 wake_active; unsigned int num_ct; -- cgit v1.2.3 From 60029162a0458832ab2bcfc6fd4986bfd9ca0f55 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Thu, 8 Aug 2024 12:41:18 +0200 Subject: genirq: Remove irq_chip_regs:: Polarity The polarity member of struct irq_chip_regs is unused. Remove it along with its kernel-doc. Found by https://github.com/jirislaby/clang-struct. Signed-off-by: Jiri Slaby (SUSE) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240808104118.430670-3-jirislaby@kernel.org --- include/linux/irq.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 00490d6ead65..fa711f80957b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -991,7 +991,6 @@ void irq_init_desc(unsigned int irq); * @ack: Ack register offset to reg_base * @eoi: Eoi register offset to reg_base * @type: Type configuration register offset to reg_base - * @polarity: Polarity configuration register offset to reg_base */ struct irq_chip_regs { unsigned long enable; @@ -1000,7 +999,6 @@ struct irq_chip_regs { unsigned long ack; unsigned long eoi; unsigned long type; - unsigned long polarity; }; /** -- cgit v1.2.3 From 09612576046a3cd29bd2b2b88c5813b1dfe96775 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 7 Aug 2024 14:22:26 +0200 Subject: net: sungem_phy: Constify struct mii_phy_def 'struct mii_phy_def' are not modified in this driver. Constifying these structures moves some data to a read-only section, so increase overall security. While at it fix the checkpatch warning related to this patch (some missing newlines and spaces around *) On a x86_64, with allmodconfig: Before: ====== 27709 928 0 28637 6fdd drivers/net/sungem_phy.o After: ===== text data bss dec hex filename 28157 476 0 28633 6fd9 drivers/net/sungem_phy.o Signed-off-by: Christophe JAILLET Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/54c3b30930f80f4895e6fa2f4234714fdea4ef4e.1723033266.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/linux/sungem_phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sungem_phy.h b/include/linux/sungem_phy.h index c505f30e8b68..eecc7eb63bfb 100644 --- a/include/linux/sungem_phy.h +++ b/include/linux/sungem_phy.h @@ -40,7 +40,7 @@ enum { /* An instance of a PHY, partially borrowed from mii_if_info */ struct mii_phy { - struct mii_phy_def* def; + const struct mii_phy_def *def; u32 advertising; int mii_id; -- cgit v1.2.3 From 5fcf329676cf9d132842d9871f83a091c32f3bfc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Jul 2024 13:41:50 +0200 Subject: fs: add put_mnt_ns() cleanup helper Add a simple helper to put a mount namespace reference. Link: https://lore.kernel.org/r/20240719-work-mount-namespace-v1-3-834113cab0d2@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/mnt_namespace.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 8f882f5881e8..70b366b64816 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -3,6 +3,9 @@ #define _NAMESPACE_H_ #ifdef __KERNEL__ +#include +#include + struct mnt_namespace; struct fs_struct; struct user_namespace; @@ -11,6 +14,7 @@ struct ns_common; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); +DEFINE_FREE(put_mnt_ns, struct mnt_namespace *, if (!IS_ERR_OR_NULL(_T)) put_mnt_ns(_T)) extern struct ns_common *from_mnt_ns(struct mnt_namespace *); extern const struct file_operations proc_mounts_operations; -- cgit v1.2.3 From 257b1c2c78c25643526609dee0c15f1544eb3252 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 19 Jul 2024 13:41:51 +0200 Subject: file: add fput() cleanup helper Add a simple helper to put a file reference. Link: https://lore.kernel.org/r/20240719-work-mount-namespace-v1-4-834113cab0d2@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/file.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 237931f20739..d1e768b06069 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -11,6 +11,7 @@ #include #include #include +#include struct file; @@ -96,6 +97,7 @@ extern void put_unused_fd(unsigned int fd); DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T), get_unused_fd_flags(flags), unsigned flags) +DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T)) /* * take_fd() will take care to set @fd to -EBADF ensuring that -- cgit v1.2.3 From 7ff7509fa52397455e04bd44982e9dfbbd19457f Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Mon, 29 Jul 2024 11:36:26 +0200 Subject: PCI: Make pcim_request_region() a public function pcim_request_region() is the managed counterpart of pci_request_region(). It is currently only used internally for PCI. It can be useful for a number of drivers and exporting it is a step towards deprecating more complicated functions. Make pcim_request_region() a public function. Link: https://lore.kernel.org/r/20240729093625.17561-4-pstanner@redhat.com Signed-off-by: Philipp Stanner Signed-off-by: Bjorn Helgaas Tested-by: Hans de Goede Reviewed-by: Hans de Goede Acked-by: Hans de Goede --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4cf89a4b4cbc..4246cb790c7b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2292,6 +2292,7 @@ static inline void pci_fixup_device(enum pci_fixup_pass pass, void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen); void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr); void __iomem * const *pcim_iomap_table(struct pci_dev *pdev); +int pcim_request_region(struct pci_dev *pdev, int bar, const char *name); int pcim_iomap_regions(struct pci_dev *pdev, int mask, const char *name); int pcim_iomap_regions_request_all(struct pci_dev *pdev, int mask, const char *name); -- cgit v1.2.3 From d140f80f603584d8282817994c0c6241e736bef4 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Wed, 7 Aug 2024 10:30:18 +0200 Subject: PCI: Deprecate pcim_iomap_regions() in favor of pcim_iomap_region() pcim_iomap_regions() is a complicated function that uses a bit mask to determine the BARs the user wishes to request and ioremap. Almost all users only ever set a single bit in that mask, making that mechanism questionable. pcim_iomap_region() is now available as a more simple replacement. Make pcim_iomap_region() a public function. Mark pcim_iomap_regions() as deprecated. Link: https://lore.kernel.org/r/20240807083018.8734-2-pstanner@redhat.com Signed-off-by: Philipp Stanner Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4246cb790c7b..aa9ef7890c50 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2290,6 +2290,8 @@ static inline void pci_fixup_device(enum pci_fixup_pass pass, #endif void __iomem *pcim_iomap(struct pci_dev *pdev, int bar, unsigned long maxlen); +void __iomem *pcim_iomap_region(struct pci_dev *pdev, int bar, + const char *name); void pcim_iounmap(struct pci_dev *pdev, void __iomem *addr); void __iomem * const *pcim_iomap_table(struct pci_dev *pdev); int pcim_request_region(struct pci_dev *pdev, int bar, const char *name); -- cgit v1.2.3 From 70114e7f7585ef078c2b7033ee14218f95f55e22 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 8 Aug 2024 15:34:02 +0300 Subject: irqdomain: Simplify simple and legacy domain creation irq_domain_create_simple() and irq_domain_create_legacy() use __irq_domain_instantiate(), but have extra handling of allocating interrupt descriptors and associating interrupts in them. Some of that is duplicated. There are also call sites which have conditonals to invoke different interrupt domain creator functions, where one of them is usually irq_domain_create_legacy(). Alternatively they associate the interrupts for the legacy case after creating the domain. Moving the extra logic of irq_domain_create_simple()/legacy() into __irq_domain_instantiate() allows to consolidate that. Introduce hwirq_base and virq_base members in the irq_domain_info structure, which allows to transport the required information and add the conditional interrupt descriptor allocation and interrupt association into __irq_domain_instantiate(). This reduces irq_domain_create_legacy() and irq_domain_create_simple() to trivial wrappers which fill in the info structure and allows call sites which must support the legacy case along with more modern mechanism to select the domain type via the parameters of the info struct. [ tglx: Massaged change log ] Suggested-by: Thomas Gleixner Signed-off-by: Matti Vaittinen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/32d07bd79eb2b5416e24da9e9e8fe5955423dcf9.1723120028.git.mazziesaccount@gmail.com --- include/linux/irqdomain.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index de6105f68fec..bfcffa2c7047 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -291,6 +291,9 @@ struct irq_domain_chip_generic_info; * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; * Use ~0 for no limit; 0 for no direct mapping + * @hwirq_base: The first hardware interrupt number (legacy domains only) + * @virq_base: The first Linux interrupt number for legacy domains to + * immediately associate the interrupts after domain creation * @bus_token: Domain bus token * @ops: Domain operation callbacks * @host_data: Controller private data pointer @@ -307,6 +310,8 @@ struct irq_domain_info { unsigned int size; irq_hw_number_t hwirq_max; int direct_max; + unsigned int hwirq_base; + unsigned int virq_base; enum irq_domain_bus_token bus_token; const struct irq_domain_ops *ops; void *host_data; -- cgit v1.2.3 From 1e7c05292531e5b6bebe409cd531ed4ec0b2ff56 Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 8 Aug 2024 22:23:06 +0200 Subject: irqdomain: Allow giving name suffix for domain Devices can provide multiple interrupt lines. One reason for this is that a device has multiple subfunctions, each providing its own interrupt line. Another reason is that a device can be designed to be used (also) on a system where some of the interrupts can be routed to another processor. A line often further acts as a demultiplex for specific interrupts and has it's respective set of interrupt (status, mask, ack, ...) registers. Regmap supports the handling of these registers and demultiplexing interrupts, but the interrupt domain code ends up assigning the same name for the per interrupt line domains. This causes a naming collision in the debugFS code and leads to confusion, as /proc/interrupts shows two separate interrupts with the same domain name and hardware interrupt number. Instead of adding a workaround in regmap or driver code, allow giving a name suffix for the domain name when the domain is created. Add a name_suffix field in the irq_domain_info structure and make irq_domain_instantiate() use this suffix if it is given when a domain is created. [ tglx: Adopt it to the cleanup patch and fixup the invalid NULL return ] Signed-off-by: Matti Vaittinen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/871q2yvk5x.ffs@tglx --- include/linux/irqdomain.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bfcffa2c7047..e432b6a12a32 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -295,6 +295,8 @@ struct irq_domain_chip_generic_info; * @virq_base: The first Linux interrupt number for legacy domains to * immediately associate the interrupts after domain creation * @bus_token: Domain bus token + * @name_suffix: Optional name suffix to avoid collisions when multiple + * domains are added using same fwnode * @ops: Domain operation callbacks * @host_data: Controller private data pointer * @dgc_info: Geneneric chip information structure pointer used to @@ -313,6 +315,7 @@ struct irq_domain_info { unsigned int hwirq_base; unsigned int virq_base; enum irq_domain_bus_token bus_token; + const char *name_suffix; const struct irq_domain_ops *ops; void *host_data; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY -- cgit v1.2.3 From da4fe6815aca25603944a64b0965310512e867d0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Aug 2024 14:09:00 +0800 Subject: Revert "lib/mpi: Introduce ec implementation to MPI library" This reverts commit d58bb7e55a8a65894cc02f27c3e2bf9403e7c40f. It's no longer needed since sm2 has been removed. Signed-off-by: Herbert Xu --- include/linux/mpi.h | 105 ---------------------------------------------------- 1 file changed, 105 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mpi.h b/include/linux/mpi.h index eb0d1c1db208..89b720893e12 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -157,111 +157,6 @@ void mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor); /*-- mpi-inv.c --*/ int mpi_invm(MPI x, MPI a, MPI n); -/*-- ec.c --*/ - -/* Object to represent a point in projective coordinates */ -struct gcry_mpi_point { - MPI x; - MPI y; - MPI z; -}; - -typedef struct gcry_mpi_point *MPI_POINT; - -/* Models describing an elliptic curve */ -enum gcry_mpi_ec_models { - /* The Short Weierstrass equation is - * y^2 = x^3 + ax + b - */ - MPI_EC_WEIERSTRASS = 0, - /* The Montgomery equation is - * by^2 = x^3 + ax^2 + x - */ - MPI_EC_MONTGOMERY, - /* The Twisted Edwards equation is - * ax^2 + y^2 = 1 + bx^2y^2 - * Note that we use 'b' instead of the commonly used 'd'. - */ - MPI_EC_EDWARDS -}; - -/* Dialects used with elliptic curves */ -enum ecc_dialects { - ECC_DIALECT_STANDARD = 0, - ECC_DIALECT_ED25519, - ECC_DIALECT_SAFECURVE -}; - -/* This context is used with all our EC functions. */ -struct mpi_ec_ctx { - enum gcry_mpi_ec_models model; /* The model describing this curve. */ - enum ecc_dialects dialect; /* The ECC dialect used with the curve. */ - int flags; /* Public key flags (not always used). */ - unsigned int nbits; /* Number of bits. */ - - /* Domain parameters. Note that they may not all be set and if set - * the MPIs may be flagged as constant. - */ - MPI p; /* Prime specifying the field GF(p). */ - MPI a; /* First coefficient of the Weierstrass equation. */ - MPI b; /* Second coefficient of the Weierstrass equation. */ - MPI_POINT G; /* Base point (generator). */ - MPI n; /* Order of G. */ - unsigned int h; /* Cofactor. */ - - /* The actual key. May not be set. */ - MPI_POINT Q; /* Public key. */ - MPI d; /* Private key. */ - - const char *name; /* Name of the curve. */ - - /* This structure is private to mpi/ec.c! */ - struct { - struct { - unsigned int a_is_pminus3:1; - unsigned int two_inv_p:1; - } valid; /* Flags to help setting the helper vars below. */ - - int a_is_pminus3; /* True if A = P - 3. */ - - MPI two_inv_p; - - mpi_barrett_t p_barrett; - - /* Scratch variables. */ - MPI scratch[11]; - - /* Helper for fast reduction. */ - /* int nist_nbits; /\* If this is a NIST curve, the # of bits. *\/ */ - /* MPI s[10]; */ - /* MPI c; */ - } t; - - /* Curve specific computation routines for the field. */ - void (*addm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx); - void (*subm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ec); - void (*mulm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx); - void (*pow2)(MPI w, const MPI b, struct mpi_ec_ctx *ctx); - void (*mul2)(MPI w, MPI u, struct mpi_ec_ctx *ctx); -}; - -void mpi_ec_init(struct mpi_ec_ctx *ctx, enum gcry_mpi_ec_models model, - enum ecc_dialects dialect, - int flags, MPI p, MPI a, MPI b); -void mpi_ec_deinit(struct mpi_ec_ctx *ctx); -MPI_POINT mpi_point_new(unsigned int nbits); -void mpi_point_release(MPI_POINT p); -void mpi_point_init(MPI_POINT p); -void mpi_point_free_parts(MPI_POINT p); -int mpi_ec_get_affine(MPI x, MPI y, MPI_POINT point, struct mpi_ec_ctx *ctx); -void mpi_ec_add_points(MPI_POINT result, - MPI_POINT p1, MPI_POINT p2, - struct mpi_ec_ctx *ctx); -void mpi_ec_mul_point(MPI_POINT result, - MPI scalar, MPI_POINT point, - struct mpi_ec_ctx *ctx); -int mpi_ec_curve_point(MPI_POINT point, struct mpi_ec_ctx *ctx); - /* inline functions */ /** -- cgit v1.2.3 From 0737158aabc278526c784f63b8dd8963cdc558a9 Mon Sep 17 00:00:00 2001 From: Olivier Moysan Date: Tue, 30 Jul 2024 10:46:31 +0200 Subject: iio: add read scale and offset services to iio backend framework Add iio_backend_read_scale() and iio_backend_read_offset() services to read channel scale and offset from an IIO backbend device. Also add a read_raw callback which replicates the read_raw callback of the IIO framework, and is intended to request miscellaneous channel attributes from the backend device. Both scale and offset helpers use this callback. Signed-off-by: Olivier Moysan Reviewed-by: Nuno Sa Link: https://patch.msgid.link/20240730084640.1307938-2-olivier.moysan@foss.st.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 2b9d1aa86552..838092b0de3d 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -3,6 +3,7 @@ #define _IIO_BACKEND_H_ #include +#include struct iio_chan_spec; struct fwnode_handle; @@ -85,6 +86,7 @@ enum iio_backend_sample_trigger { * @extend_chan_spec: Extend an IIO channel. * @ext_info_set: Extended info setter. * @ext_info_get: Extended info getter. + * @read_raw: Read a channel attribute from a backend device * @debugfs_print_chan_status: Print channel status into a buffer. * @debugfs_reg_access: Read or write register value of backend. **/ @@ -119,6 +121,9 @@ struct iio_backend_ops { const char *buf, size_t len); int (*ext_info_get)(struct iio_backend *back, uintptr_t private, const struct iio_chan_spec *chan, char *buf); + int (*read_raw)(struct iio_backend *back, + struct iio_chan_spec const *chan, int *val, int *val2, + long mask); int (*debugfs_print_chan_status)(struct iio_backend *back, unsigned int chan, char *buf, size_t len); @@ -162,7 +167,9 @@ ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private, const char *buf, size_t len); ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private, const struct iio_chan_spec *chan, char *buf); - +int iio_backend_read_raw(struct iio_backend *back, + struct iio_chan_spec const *chan, int *val, int *val2, + long mask); int iio_backend_extend_chan_spec(struct iio_backend *back, struct iio_chan_spec *chan); void *iio_backend_get_priv(const struct iio_backend *conv); @@ -174,6 +181,21 @@ __devm_iio_backend_get_from_fwnode_lookup(struct device *dev, int devm_iio_backend_register(struct device *dev, const struct iio_backend_info *info, void *priv); +static inline int iio_backend_read_scale(struct iio_backend *back, + struct iio_chan_spec const *chan, + int *val, int *val2) +{ + return iio_backend_read_raw(back, chan, val, val2, IIO_CHAN_INFO_SCALE); +} + +static inline int iio_backend_read_offset(struct iio_backend *back, + struct iio_chan_spec const *chan, + int *val, int *val2) +{ + return iio_backend_read_raw(back, chan, val, val2, + IIO_CHAN_INFO_OFFSET); +} + ssize_t iio_backend_debugfs_print_chan_status(struct iio_backend *back, unsigned int chan, char *buf, size_t len); -- cgit v1.2.3 From 2530d7d44ca6dc0c1c059f143cdcb2be8600c59a Mon Sep 17 00:00:00 2001 From: Olivier Moysan Date: Tue, 30 Jul 2024 10:46:32 +0200 Subject: iio: add enable and disable services to iio backend framework Add iio_backend_disable() and iio_backend_enable() APIs to allow IIO backend consumer to request backend disabling and enabling. Signed-off-by: Olivier Moysan Reviewed-by: Nuno Sa Link: https://patch.msgid.link/20240730084640.1307938-3-olivier.moysan@foss.st.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 838092b0de3d..b0b663163ff4 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -144,6 +144,8 @@ struct iio_backend_info { int iio_backend_chan_enable(struct iio_backend *back, unsigned int chan); int iio_backend_chan_disable(struct iio_backend *back, unsigned int chan); int devm_iio_backend_enable(struct device *dev, struct iio_backend *back); +int iio_backend_enable(struct iio_backend *back); +void iio_backend_disable(struct iio_backend *back); int iio_backend_data_format_set(struct iio_backend *back, unsigned int chan, const struct iio_backend_data_fmt *data); int iio_backend_data_source_set(struct iio_backend *back, unsigned int chan, -- cgit v1.2.3 From c464cc610f51f6eb5f27bf905d4c1ab1896b2725 Mon Sep 17 00:00:00 2001 From: Olivier Moysan Date: Tue, 30 Jul 2024 10:46:33 +0200 Subject: iio: add child nodes support in iio backend framework Add an API to support IIO generic channels binding: http://devicetree.org/schemas/iio/adc/adc.yaml# This new API is needed, as generic channel DT node isn't populated as a device. Add devm_iio_backend_fwnode_get() to allow an IIO device backend consumer to reference backend phandles in its child nodes. Signed-off-by: Olivier Moysan Reviewed-by: Nuno Sa Link: https://patch.msgid.link/20240730084640.1307938-4-olivier.moysan@foss.st.com Signed-off-by: Jonathan Cameron --- include/linux/iio/backend.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index b0b663163ff4..37d56914d485 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -176,6 +176,9 @@ int iio_backend_extend_chan_spec(struct iio_backend *back, struct iio_chan_spec *chan); void *iio_backend_get_priv(const struct iio_backend *conv); struct iio_backend *devm_iio_backend_get(struct device *dev, const char *name); +struct iio_backend *devm_iio_backend_fwnode_get(struct device *dev, + const char *name, + struct fwnode_handle *fwnode); struct iio_backend * __devm_iio_backend_get_from_fwnode_lookup(struct device *dev, struct fwnode_handle *fwnode); -- cgit v1.2.3 From 2837efdc7cef5b86b0c4d94a7410bc03cc2f003f Mon Sep 17 00:00:00 2001 From: Denis Benato Date: Wed, 7 Aug 2024 20:56:18 +0200 Subject: iio: trigger: allow devices to suspend/resume theirs associated trigger When a machine enters a sleep state while a trigger is associated to an iio device that trigger is not resumed after exiting the sleep state: provide iio device drivers a way to suspend and resume the associated trigger to solve the aforementioned bug. Each iio driver supporting external triggers is expected to call iio_device_suspend_triggering before suspending, and iio_device_resume_triggering upon resuming. Signed-off-by: Denis Benato Link: https://patch.msgid.link/20240807185619.7261-2-benato.denis96@gmail.com Signed-off-by: Jonathan Cameron --- include/linux/iio/iio.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 3a735a9a9eae..18779b631e90 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -810,6 +810,23 @@ static inline struct dentry *iio_get_debugfs_dentry(struct iio_dev *indio_dev) } #endif +/** + * iio_device_suspend_triggering() - suspend trigger attached to an iio_dev + * @indio_dev: iio_dev associated with the device that will have triggers suspended + * + * Return 0 if successful, negative otherwise + **/ +int iio_device_suspend_triggering(struct iio_dev *indio_dev); + +/** + * iio_device_resume_triggering() - resume trigger attached to an iio_dev + * that was previously suspended with iio_device_suspend_triggering() + * @indio_dev: iio_dev associated with the device that will have triggers resumed + * + * Return 0 if successful, negative otherwise + **/ +int iio_device_resume_triggering(struct iio_dev *indio_dev); + #ifdef CONFIG_ACPI bool iio_read_acpi_mount_matrix(struct device *dev, struct iio_mount_matrix *orientation, -- cgit v1.2.3 From bf66471987b4bd537bc800353ca7d39bfd1d1022 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 10:51:10 +0200 Subject: context_tracking, rcu: Rename struct context_tracking .dynticks_nesting into .nesting The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. [ neeraj.upadhyay: Fix htmldocs build error reported by Stephen Rothwell ] Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index ad6570ffeff3..65290e7677e6 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -39,7 +39,7 @@ struct context_tracking { atomic_t state; #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE - long dynticks_nesting; /* Track process nesting level. */ + long nesting; /* Track process nesting level. */ long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ #endif }; @@ -77,14 +77,14 @@ static __always_inline int ct_rcu_watching_cpu_acquire(int cpu) static __always_inline long ct_dynticks_nesting(void) { - return __this_cpu_read(context_tracking.dynticks_nesting); + return __this_cpu_read(context_tracking.nesting); } static __always_inline long ct_dynticks_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); - return ct->dynticks_nesting; + return ct->nesting; } static __always_inline long ct_dynticks_nmi_nesting(void) -- cgit v1.2.3 From 1089c0078b69bce0c2d540e3cc43470ba9e5dcfd Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:45:30 +0200 Subject: context_tracking, rcu: Rename ct_dynticks_nesting() into ct_nesting() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 65290e7677e6..586c1ff22c2e 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -75,7 +75,7 @@ static __always_inline int ct_rcu_watching_cpu_acquire(int cpu) return atomic_read_acquire(&ct->state) & CT_RCU_WATCHING_MASK; } -static __always_inline long ct_dynticks_nesting(void) +static __always_inline long ct_nesting(void) { return __this_cpu_read(context_tracking.nesting); } -- cgit v1.2.3 From bca9455da531a3c2520c28ff349d0dab4e471d28 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:48:26 +0200 Subject: context_tracking, rcu: Rename ct_dynticks_nesting_cpu() into ct_nesting_cpu() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 586c1ff22c2e..fd42d8120ac2 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -80,7 +80,7 @@ static __always_inline long ct_nesting(void) return __this_cpu_read(context_tracking.nesting); } -static __always_inline long ct_dynticks_nesting_cpu(int cpu) +static __always_inline long ct_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); -- cgit v1.2.3 From dc5fface4b30508933b515a4985401c8c8f6bcfd Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 10:52:03 +0200 Subject: context_tracking, rcu: Rename struct context_tracking .dynticks_nmi_nesting into .nmi_nesting The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. [ neeraj.upadhyay: Fix htmldocs build error reported by Stephen Rothwell ] Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index fd42d8120ac2..12d00adf29e1 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -40,7 +40,7 @@ struct context_tracking { #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE long nesting; /* Track process nesting level. */ - long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ + long nmi_nesting; /* Track irq/NMI nesting level. */ #endif }; @@ -89,14 +89,14 @@ static __always_inline long ct_nesting_cpu(int cpu) static __always_inline long ct_dynticks_nmi_nesting(void) { - return __this_cpu_read(context_tracking.dynticks_nmi_nesting); + return __this_cpu_read(context_tracking.nmi_nesting); } static __always_inline long ct_dynticks_nmi_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); - return ct->dynticks_nmi_nesting; + return ct->nmi_nesting; } #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ -- cgit v1.2.3 From 8375cb260d7e43610934af63748d1a51201449f8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:46:14 +0200 Subject: context_tracking, rcu: Rename ct_dynticks_nmi_nesting() into ct_nmi_nesting() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 12d00adf29e1..8f32fe599c5c 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -87,7 +87,7 @@ static __always_inline long ct_nesting_cpu(int cpu) return ct->nesting; } -static __always_inline long ct_dynticks_nmi_nesting(void) +static __always_inline long ct_nmi_nesting(void) { return __this_cpu_read(context_tracking.nmi_nesting); } -- cgit v1.2.3 From 2ef2890b7a94fbbfa8fdf0a90499ae1df476b8e8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 14:49:13 +0200 Subject: context_tracking, rcu: Rename ct_dynticks_nmi_nesting_cpu() into ct_nmi_nesting_cpu() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. Suggested-by: Frederic Weisbecker Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 8f32fe599c5c..34fd504e53a8 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -92,7 +92,7 @@ static __always_inline long ct_nmi_nesting(void) return __this_cpu_read(context_tracking.nmi_nesting); } -static __always_inline long ct_dynticks_nmi_nesting_cpu(int cpu) +static __always_inline long ct_nmi_nesting_cpu(int cpu) { struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); -- cgit v1.2.3 From e1de438336222dbe27f2d4baa8c01074fcac2bef Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 29 Apr 2024 15:52:32 +0200 Subject: context_tracking, rcu: Rename DYNTICK_IRQ_NONIDLE into CT_NESTING_IRQ_NONIDLE The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, and the 'dynticks' prefix can be dropped without losing any meaning. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 34fd504e53a8..0dbda59c9f37 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -7,7 +7,7 @@ #include /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ -#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) +#define CT_NESTING_IRQ_NONIDLE ((LONG_MAX / 2) + 1) enum ctx_state { CT_STATE_DISABLED = -1, /* returned by ct_state() if unknown */ -- cgit v1.2.3 From aa9fbc5dd9da9e095a8b1a58540cf20cb06f595f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 8 Aug 2024 12:41:17 +0100 Subject: net: mii: constify advertising mask Constify the advertising mask to linkmode functions that only read from the advertising mask. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/mii.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mii.h b/include/linux/mii.h index d5a959ce4877..b8f26d4513c3 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -140,7 +140,7 @@ static inline u32 ethtool_adv_to_mii_adv_t(u32 ethadv) * settings to phy autonegotiation advertisements for the * MII_ADVERTISE register. */ -static inline u32 linkmode_adv_to_mii_adv_t(unsigned long *advertising) +static inline u32 linkmode_adv_to_mii_adv_t(const unsigned long *advertising) { u32 result = 0; @@ -215,7 +215,8 @@ static inline u32 ethtool_adv_to_mii_ctrl1000_t(u32 ethadv) * settings to phy autonegotiation advertisements for the * MII_CTRL1000 register when in 1000T mode. */ -static inline u32 linkmode_adv_to_mii_ctrl1000_t(unsigned long *advertising) +static inline u32 +linkmode_adv_to_mii_ctrl1000_t(const unsigned long *advertising) { u32 result = 0; @@ -453,7 +454,7 @@ static inline void mii_ctrl1000_mod_linkmode_adv_t(unsigned long *advertising, * A small helper function that translates linkmode advertising to LVL * pause capabilities. */ -static inline u32 linkmode_adv_to_lcl_adv_t(unsigned long *advertising) +static inline u32 linkmode_adv_to_lcl_adv_t(const unsigned long *advertising) { u32 lcl_adv = 0; -- cgit v1.2.3 From d0f8a8973f265f6a276f99d091af99edfb2b87de Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 8 Aug 2024 00:14:13 +0000 Subject: mm/memblock: introduce a new helper memblock_estimated_nr_free_pages() During bootup, system may need the number of free pages in the whole system to do some calculation before all pages are freed to buddy system. Usually this number is get from totalram_pages(). Since we plan to move the free pages accounting in __free_pages_core(), this value may not represent total free pages at the early stage, especially when CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled. Instead of using raw memblock api, let's introduce a new helper for user to get the estimated number of free pages from memblock point of view. Signed-off-by: Wei Yang CC: David Hildenbrand Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20240808001415.6298-1-richard.weiyang@gmail.com Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/memblock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index fc4d75c6cec3..673d5cae7c81 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -467,6 +467,7 @@ static inline __init_memblock bool memblock_bottom_up(void) phys_addr_t memblock_phys_mem_size(void); phys_addr_t memblock_reserved_size(void); +unsigned long memblock_estimated_nr_free_pages(void); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); -- cgit v1.2.3 From 8c38617722bdf57a90e6c77ed9ee5ebb60958d2a Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 9 Aug 2024 16:15:54 +0200 Subject: memory: ti-aemif: remove platform data support There are no longer any users of the ti-aemif driver that set up platform data from board files. We can shrink the driver by removing support for it. Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20240809-ti-aemif-v1-1-27b1e5001390@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/platform_data/ti-aemif.h | 45 ---------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 include/linux/platform_data/ti-aemif.h (limited to 'include/linux') diff --git a/include/linux/platform_data/ti-aemif.h b/include/linux/platform_data/ti-aemif.h deleted file mode 100644 index 77625251df07..000000000000 --- a/include/linux/platform_data/ti-aemif.h +++ /dev/null @@ -1,45 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * TI DaVinci AEMIF platform glue. - * - * Copyright (C) 2017 BayLibre SAS - * - * Author: - * Bartosz Golaszewski - */ - -#ifndef __TI_DAVINCI_AEMIF_DATA_H__ -#define __TI_DAVINCI_AEMIF_DATA_H__ - -#include - -/** - * struct aemif_abus_data - Async bus configuration parameters. - * - * @cs - Chip-select number. - */ -struct aemif_abus_data { - u32 cs; -}; - -/** - * struct aemif_platform_data - Data to set up the TI aemif driver. - * - * @dev_lookup: of_dev_auxdata passed to of_platform_populate() for aemif - * subdevices. - * @cs_offset: Lowest allowed chip-select number. - * @abus_data: Array of async bus configuration entries. - * @num_abus_data: Number of abus entries. - * @sub_devices: Array of platform subdevices. - * @num_sub_devices: Number of subdevices. - */ -struct aemif_platform_data { - struct of_dev_auxdata *dev_lookup; - u32 cs_offset; - struct aemif_abus_data *abus_data; - size_t num_abus_data; - struct platform_device *sub_devices; - size_t num_sub_devices; -}; - -#endif /* __TI_DAVINCI_AEMIF_DATA_H__ */ -- cgit v1.2.3 From ce056504e2e53b22c1f789cff2ad6a0a135dc24d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Aug 2024 22:37:20 -0700 Subject: ethtool: make ethtool_ops::cap_rss_ctx_supported optional cap_rss_ctx_supported was created because the API for creating and configuring additional contexts is mux'ed with the normal RSS API. Presence of ops does not imply driver can actually support rss_context != 0 (in fact drivers mostly ignore that field). cap_rss_ctx_supported lets core check that the driver is context-aware before calling it. Now that we have .create_rxfh_context, there is no such ambiguity. We can depend on presence of the op. Make setting the bit optional. Reviewed-by: Gal Pressman Reviewed-by: Edward Cree Reviewed-by: Joe Damato Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ethtool.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 989c94eddb2b..a149485904d8 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -727,7 +727,8 @@ struct kernel_ethtool_ts_info { * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. * @cap_rss_ctx_supported: indicates if the driver supports RSS - * contexts. + * contexts via legacy API, drivers implementing @create_rxfh_context + * do not have to set this bit. * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor * RSS. * @rxfh_indir_space: max size of RSS indirection tables, if indirection table -- cgit v1.2.3 From ec6e57beaf8bc64ea0c2dc0cc360afcc7f504425 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Aug 2024 22:37:22 -0700 Subject: ethtool: rss: don't report key if device doesn't support it marvell/otx2 and mvpp2 do not support setting different keys for different RSS contexts. Contexts have separate indirection tables but key is shared with all other contexts. This is likely fine, indirection table is the most important piece. Don't report the key-related parameters from such drivers. This prevents driver-errors, e.g. otx2 always writes the main key, even when user asks to change per-context key. The second reason is that without this change tracking the keys by the core gets complicated. Even if the driver correctly reject setting key with rss_context != 0, change of the main key would have to be reflected in the XArray for all additional contexts. Since the additional contexts don't have their own keys not including the attributes (in Netlink speak) seems intuitive. ethtool CLI seems to deal with it just fine. Having to set the flag in majority of the drivers is a bit tedious but not reporting the key is a safer default. Reviewed-by: Edward Cree Reviewed-by: Joe Damato Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index a149485904d8..12f6dc567598 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -731,6 +731,9 @@ struct kernel_ethtool_ts_info { * do not have to set this bit. * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor * RSS. + * @rxfh_per_ctx_key: device supports setting different RSS key for each + * additional context. Netlink API should report hfunc, key, and input_xfrm + * for every context, not just context 0. * @rxfh_indir_space: max size of RSS indirection tables, if indirection table * size as returned by @get_rxfh_indir_size may change during lifetime * of the device. Leave as 0 if the table size is constant. @@ -952,6 +955,7 @@ struct ethtool_ops { u32 cap_link_lanes_supported:1; u32 cap_rss_ctx_supported:1; u32 cap_rss_sym_xor_supported:1; + u32 rxfh_per_ctx_key:1; u32 rxfh_indir_space; u16 rxfh_key_space; u16 rxfh_priv_size; -- cgit v1.2.3 From fc9aef4382c02774662da3d7e1de8ba224e04f80 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 25 Jul 2024 08:23:40 -0400 Subject: platform/x86/intel/vsec.h: Move to include/linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some drivers outside of PDX86 need access to the vsec header. Move it to include/linux to make it easier to include. Reviewed-by: Ilpo Järvinen Reviewed-by: Michael J. Ruhl Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20240725122346.4063913-2-michael.j.ruhl@intel.com Signed-off-by: Hans de Goede --- include/linux/intel_vsec.h | 134 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 include/linux/intel_vsec.h (limited to 'include/linux') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h new file mode 100644 index 000000000000..6495e37c9079 --- /dev/null +++ b/include/linux/intel_vsec.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _INTEL_VSEC_H +#define _INTEL_VSEC_H + +#include +#include + +#define VSEC_CAP_TELEMETRY BIT(0) +#define VSEC_CAP_WATCHER BIT(1) +#define VSEC_CAP_CRASHLOG BIT(2) +#define VSEC_CAP_SDSI BIT(3) +#define VSEC_CAP_TPMI BIT(4) + +/* Intel DVSEC offsets */ +#define INTEL_DVSEC_ENTRIES 0xA +#define INTEL_DVSEC_SIZE 0xB +#define INTEL_DVSEC_TABLE 0xC +#define INTEL_DVSEC_TABLE_BAR(x) ((x) & GENMASK(2, 0)) +#define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) +#define TABLE_OFFSET_SHIFT 3 + +struct pci_dev; +struct resource; + +enum intel_vsec_id { + VSEC_ID_TELEMETRY = 2, + VSEC_ID_WATCHER = 3, + VSEC_ID_CRASHLOG = 4, + VSEC_ID_SDSI = 65, + VSEC_ID_TPMI = 66, +}; + +/** + * struct intel_vsec_header - Common fields of Intel VSEC and DVSEC registers. + * @rev: Revision ID of the VSEC/DVSEC register space + * @length: Length of the VSEC/DVSEC register space + * @id: ID of the feature + * @num_entries: Number of instances of the feature + * @entry_size: Size of the discovery table for each feature + * @tbir: BAR containing the discovery tables + * @offset: BAR offset of start of the first discovery table + */ +struct intel_vsec_header { + u8 rev; + u16 length; + u16 id; + u8 num_entries; + u8 entry_size; + u8 tbir; + u32 offset; +}; + +enum intel_vsec_quirks { + /* Watcher feature not supported */ + VSEC_QUIRK_NO_WATCHER = BIT(0), + + /* Crashlog feature not supported */ + VSEC_QUIRK_NO_CRASHLOG = BIT(1), + + /* Use shift instead of mask to read discovery table offset */ + VSEC_QUIRK_TABLE_SHIFT = BIT(2), + + /* DVSEC not present (provided in driver data) */ + VSEC_QUIRK_NO_DVSEC = BIT(3), + + /* Platforms requiring quirk in the auxiliary driver */ + VSEC_QUIRK_EARLY_HW = BIT(4), +}; + +/** + * struct intel_vsec_platform_info - Platform specific data + * @parent: parent device in the auxbus chain + * @headers: list of headers to define the PMT client devices to create + * @caps: bitmask of PMT capabilities for the given headers + * @quirks: bitmask of VSEC device quirks + * @base_addr: allow a base address to be specified (rather than derived) + */ +struct intel_vsec_platform_info { + struct device *parent; + struct intel_vsec_header **headers; + unsigned long caps; + unsigned long quirks; + u64 base_addr; +}; + +/** + * struct intel_sec_device - Auxbus specific device information + * @auxdev: auxbus device struct for auxbus access + * @pcidev: pci device associated with the device + * @resource: any resources shared by the parent + * @ida: id reference + * @num_resources: number of resources + * @id: xarray id + * @priv_data: any private data needed + * @quirks: specified quirks + * @base_addr: base address of entries (if specified) + */ +struct intel_vsec_device { + struct auxiliary_device auxdev; + struct pci_dev *pcidev; + struct resource *resource; + struct ida *ida; + int num_resources; + int id; /* xa */ + void *priv_data; + size_t priv_data_size; + unsigned long quirks; + u64 base_addr; +}; + +int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, + struct intel_vsec_device *intel_vsec_dev, + const char *name); + +static inline struct intel_vsec_device *dev_to_ivdev(struct device *dev) +{ + return container_of(dev, struct intel_vsec_device, auxdev.dev); +} + +static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device *auxdev) +{ + return container_of(auxdev, struct intel_vsec_device, auxdev); +} + +#if IS_ENABLED(CONFIG_INTEL_VSEC) +void intel_vsec_register(struct pci_dev *pdev, + struct intel_vsec_platform_info *info); +#else +static inline void intel_vsec_register(struct pci_dev *pdev, + struct intel_vsec_platform_info *info) +{ +} +#endif +#endif -- cgit v1.2.3 From e92affc74cd8624a548b380af7364be037adef35 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 25 Jul 2024 08:23:41 -0400 Subject: platform/x86/intel/vsec: Add PMT read callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some PMT providers require device specific actions before their telemetry can be read. Provide assignable PMT read callbacks to allow providers to perform those actions. Reviewed-by: Ilpo Järvinen Reviewed-by: Michael J. Ruhl Signed-off-by: David E. Box Link: https://lore.kernel.org/r/20240725122346.4063913-3-michael.j.ruhl@intel.com Signed-off-by: Hans de Goede --- include/linux/intel_vsec.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 6495e37c9079..11ee185566c3 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -67,10 +67,24 @@ enum intel_vsec_quirks { VSEC_QUIRK_EARLY_HW = BIT(4), }; +/** + * struct pmt_callbacks - Callback infrastructure for PMT devices + * ->read_telem() when specified, called by client driver to access PMT data (instead + * of direct copy). + * @pdev: PCI device reference for the callback's use + * @guid: ID of data to acccss + * @data: buffer for the data to be copied + * @count: size of buffer + */ +struct pmt_callbacks { + int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, u32 count); +}; + /** * struct intel_vsec_platform_info - Platform specific data * @parent: parent device in the auxbus chain * @headers: list of headers to define the PMT client devices to create + * @priv_data: private data, usable by parent devices, currently a callback * @caps: bitmask of PMT capabilities for the given headers * @quirks: bitmask of VSEC device quirks * @base_addr: allow a base address to be specified (rather than derived) @@ -78,6 +92,7 @@ enum intel_vsec_quirks { struct intel_vsec_platform_info { struct device *parent; struct intel_vsec_header **headers; + void *priv_data; unsigned long caps; unsigned long quirks; u64 base_addr; -- cgit v1.2.3 From 29bc83e4d90546aa794a9584786086b141a6ba4d Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Mon, 15 Jul 2024 16:23:24 -0700 Subject: srcu: faster gp seq wrap-around Using a higher value for the initial gp sequence counters allows for wrapping to occur faster. It can help with surfacing any issues that may be happening as a result of the wrap around. Signed-off-by: JP Kobryn Tested-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 3 +++ include/linux/srcutree.h | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13f6f00aecf9..8d56db70d417 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -34,6 +34,9 @@ #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#define RCU_SEQ_CTR_SHIFT 2 +#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) + /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 8f3f72480e78..ed57598394de 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -129,10 +129,23 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 +/* + * Values for initializing gp sequence fields. Higher values allow wrap arounds to + * occur earlier. + * The second value with state is useful in the case of static initialization of + * srcu_usage where srcu_gp_seq_needed is expected to have some state value in its + * lower bits (or else it will appear to be already initialized within + * the call check_init_srcu_struct()). + */ +#define SRCU_GP_SEQ_INITIAL_VAL ((0UL - 100UL) << RCU_SEQ_CTR_SHIFT) +#define SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE (SRCU_GP_SEQ_INITIAL_VAL - 1) + #define __SRCU_USAGE_INIT(name) \ { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ - .srcu_gp_seq_needed = -1UL, \ + .srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL, \ + .srcu_gp_seq_needed = SRCU_GP_SEQ_INITIAL_VAL_WITH_STATE, \ + .srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL, \ .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ } -- cgit v1.2.3 From 711f5c5ce6c2c640c1b3b569ab2a8847be5ab21f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 15 Jul 2024 21:22:51 -0400 Subject: lsm: cleanup lsm_hooks.h Some cleanup and style corrections for lsm_hooks.h. * Drop the lsm_inode_alloc() extern declaration, it is not needed. * Relocate lsm_get_xattr_slot() and extern variables in the file to improve grouping of related objects. * Don't use tabs to needlessly align structure fields. Reviewed-by: Casey Schaufler Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 87 +++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index f1ca8082075a..11ea0063228f 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -51,8 +51,8 @@ struct security_hook_heads { * Contains the information that identifies the LSM. */ struct lsm_id { - const char *name; - u64 id; + const char *name; + u64 id; }; /* @@ -60,49 +60,31 @@ struct lsm_id { * For use with generic list macros for common operations. */ struct security_hook_list { - struct hlist_node list; - struct hlist_head *head; - union security_list_options hook; - const struct lsm_id *lsmid; + struct hlist_node list; + struct hlist_head *head; + union security_list_options hook; + const struct lsm_id *lsmid; } __randomize_layout; /* * Security blob size or offset data. */ struct lsm_blob_sizes { - int lbs_cred; - int lbs_file; - int lbs_ib; - int lbs_inode; - int lbs_sock; - int lbs_superblock; - int lbs_ipc; - int lbs_key; - int lbs_msg_msg; - int lbs_perf_event; - int lbs_task; - int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ - int lbs_tun_dev; + int lbs_cred; + int lbs_file; + int lbs_ib; + int lbs_inode; + int lbs_sock; + int lbs_superblock; + int lbs_ipc; + int lbs_key; + int lbs_msg_msg; + int lbs_perf_event; + int lbs_task; + int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ + int lbs_tun_dev; }; -/** - * lsm_get_xattr_slot - Return the next available slot and increment the index - * @xattrs: array storing LSM-provided xattrs - * @xattr_count: number of already stored xattrs (updated) - * - * Retrieve the first available slot in the @xattrs array to fill with an xattr, - * and increment @xattr_count. - * - * Return: The slot to fill in @xattrs if non-NULL, NULL otherwise. - */ -static inline struct xattr *lsm_get_xattr_slot(struct xattr *xattrs, - int *xattr_count) -{ - if (unlikely(!xattrs)) - return NULL; - return &xattrs[(*xattr_count)++]; -} - /* * LSM_RET_VOID is used as the default value in LSM_HOOK definitions for void * LSM hooks (in include/linux/lsm_hook_defs.h). @@ -118,9 +100,6 @@ static inline struct xattr *lsm_get_xattr_slot(struct xattr *xattrs, #define LSM_HOOK_INIT(HEAD, HOOK) \ { .head = &security_hook_heads.HEAD, .hook = { .HEAD = HOOK } } -extern struct security_hook_heads security_hook_heads; -extern char *lsm_names; - extern void security_add_hooks(struct security_hook_list *hooks, int count, const struct lsm_id *lsmid); @@ -142,9 +121,6 @@ struct lsm_info { struct lsm_blob_sizes *blobs; /* Optional: for blob sharing. */ }; -extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; -extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; - #define DEFINE_LSM(lsm) \ static struct lsm_info __lsm_##lsm \ __used __section(".lsm_info.init") \ @@ -155,6 +131,29 @@ extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; __used __section(".early_lsm_info.init") \ __aligned(sizeof(unsigned long)) -extern int lsm_inode_alloc(struct inode *inode); +/* DO NOT tamper with these variables outside of the LSM framework */ +extern char *lsm_names; +extern struct security_hook_heads security_hook_heads; +extern struct lsm_static_calls_table static_calls_table __ro_after_init; +extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; +extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; + +/** + * lsm_get_xattr_slot - Return the next available slot and increment the index + * @xattrs: array storing LSM-provided xattrs + * @xattr_count: number of already stored xattrs (updated) + * + * Retrieve the first available slot in the @xattrs array to fill with an xattr, + * and increment @xattr_count. + * + * Return: The slot to fill in @xattrs if non-NULL, NULL otherwise. + */ +static inline struct xattr *lsm_get_xattr_slot(struct xattr *xattrs, + int *xattr_count) +{ + if (unlikely(!xattrs)) + return NULL; + return &xattrs[(*xattr_count)++]; +} #endif /* ! __LINUX_LSM_HOOKS_H */ -- cgit v1.2.3 From 63dff3e48871b0583be5032ff8fb7260c349a18c Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 9 Jul 2024 19:43:06 -0400 Subject: lsm: add the inode_free_security_rcu() LSM implementation hook The LSM framework has an existing inode_free_security() hook which is used by LSMs that manage state associated with an inode, but due to the use of RCU to protect the inode, special care must be taken to ensure that the LSMs do not fully release the inode state until it is safe from a RCU perspective. This patch implements a new inode_free_security_rcu() implementation hook which is called when it is safe to free the LSM's internal inode state. Unfortunately, this new hook does not have access to the inode itself as it may already be released, so the existing inode_free_security() hook is retained for those LSMs which require access to the inode. Cc: stable@vger.kernel.org Reported-by: syzbot+5446fbf332b0602ede0b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/r/00000000000076ba3b0617f65cc8@google.com Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 63e2656d1d56..520730fe2d94 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -114,6 +114,7 @@ LSM_HOOK(int, 0, path_notify, const struct path *path, u64 mask, unsigned int obj_type) LSM_HOOK(int, 0, inode_alloc_security, struct inode *inode) LSM_HOOK(void, LSM_RET_VOID, inode_free_security, struct inode *inode) +LSM_HOOK(void, LSM_RET_VOID, inode_free_security_rcu, void *inode_security) LSM_HOOK(int, -EOPNOTSUPP, inode_init_security, struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count) -- cgit v1.2.3 From 1da91ea87aefe2c25b68c9f96947a9271ba6325d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 31 May 2024 14:12:01 -0400 Subject: introduce fd_file(), convert all accessors to it. For any changes of struct fd representation we need to turn existing accesses to fields into calls of wrappers. Accesses to struct fd::flags are very few (3 in linux/file.h, 1 in net/socket.c, 3 in fs/overlayfs/file.c and 3 more in explicit initializers). Those can be dealt with in the commit converting to new layout; accesses to struct fd::file are too many for that. This commit converts (almost) all of f.file to fd_file(f). It's not entirely mechanical ('file' is used as a member name more than just in struct fd) and it does not even attempt to distinguish the uses in pointer context from those in boolean context; the latter will be eventually turned into a separate helper (fd_empty()). NOTE: mass conversion to fd_empty(), tempting as it might be, is a bad idea; better do that piecewise in commit that convert from fdget...() to CLASS(...). [conflicts in fs/fhandle.c, kernel/bpf/syscall.c, mm/memcontrol.c caught by git; fs/stat.c one got caught by git grep] [fs/xattr.c conflict] Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/cleanup.h | 2 +- include/linux/file.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index d9e613803df1..a3d3e888cf1f 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -98,7 +98,7 @@ const volatile void * __must_check_fn(const volatile void *val) * DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd) * * CLASS(fdget, f)(fd); - * if (!f.file) + * if (!fd_file(f)) * return -EBADF; * * // use 'f' without concern diff --git a/include/linux/file.h b/include/linux/file.h index 237931f20739..0f3f369f2450 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -42,10 +42,12 @@ struct fd { #define FDPUT_FPUT 1 #define FDPUT_POS_UNLOCK 2 +#define fd_file(f) ((f).file) + static inline void fdput(struct fd fd) { if (fd.flags & FDPUT_FPUT) - fput(fd.file); + fput(fd_file(fd)); } extern struct file *fget(unsigned int fd); @@ -79,7 +81,7 @@ static inline struct fd fdget_pos(int fd) static inline void fdput_pos(struct fd f) { if (f.flags & FDPUT_POS_UNLOCK) - __f_unlock_pos(f.file); + __f_unlock_pos(fd_file(f)); fdput(f); } -- cgit v1.2.3 From 88a2f6468d013ca1163490dbddfc95135d1c27a1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 31 May 2024 15:45:12 -0400 Subject: struct fd: representation change We want the compiler to see that fdput() on empty instance is a no-op. The emptiness check is that file reference is NULL, while fdput() is "fput() if FDPUT_FPUT is present in flags". The reason why fdput() on empty instance is a no-op is something compiler can't see - it's that we never generate instances with NULL file reference combined with non-zero flags. It's not that hard to deal with - the real primitives behind fdget() et.al. are returning an unsigned long value, unpacked by (inlined) __to_fd() into the current struct file * + int. The lower bits are used to store flags, while the rest encodes the pointer. Linus suggested that keeping this unsigned long around with the extractions done by inlined accessors should generate a sane code and that turns out to be the case. Namely, turning struct fd into a struct-wrapped unsinged long, with fd_empty(f) => unlikely(f.word == 0) fd_file(f) => (struct file *)(f.word & ~3) fdput(f) => if (f.word & 1) fput(fd_file(f)) ends up with compiler doing the right thing. The cost is the patch footprint, of course - we need to switch f.file to fd_file(f) all over the tree, and it's not doable with simple search and replace; there are false positives, etc. Note that the sole member of that structure is an opaque unsigned long - all accesses should be done via wrappers and I don't want to use a name that would invite manual casts to file pointers, etc. The value of that member is equal either to (unsigned long)p | flags, p being an address of some struct file instance, or to 0 for an empty fd. For now the new predicate (fd_empty(f)) has no users; all the existing checks have form (!fd_file(f)). We will convert to fd_empty() use later; here we only define it (and tell the compiler that it's unlikely to return true). This commit only deals with representation change; there will be followups. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/file.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 0f3f369f2450..eb28469b1c16 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -35,18 +35,28 @@ static inline void fput_light(struct file *file, int fput_needed) fput(file); } +/* either a reference to struct file + flags + * (cloned vs. borrowed, pos locked), with + * flags stored in lower bits of value, + * or empty (represented by 0). + */ struct fd { - struct file *file; - unsigned int flags; + unsigned long word; }; #define FDPUT_FPUT 1 #define FDPUT_POS_UNLOCK 2 -#define fd_file(f) ((f).file) +#define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK))) +static inline bool fd_empty(struct fd f) +{ + return unlikely(!f.word); +} + +#define EMPTY_FD (struct fd){0} static inline void fdput(struct fd fd) { - if (fd.flags & FDPUT_FPUT) + if (fd.word & FDPUT_FPUT) fput(fd_file(fd)); } @@ -60,7 +70,7 @@ extern void __f_unlock_pos(struct file *); static inline struct fd __to_fd(unsigned long v) { - return (struct fd){(struct file *)(v & ~3),v & 3}; + return (struct fd){v}; } static inline struct fd fdget(unsigned int fd) @@ -80,7 +90,7 @@ static inline struct fd fdget_pos(int fd) static inline void fdput_pos(struct fd f) { - if (f.flags & FDPUT_POS_UNLOCK) + if (f.word & FDPUT_POS_UNLOCK) __f_unlock_pos(fd_file(f)); fdput(f); } -- cgit v1.2.3 From de12c3391bce10504c0e7bd767516c74110cfce1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 31 May 2024 16:34:25 -0400 Subject: add struct fd constructors, get rid of __to_fd() Make __fdget() et.al. return struct fd directly. New helpers: BORROWED_FD(file) and CLONED_FD(file), for borrowed and cloned file references resp. NOTE: this might need tuning; in particular, inline on __fget_light() is there to keep the code generation same as before - we probably want to keep it inlined in fdget() et.al. (especially so in fdget_pos()), but that needs profiling. Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- include/linux/file.h | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index eb28469b1c16..aab1caff6713 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -53,6 +53,14 @@ static inline bool fd_empty(struct fd f) } #define EMPTY_FD (struct fd){0} +static inline struct fd BORROWED_FD(struct file *f) +{ + return (struct fd){(unsigned long)f}; +} +static inline struct fd CLONED_FD(struct file *f) +{ + return (struct fd){(unsigned long)f | FDPUT_FPUT}; +} static inline void fdput(struct fd fd) { @@ -63,30 +71,11 @@ static inline void fdput(struct fd fd) extern struct file *fget(unsigned int fd); extern struct file *fget_raw(unsigned int fd); extern struct file *fget_task(struct task_struct *task, unsigned int fd); -extern unsigned long __fdget(unsigned int fd); -extern unsigned long __fdget_raw(unsigned int fd); -extern unsigned long __fdget_pos(unsigned int fd); extern void __f_unlock_pos(struct file *); -static inline struct fd __to_fd(unsigned long v) -{ - return (struct fd){v}; -} - -static inline struct fd fdget(unsigned int fd) -{ - return __to_fd(__fdget(fd)); -} - -static inline struct fd fdget_raw(unsigned int fd) -{ - return __to_fd(__fdget_raw(fd)); -} - -static inline struct fd fdget_pos(int fd) -{ - return __to_fd(__fdget_pos(fd)); -} +struct fd fdget(unsigned int fd); +struct fd fdget_raw(unsigned int fd); +struct fd fdget_pos(unsigned int fd); static inline void fdput_pos(struct fd f) { -- cgit v1.2.3 From 12dbc67c3b0bcd7414b511fd8a42ba4e388705bc Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Wed, 7 Aug 2024 21:45:28 +0800 Subject: net: stmmac: Move the atds flag to the stmmac_dma_cfg structure ATDS (Alternate Descriptor Size) is a part of the DMA Bus Mode configs (together with PBL, ALL, EME, etc) of the DW GMAC controllers. Seeing it's not changed at runtime but is activated as long as the IP-core has it supported (at least due to the Type 2 Full Checksum Offload Engine feature), move the respective parameter from the stmmac_dma_ops::init() callback argument to the stmmac_dma_cfg structure, which already have the rest of the DMA-related configs defined. Besides the being added in the next commit DW GMAC multi-channels support will require to add the stmmac_dma_ops::init_chan() callback and have the ATDS flag set/cleared for each channel in there. Having the atds-flag in the stmmac_dma_cfg structure will make the parameter accessible from stmmac_dma_ops::init_chan() callback too. Signed-off-by: Feiyang Chen Signed-off-by: Yinggang Gu Reviewed-by: Serge Semin Acked-by: Huacai Chen Signed-off-by: Yanteng Si Tested-by: Serge Semin Signed-off-by: Paolo Abeni --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 84e13bd5df28..338991c08f00 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -100,6 +100,7 @@ struct stmmac_dma_cfg { bool eame; bool multi_msi_en; bool dche; + bool atds; }; #define AXI_BLEN 7 -- cgit v1.2.3 From c343e66ed009fdb6206787558130ccbd504ffc12 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 10 Aug 2024 22:52:17 +0200 Subject: usb: gadget: configfs: Make check_user_usb_string() static "linux/usb/gadget_configfs.h" is only included in "drivers/usb/gadget/configfs.c", so there is no need to declare a function in the header file. it is only used in this .c file. It's better to have it static. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/958cb49dca1bff4254a3492c018efbf3b01918b4.1723323107.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget_configfs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/gadget_configfs.h b/include/linux/usb/gadget_configfs.h index d61aebd68128..6a552dd4dec9 100644 --- a/include/linux/usb/gadget_configfs.h +++ b/include/linux/usb/gadget_configfs.h @@ -4,9 +4,6 @@ #include -int check_user_usb_string(const char *name, - struct usb_gadget_strings *stringtab_dev); - #define GS_STRINGS_W(__struct, __name) \ static ssize_t __struct##_##__name##_store(struct config_item *item, \ const char *page, size_t len) \ -- cgit v1.2.3 From d1e14e06810a96ef0cdabf572513594031d4dad6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 10 Aug 2024 23:05:46 +0200 Subject: usb: gadget: configfs: Constify struct config_item_type 'struct config_item_type' is not modified in this file. Apparently, these structures are only used with config_group_init_type_name() which takes a const struct config_item_type* as a 3rd argument. Constifying this structure moves some data to a read-only section, so increase overall security, especially when the structure holds some function pointers. On a x86_64, with allmodconfig: Before: ====== text data bss dec hex filename 40834 5112 64 46010 b3ba drivers/usb/gadget/configfs.o After: ===== text data bss dec hex filename 41218 4728 64 46010 b3ba drivers/usb/gadget/configfs.o Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/513223e97082e1bb758e36d55c175ec9ea34a71c.1723323896.git.christophe.jaillet@wanadoo.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget_configfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/gadget_configfs.h b/include/linux/usb/gadget_configfs.h index 6a552dd4dec9..6b5d6838f865 100644 --- a/include/linux/usb/gadget_configfs.h +++ b/include/linux/usb/gadget_configfs.h @@ -34,7 +34,7 @@ static struct configfs_item_operations struct_in##_langid_item_ops = { \ .release = struct_in##_attr_release, \ }; \ \ -static struct config_item_type struct_in##_langid_type = { \ +static const struct config_item_type struct_in##_langid_type = { \ .ct_item_ops = &struct_in##_langid_item_ops, \ .ct_attrs = struct_in##_langid_attrs, \ .ct_owner = THIS_MODULE, \ @@ -91,7 +91,7 @@ static struct configfs_group_operations struct_in##_strings_ops = { \ .drop_item = &struct_in##_strings_drop, \ }; \ \ -static struct config_item_type struct_in##_strings_type = { \ +static const struct config_item_type struct_in##_strings_type = { \ .ct_group_ops = &struct_in##_strings_ops, \ .ct_owner = THIS_MODULE, \ } -- cgit v1.2.3 From 1ef33652d22c69a2a7519d003cd6c79b8e514f44 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 8 Aug 2024 05:25:07 -0700 Subject: net: netpoll: extract core of netpoll_cleanup Extract the core part of netpoll_cleanup(), so, it could be called from a caller that has the rtnl lock already. Netconsole uses this in a weird way right now: __netpoll_cleanup(&nt->np); spin_lock_irqsave(&target_list_lock, flags); netdev_put(nt->np.dev, &nt->np.dev_tracker); nt->np.dev = NULL; nt->enabled = false; This will be replaced by do_netpoll_cleanup() as the locking situation is overhauled. Signed-off-by: Breno Leitao Reviewed-by: Rik van Riel Signed-off-by: Paolo Abeni --- include/linux/netpoll.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index bd19c4b91e31..cd4e28db0cbd 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -64,6 +64,7 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); +void do_netpoll_cleanup(struct netpoll *np); netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); #ifdef CONFIG_NETPOLL -- cgit v1.2.3 From dde286ee57704226b500cb9eb59547fec07aad3d Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Thu, 8 Aug 2024 15:36:28 +0300 Subject: regmap: Allow setting IRQ domain name suffix When multiple IRQ domains are created from the same device-tree node they will get the same name based on the device-tree path. This will cause a naming collision in debugFS when IRQ domain specific entries are created. The regmap-IRQ creates per instance IRQ domains. This will lead to a domain name conflict when a device which provides more than one interrupt line uses the regmap-IRQ. Add support for specifying an IRQ domain name suffix when creating a regmap-IRQ controller. Signed-off-by: Matti Vaittinen Link: https://patch.msgid.link/776bc4996969e5081bcf61b9bdb5517e537147a3.1723120028.git.mazziesaccount@gmail.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 122e38161acb..f9ccad32fc5c 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -1521,6 +1521,9 @@ struct regmap_irq_chip_data; * struct regmap_irq_chip - Description of a generic regmap irq_chip. * * @name: Descriptive name for IRQ controller. + * @domain_suffix: Name suffix to be appended to end of IRQ domain name. Needed + * when multiple regmap-IRQ controllers are created from same + * device. * * @main_status: Base main status register address. For chips which have * interrupts arranged in separate sub-irq blocks with own IRQ @@ -1606,6 +1609,7 @@ struct regmap_irq_chip_data; */ struct regmap_irq_chip { const char *name; + const char *domain_suffix; unsigned int main_status; unsigned int num_main_status_bits; -- cgit v1.2.3 From ec0a7d44b358afaaf52856d03c72e20587bc888b Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Fri, 9 Aug 2024 15:28:25 -0700 Subject: workqueue: Add interface for user-defined workqueue lockdep map Add an interface for a user-defined workqueue lockdep map, which is helpful when multiple workqueues are created for the same purpose. This also helps avoid leaking lockdep maps on each workqueue creation. v2: - Add alloc_workqueue_lockdep_map (Tejun) v3: - Drop __WQ_USER_OWNED_LOCKDEP (Tejun) - static inline alloc_ordered_workqueue_lockdep_map (Tejun) Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Matthew Brost Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4eb8f9563136..8ccbf510880b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -507,6 +507,58 @@ void workqueue_softirq_dead(unsigned int cpu); __printf(1, 4) struct workqueue_struct * alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); +#ifdef CONFIG_LOCKDEP +/** + * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @lockdep_map: user-defined lockdep_map + * @...: args for @fmt + * + * Same as alloc_workqueue but with the a user-define lockdep_map. Useful for + * workqueues created with the same purpose and to avoid leaking a lockdep_map + * on each workqueue creation. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ +__printf(1, 5) struct workqueue_struct * +alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, + struct lockdep_map *lockdep_map, ...); + +/** + * alloc_ordered_workqueue_lockdep_map - allocate an ordered workqueue with + * user-defined lockdep_map + * + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) + * @lockdep_map: user-defined lockdep_map + * @args: args for @fmt + * + * Same as alloc_ordered_workqueue but with the a user-define lockdep_map. + * Useful for workqueues created with the same purpose and to avoid leaking a + * lockdep_map on each workqueue creation. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ +__printf(1, 4) static inline struct workqueue_struct * +alloc_ordered_workqueue_lockdep_map(const char *fmt, unsigned int flags, + struct lockdep_map *lockdep_map, ...) +{ + struct workqueue_struct *wq; + va_list args; + + va_start(args, lockdep_map); + wq = alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | flags, + 1, lockdep_map, args); + va_end(args); + + return wq; +} +#endif + /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue -- cgit v1.2.3 From fb4dae4ca315fe48f1a8a452172f29196b2a7f3d Mon Sep 17 00:00:00 2001 From: Junfeng Guo Date: Thu, 25 Jul 2024 16:08:05 -0600 Subject: virtchnl: support raw packet in protocol header The patch extends existing virtchnl_proto_hdrs structure to allow VF to pass a pair of buffers as packet data and mask that describe a match pattern of a filter rule. Then the kernel PF driver is requested to parse the pair of buffer and figure out low level hardware metadata (ptype, profile, field vector.. ) to program the expected FDIR or RSS rules. Reviewed-by: Simon Horman Reviewed-by: Marcin Szycik Signed-off-by: Qi Zhang Signed-off-by: Junfeng Guo Signed-off-by: Ahmed Zaki Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 8e177b67e82f..4f78a65e33dc 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -1121,6 +1121,7 @@ enum virtchnl_vfr_states { }; #define VIRTCHNL_MAX_NUM_PROTO_HDRS 32 +#define VIRTCHNL_MAX_SIZE_RAW_PACKET 1024 #define PROTO_HDR_SHIFT 5 #define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT) #define PROTO_HDR_FIELD_MASK ((1UL << PROTO_HDR_SHIFT) - 1) @@ -1266,13 +1267,22 @@ struct virtchnl_proto_hdrs { u8 pad[3]; /** * specify where protocol header start from. + * must be 0 when sending a raw packet request. * 0 - from the outer layer * 1 - from the first inner layer * 2 - from the second inner layer * .... **/ int count; /* the proto layers must < VIRTCHNL_MAX_NUM_PROTO_HDRS */ - struct virtchnl_proto_hdr proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS]; + union { + struct virtchnl_proto_hdr + proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS]; + struct { + u16 pkt_len; + u8 spec[VIRTCHNL_MAX_SIZE_RAW_PACKET]; + u8 mask[VIRTCHNL_MAX_SIZE_RAW_PACKET]; + } raw; + }; }; VIRTCHNL_CHECK_STRUCT_LEN(2312, virtchnl_proto_hdrs); -- cgit v1.2.3 From 623122ac1c4074791ea319df4676fb2875817fe4 Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Thu, 25 Jul 2024 16:08:09 -0600 Subject: iavf: add support for offloading tc U32 cls filters Add support for offloading cls U32 filters. Only "skbedit queue_mapping" and "drop" actions are supported. Also, only "ip" and "802_3" tc protocols are allowed. The PF must advertise the VIRTCHNL_VF_OFFLOAD_TC_U32 capability flag. Since the filters will be enabled via the FD stage at the PF, a new type of FDIR filters is added and the existing list and state machine are used. The new filters can be used to configure flow directors based on raw (binary) pattern in the rx packet. Examples: 0. # tc qdisc add dev enp175s0v0 ingress 1. Redirect UDP from src IP 192.168.2.1 to queue 12: # tc filter add dev protocol ip ingress u32 \ match u32 0x45000000 0xff000000 at 0 \ match u32 0x00110000 0x00ff0000 at 8 \ match u32 0xC0A80201 0xffffffff at 12 \ match u32 0x00000000 0x00000000 at 24 \ action skbedit queue_mapping 12 skip_sw 2. Drop all ICMP: # tc filter add dev protocol ip ingress u32 \ match u32 0x45000000 0xff000000 at 0 \ match u32 0x00010000 0x00ff0000 at 8 \ match u32 0x00000000 0x00000000 at 24 \ action drop skip_sw 3. Redirect ICMP traffic from MAC 3c:fd:fe:a5:47:e0 to queue 7 (note proto: 802_3): # tc filter add dev protocol 802_3 ingress u32 \ match u32 0x00003CFD 0x0000ffff at 4 \ match u32 0xFEA547E0 0xffffffff at 8 \ match u32 0x08004500 0xffffff00 at 12 \ match u32 0x00000001 0x000000ff at 20 \ match u32 0x0000 0x0000 at 40 \ action skbedit queue_mapping 7 skip_sw Notes on matches: 1 - All intermediate fields that are needed to parse the correct PTYPE must be provided (in e.g. 3: Ethernet Type 0x0800 in MAC, IP version and IP length: 0x45 and protocol: 0x01 (ICMP)). 2 - The last match must provide an offset that guarantees all required headers are accounted for, even if the last header is not matched. For example, in #2, the last match is 4 bytes at offset 24 starting from IP header, so the total is 14 (MAC) + 24 + 4 = 42, which is the sum of MAC+IP+ICMP headers. Reviewed-by: Sridhar Samudrala Reviewed-by: Marcin Szycik Signed-off-by: Ahmed Zaki Tested-by: Rafal Romanowski Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 4f78a65e33dc..f41395264dca 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -247,6 +247,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); /* used to negotiate communicating link speeds in Mbps */ #define VIRTCHNL_VF_CAP_ADV_LINK_SPEED BIT(7) #define VIRTCHNL_VF_OFFLOAD_CRC BIT(10) +#define VIRTCHNL_VF_OFFLOAD_TC_U32 BIT(11) #define VIRTCHNL_VF_OFFLOAD_VLAN_V2 BIT(15) #define VIRTCHNL_VF_OFFLOAD_VLAN BIT(16) #define VIRTCHNL_VF_OFFLOAD_RX_POLLING BIT(17) -- cgit v1.2.3 From 55f325958ccc41eaea43eb4546d4dc77c1b5ef8a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 30 Jul 2024 01:16:04 -0400 Subject: bpf: switch maps to CLASS(fd, ...) Calling conventions for __bpf_map_get() would be more convenient if it left fpdut() on failure to callers. Makes for simpler logics in the callers. Among other things, the proof of memory safety no longer has to rely upon file->private_data never being ERR_PTR(...) for bpffs files. Original calling conventions made it impossible for the caller to tell whether __bpf_map_get() has returned ERR_PTR(-EINVAL) because it has found the file not be a bpf map one (in which case it would've done fdput()) or because it found that ERR_PTR(-EINVAL) in file->private_data of a bpf map file (in which case fdput() would _not_ have been done). Signed-off-by: Al Viro Reviewed-by: Christian Brauner Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b9425e410bcb..9f35df07e86d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2241,7 +2241,16 @@ void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); -struct bpf_map *__bpf_map_get(struct fd f); + +static inline struct bpf_map *__bpf_map_get(struct fd f) +{ + if (fd_empty(f)) + return ERR_PTR(-EBADF); + if (unlikely(fd_file(f)->f_op != &bpf_map_fops)) + return ERR_PTR(-EINVAL); + return fd_file(f)->private_data; +} + void bpf_map_inc(struct bpf_map *map); void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref); -- cgit v1.2.3 From 7945b741d1fc071a621366c512a060ea08848955 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jul 2024 19:47:47 -0700 Subject: rcu-tasks: Remove RCU Tasks Rude asynchronous APIs The call_rcu_tasks_rude() and rcu_barrier_tasks_rude() APIs are currently unused. This commit therefore removes their definitions and boot-time self-tests. Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13f6f00aecf9..31e679c7110e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -37,7 +37,6 @@ /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); -void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); struct rcu_gp_oldstate; @@ -202,7 +201,6 @@ do { \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU -void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks_rude(void); # endif -- cgit v1.2.3 From fe91cf39db0939ac8d523b5a1c31840f7cbe205c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Aug 2024 17:34:25 -0700 Subject: rcu/tasks: Add detailed grace-period and barrier diagnostics This commit adds rcu_tasks_torture_stats_print(), rcu_tasks_trace_torture_stats_print(), and rcu_tasks_rude_torture_stats_print() functions that provide detailed diagnostics on grace-period, callback, and barrier state. Signed-off-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay --- include/linux/rcupdate.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 31e679c7110e..17463e95b6ef 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -164,6 +164,7 @@ static inline void rcu_nocb_flush_deferred_wakeup(void) { } } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); +void rcu_tasks_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu @@ -190,6 +191,7 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t); rcu_tasks_trace_qs_blkd(t); \ } \ } while (0) +void rcu_tasks_trace_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif @@ -202,6 +204,7 @@ do { \ # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); +void rcu_tasks_rude_torture_stats_print(char *tt, char *tf); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) -- cgit v1.2.3 From 44f65d900698278a8451988abe0d5ca37fd46882 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Tue, 6 Aug 2024 21:49:27 +0000 Subject: binfmt_elf: mseal address zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In load_elf_binary as part of the execve(), when the current task’s personality has MMAP_PAGE_ZERO set, the kernel allocates one page at address 0. According to the comment: /* Why this, you ask??? Well SVr4 maps page 0 as read-only, and some applications "depend" upon this behavior. Since we do not have the power to recompile these, we emulate the SVr4 behavior. Sigh. */ At one point, Linus suggested removing this [1]. Code search in debian didn't see much use of MMAP_PAGE_ZERO [2], it exists in util and test (rr). Sealing this is probably safe, the comment doesn't say the app ever wanting to change the mapping to rwx. Sealing also ensures that never happens. If there is a complaint, we can make this configurable. Link: https://lore.kernel.org/lkml/CAHk-=whVa=nm_GW=NVfPHqcxDbWt4JjjK1YWb0cLjO4ZSGyiDA@mail.gmail.com/ [1] Link: https://codesearch.debian.net/search?q=MMAP_PAGE_ZERO&literal=1&perpkg=1&page=1 [2] Signed-off-by: Jeff Xu Link: https://lore.kernel.org/r/20240806214931.2198172-2-jeffxu@google.com Signed-off-by: Kees Cook --- include/linux/mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76..a178c15812eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4201,4 +4201,14 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); +#ifdef CONFIG_64BIT +int do_mseal(unsigned long start, size_t len_in, unsigned long flags); +#else +static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) +{ + /* noop on 32 bit */ + return 0; +} +#endif + #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From ac9d45544cd571decca395715d0b0a3b617d02f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 13:33:58 -0700 Subject: locking/csd_lock: Provide an indication of ongoing CSD-lock stall If a CSD-lock stall goes on long enough, it will cause an RCU CPU stall warning. This additional warning provides much additional console-log traffic and little additional information. Therefore, provide a new csd_lock_is_stuck() function that returns true if there is an ongoing CSD-lock stall. This function will be used by the RCU CPU stall warnings to provide a one-line indication of the stall when this function returns true. [ neeraj.upadhyay: Apply Rik van Riel feedback. ] [ neeraj.upadhyay: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Imran Khan Cc: Ingo Molnar Cc: Leonardo Bras Cc: "Peter Zijlstra (Intel)" Cc: Rik van Riel Signed-off-by: Neeraj Upadhyay --- include/linux/smp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index fcd61dfe2af3..3871bd32018f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -294,4 +294,10 @@ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG +bool csd_lock_is_stuck(void); +#else +static inline bool csd_lock_is_stuck(void) { return false; } +#endif + #endif /* __LINUX_SMP_H */ -- cgit v1.2.3 From fda70207135b60dd1a7c8f16cc036bb1cc344490 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 16 Apr 2024 11:01:23 +0200 Subject: context_tracking, rcu: Rename rcu_dynticks_curr_cpu_in_eqs() into rcu_is_watching_curr_cpu() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, reflect that change in the related helpers. Note that "watching" is the opposite of "in EQS", so the negation is lifted out of the helper and into the callsites. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index a6c36780cc3b..d53092ffa9db 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -113,13 +113,17 @@ extern void ct_idle_enter(void); extern void ct_idle_exit(void); /* - * Is the current CPU in an extended quiescent state? + * Is RCU watching the current CPU (IOW, it is not in an extended quiescent state)? + * + * Note that this returns the actual boolean data (watching / not watching), + * whereas ct_rcu_watching() returns the RCU_WATCHING subvariable of + * context_tracking.state. * * No ordering, as we are sampling CPU-local information. */ -static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) +static __always_inline bool rcu_is_watching_curr_cpu(void) { - return !(raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING); + return raw_atomic_read(this_cpu_ptr(&context_tracking.state)) & CT_RCU_WATCHING; } /* @@ -140,7 +144,7 @@ static __always_inline bool warn_rcu_enter(void) * lots of the actual reporting also relies on RCU. */ preempt_disable_notrace(); - if (rcu_dynticks_curr_cpu_in_eqs()) { + if (!rcu_is_watching_curr_cpu()) { ret = true; ct_state_inc(CT_RCU_WATCHING); } -- cgit v1.2.3 From 32a9f26e5e266f0116cc8536e5e4544523ddd334 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 29 Apr 2024 16:43:36 +0200 Subject: rcu: Rename rcu_momentary_dyntick_idle() into rcu_momentary_eqs() The context_tracking.state RCU_DYNTICKS subvariable has been renamed to RCU_WATCHING, replace "dyntick_idle" into "eqs" to drop the dyntick reference. Signed-off-by: Valentin Schneider Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- include/linux/rcutiny.h | 2 +- include/linux/rcutree.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index d9ac7b136aea..cf2b5a188f78 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -158,7 +158,7 @@ void rcu_scheduler_starting(void); static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } -static inline void rcu_momentary_dyntick_idle(void) { } +static inline void rcu_momentary_eqs(void) { } static inline void kfree_rcu_scheduler_running(void) { } static inline bool rcu_gp_might_be_stalled(void) { return false; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 254244202ea9..7dbde2b6f714 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -37,7 +37,7 @@ void synchronize_rcu_expedited(void); void kvfree_call_rcu(struct rcu_head *head, void *ptr); void rcu_barrier(void); -void rcu_momentary_dyntick_idle(void); +void rcu_momentary_eqs(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); -- cgit v1.2.3 From 4040b1139904b3f72b56862fc9a8d3e9abb69ffb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 17 Jun 2024 08:51:14 -0700 Subject: context_tracking: Tag context_tracking_enabled_this_cpu() __always_inline Force context_tracking_enabled_this_cpu() to be inlined so that invoking it from guest_context_enter_irqoff(), which KVM uses in non-instrumentable code, doesn't unexpectedly leave a noinstr section. vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x1c7: call to context_tracking_enabled_this_cpu() leaves .noinstr.text section vmlinux.o: warning: objtool: svm_vcpu_enter_exit+0x83: call to context_tracking_enabled_this_cpu() leaves .noinstr.text section Note, the CONFIG_CONTEXT_TRACKING_USER=n stub is already __always_inline. Signed-off-by: Sean Christopherson Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/context_tracking_state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 0dbda59c9f37..7b8433d5a8ef 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -113,7 +113,7 @@ static __always_inline bool context_tracking_enabled_cpu(int cpu) return context_tracking_enabled() && per_cpu(context_tracking.active, cpu); } -static inline bool context_tracking_enabled_this_cpu(void) +static __always_inline bool context_tracking_enabled_this_cpu(void) { return context_tracking_enabled() && __this_cpu_read(context_tracking.active); } -- cgit v1.2.3 From a98ae7f045b29de4f48b191d5aeb4e803183d759 Mon Sep 17 00:00:00 2001 From: Michal Wajdeczko Date: Thu, 25 Jul 2024 12:18:40 +0200 Subject: lib/string_choices: Add str_up_down() helper Add str_up_down() helper to return "up" or "down" string literal. Signed-off-by: Michal Wajdeczko Link: https://lore.kernel.org/r/20240725101841.574-1-michal.wajdeczko@intel.com Signed-off-by: Kees Cook --- include/linux/string_choices.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index d9ebe20229f8..bcde3c9cff81 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -42,6 +42,11 @@ static inline const char *str_yes_no(bool v) return v ? "yes" : "no"; } +static inline const char *str_up_down(bool v) +{ + return v ? "up" : "down"; +} + /** * str_plural - Return the simple pluralization based on English counts * @num: Number used for deciding pluralization -- cgit v1.2.3 From f5c1ca3a15fdb867d2b535003f74e0b975eff116 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 12 Aug 2024 11:29:40 -0700 Subject: string_choices: Add wrapper for str_down_up() The string choice functions which are not clearly true/false synonyms also have inverted wrappers. Add this for str_down_up() as well. Suggested-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240812182939.work.424-kees@kernel.org Reviewed-by: Andy Shevchenko Signed-off-by: Kees Cook --- include/linux/string_choices.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index bcde3c9cff81..1320bcdcb89c 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -46,6 +46,7 @@ static inline const char *str_up_down(bool v) { return v ? "up" : "down"; } +#define str_down_up(v) str_up_down(!(v)) /** * str_plural - Return the simple pluralization based on English counts -- cgit v1.2.3 From 0cb70ee4a6ee6b0a4b0e0f70a64a094c6fe05944 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 14 Aug 2024 13:22:25 +0800 Subject: virtio: rename virtio_config_enabled to virtio_config_core_enabled Following patch will allow the config interrupt to be disabled by a specific driver via another boolean. So this patch renames virtio_config_enabled and relevant helpers to virtio_config_core_enabled. Cc: Venkat Venkatsubra Cc: Gia-Khanh Nguyen Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Link: https://patch.msgid.link/20240814052228.4654-2-jasowang@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/virtio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 4b16844c6bc2..de6d3cc176d9 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -118,7 +118,7 @@ struct virtio_admin_cmd { * struct virtio_device - representation of a device using virtio * @index: unique position on the virtio bus * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore) - * @config_enabled: configuration change reporting enabled + * @config_core_enabled: configuration change reporting enabled by core * @config_change_pending: configuration change reported while disabled * @config_lock: protects configuration change reporting * @vqs_list_lock: protects @vqs. @@ -135,7 +135,7 @@ struct virtio_admin_cmd { struct virtio_device { int index; bool failed; - bool config_enabled; + bool config_core_enabled; bool config_change_pending; spinlock_t config_lock; spinlock_t vqs_list_lock; -- cgit v1.2.3 From 224de6f886f8184fd448700a6d78f216694de948 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 14 Aug 2024 13:22:26 +0800 Subject: virtio: allow driver to disable the configure change notification Sometime, it would be useful to disable the configure change notification from the driver. So this patch allows this by introducing a variable config_change_driver_disabled and only allow the configure change notification callback to be triggered when it is allowed by both the virtio core and the driver. It is set to false by default to hold the current semantic so we don't need to change any drivers. The first user for this would be virtio-net. Cc: Venkat Venkatsubra Cc: Gia-Khanh Nguyen Acked-by: Michael S. Tsirkin Signed-off-by: Jason Wang Link: https://patch.msgid.link/20240814052228.4654-3-jasowang@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/virtio.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index de6d3cc176d9..306137a15d07 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -119,6 +119,8 @@ struct virtio_admin_cmd { * @index: unique position on the virtio bus * @failed: saved value for VIRTIO_CONFIG_S_FAILED bit (for restore) * @config_core_enabled: configuration change reporting enabled by core + * @config_driver_disabled: configuration change reporting disabled by + * a driver * @config_change_pending: configuration change reported while disabled * @config_lock: protects configuration change reporting * @vqs_list_lock: protects @vqs. @@ -136,6 +138,7 @@ struct virtio_device { int index; bool failed; bool config_core_enabled; + bool config_driver_disabled; bool config_change_pending; spinlock_t config_lock; spinlock_t vqs_list_lock; @@ -166,6 +169,10 @@ void __virtqueue_break(struct virtqueue *_vq); void __virtqueue_unbreak(struct virtqueue *_vq); void virtio_config_changed(struct virtio_device *dev); + +void virtio_config_driver_disable(struct virtio_device *dev); +void virtio_config_driver_enable(struct virtio_device *dev); + #ifdef CONFIG_PM_SLEEP int virtio_device_freeze(struct virtio_device *dev); int virtio_device_restore(struct virtio_device *dev); -- cgit v1.2.3 From 3942bb49728ad9e1f94d953a88af169a8f5d8099 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Wed, 14 Aug 2024 13:00:34 +0300 Subject: string: add mem_is_zero() helper to check if memory area is all zeros Almost two thirds of the memchr_inv() usages check if the memory area is all zeros, with no interest in where in the buffer the first non-zero byte is located. Checking for !memchr_inv(s, 0, n) is also not very intuitive or discoverable. Add an explicit mem_is_zero() helper for this use case. Reviewed-by: Kees Cook Reviewed-by: Andy Shevchenko Link: https://patchwork.freedesktop.org/patch/msgid/20240814100035.3100852-1-jani.nikula@intel.com Signed-off-by: Jani Nikula --- include/linux/string.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 9edace076ddb..5855c5626b4b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -279,6 +279,18 @@ static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt) void *memchr_inv(const void *s, int c, size_t n); char *strreplace(char *str, char old, char new); +/** + * mem_is_zero - Check if an area of memory is all 0's. + * @s: The memory area + * @n: The size of the area + * + * Return: True if the area of memory is all 0's. + */ +static inline bool mem_is_zero(const void *s, size_t n) +{ + return !memchr_inv(s, 0, n); +} + extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp) __malloc; -- cgit v1.2.3 From bf5ffc8c80e0cf5205849cd0c9c3cb261d2beee6 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:18 -0600 Subject: perf: arm_pmu: Remove event index to counter remapping Xscale and Armv6 PMUs defined the cycle counter at 0 and event counters starting at 1 and had 1:1 event index to counter numbering. On Armv7 and later, this changed the cycle counter to 31 and event counters start at 0. The drivers for Armv7 and PMUv3 kept the old event index numbering and introduced an event index to counter conversion. The conversion uses masking to convert from event index to a counter number. This operation relies on having at most 32 counters so that the cycle counter index 0 can be transformed to counter number 31. Armv9.4 adds support for an additional fixed function counter (instructions) which increases possible counters to more than 32, and the conversion won't work anymore as a simple subtract and mask. The primary reason for the translation (other than history) seems to be to have a contiguous mask of counters 0-N. Keeping that would result in more complicated index to counter conversions. Instead, store a mask of available counters rather than just number of events. That provides more information in addition to the number of events. No (intended) functional changes. Acked-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-1-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- include/linux/perf/arm_pmu.h | 2 +- include/linux/perf/arm_pmuv3.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index b3b34f6670cf..e5d6d204beab 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -96,7 +96,7 @@ struct arm_pmu { void (*stop)(struct arm_pmu *); void (*reset)(void *); int (*map_event)(struct perf_event *event); - int num_events; + DECLARE_BITMAP(cntr_mask, ARMPMU_MAX_HWEVENTS); bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 7867db04ec98..eccbdd8eb98f 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -6,6 +6,7 @@ #ifndef __PERF_ARM_PMUV3_H #define __PERF_ARM_PMUV3_H +#define ARMV8_PMU_MAX_GENERAL_COUNTERS 31 #define ARMV8_PMU_MAX_COUNTERS 32 #define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1) -- cgit v1.2.3 From f9b11aa00708d94a0cd78bfde34b68c0f95d8b50 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:21 -0600 Subject: KVM: arm64: pmu: Use generated define for PMSELR_EL0.SEL access ARMV8_PMU_COUNTER_MASK is really a mask for the PMSELR_EL0.SEL register field. Make that clear by adding a standard sysreg definition for the register, and using it instead. Reviewed-by: Mark Rutland Acked-by: Mark Rutland Reviewed-by: Marc Zyngier Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-4-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index eccbdd8eb98f..792b8e10b72a 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -8,7 +8,6 @@ #define ARMV8_PMU_MAX_GENERAL_COUNTERS 31 #define ARMV8_PMU_MAX_COUNTERS 32 -#define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1) /* * Common architectural and microarchitectural event numbers. -- cgit v1.2.3 From 126d7d7cce5e048fb82477a9842d088d10ff0df6 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:22 -0600 Subject: arm64: perf/kvm: Use a common PMU cycle counter define The PMUv3 and KVM code each have a define for the PMU cycle counter index. Move KVM's define to a shared location and use it for PMUv3 driver. Reviewed-by: Marc Zyngier Acked-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-5-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 792b8e10b72a..f4ec76f725a3 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -9,6 +9,9 @@ #define ARMV8_PMU_MAX_GENERAL_COUNTERS 31 #define ARMV8_PMU_MAX_COUNTERS 32 +#define ARMV8_PMU_CYCLE_IDX 31 + + /* * Common architectural and microarchitectural event numbers. */ -- cgit v1.2.3 From 2f62701fa5b0ee94c68d2fcfc470d08aef195441 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:23 -0600 Subject: KVM: arm64: Refine PMU defines for number of counters There are 2 defines for the number of PMU counters: ARMV8_PMU_MAX_COUNTERS and ARMPMU_MAX_HWEVENTS. Both are the same currently, but Armv9.4/8.9 increases the number of possible counters from 32 to 33. With this change, the maximum number of counters will differ for KVM's PMU emulation which is PMUv3.4. Give KVM PMU emulation its own define to decouple it from the rest of the kernel's number PMU counters. The VHE PMU code needs to match the PMU driver, so switch it to use ARMPMU_MAX_HWEVENTS instead. Acked-by: Mark Rutland Reviewed-by: Marc Zyngier Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-6-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- include/linux/perf/arm_pmuv3.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index f4ec76f725a3..4f7a7f2222e5 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -7,8 +7,6 @@ #define __PERF_ARM_PMUV3_H #define ARMV8_PMU_MAX_GENERAL_COUNTERS 31 -#define ARMV8_PMU_MAX_COUNTERS 32 - #define ARMV8_PMU_CYCLE_IDX 31 -- cgit v1.2.3 From d8226d8cfbaf5eb9771af8ad8b4e58697e2ffb74 Mon Sep 17 00:00:00 2001 From: "Rob Herring (Arm)" Date: Wed, 31 Jul 2024 10:51:24 -0600 Subject: perf: arm_pmuv3: Add support for Armv9.4 PMU instruction counter Armv9.4/8.9 PMU adds optional support for a fixed instruction counter similar to the fixed cycle counter. Support for the feature is indicated in the ID_AA64DFR1_EL1 register PMICNTR field. The counter is not accessible in AArch32. Existing userspace using direct counter access won't know how to handle the fixed instruction counter, so we have to avoid using the counter when user access is requested. Acked-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Tested-by: James Clark Link: https://lore.kernel.org/r/20240731-arm-pmu-3-9-icntr-v3-7-280a8d7ff465@kernel.org Signed-off-by: Will Deacon --- include/linux/perf/arm_pmu.h | 8 ++++++-- include/linux/perf/arm_pmuv3.h | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index e5d6d204beab..4b5b83677e3f 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -17,10 +17,14 @@ #ifdef CONFIG_ARM_PMU /* - * The ARMv7 CPU PMU supports up to 32 event counters. + * The Armv7 and Armv8.8 or less CPU PMU supports up to 32 event counters. + * The Armv8.9/9.4 CPU PMU supports up to 33 event counters. */ +#ifdef CONFIG_ARM #define ARMPMU_MAX_HWEVENTS 32 - +#else +#define ARMPMU_MAX_HWEVENTS 33 +#endif /* * ARM PMU hw_event flags */ diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index 4f7a7f2222e5..3372c1b56486 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -8,7 +8,7 @@ #define ARMV8_PMU_MAX_GENERAL_COUNTERS 31 #define ARMV8_PMU_CYCLE_IDX 31 - +#define ARMV8_PMU_INSTR_IDX 32 /* Not accessible from AArch32 */ /* * Common architectural and microarchitectural event numbers. @@ -228,8 +228,10 @@ */ #define ARMV8_PMU_OVSR_P GENMASK(30, 0) #define ARMV8_PMU_OVSR_C BIT(31) +#define ARMV8_PMU_OVSR_F BIT_ULL(32) /* arm64 only */ /* Mask for writable bits is both P and C fields */ -#define ARMV8_PMU_OVERFLOWED_MASK (ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C) +#define ARMV8_PMU_OVERFLOWED_MASK (ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C | \ + ARMV8_PMU_OVSR_F) /* * PMXEVTYPER: Event selection reg -- cgit v1.2.3 From fca5cb4dd2b4a9423cb6d112cc71c33899955a1f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 10 Aug 2024 14:20:55 +0800 Subject: Revert "lib/mpi: Extend the MPI library" This partially reverts commit a8ea8bdd9df92a0e5db5b43900abb7a288b8a53e. Most of it is no longer needed since sm2 has been removed. However, the following functions have been kept as they have developed other uses: mpi_copy mpi_mod mpi_test_bit mpi_set_bit mpi_rshift mpi_add mpi_sub mpi_addm mpi_subm mpi_mul mpi_mulm mpi_tdiv_r mpi_fdiv_r Signed-off-by: Herbert Xu --- include/linux/mpi.h | 65 ----------------------------------------------------- 1 file changed, 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mpi.h b/include/linux/mpi.h index 89b720893e12..e081428b91ef 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -40,87 +40,33 @@ struct gcry_mpi { typedef struct gcry_mpi *MPI; #define mpi_get_nlimbs(a) ((a)->nlimbs) -#define mpi_has_sign(a) ((a)->sign) /*-- mpiutil.c --*/ MPI mpi_alloc(unsigned nlimbs); -void mpi_clear(MPI a); void mpi_free(MPI a); int mpi_resize(MPI a, unsigned nlimbs); -static inline MPI mpi_new(unsigned int nbits) -{ - return mpi_alloc((nbits + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB); -} - MPI mpi_copy(MPI a); -MPI mpi_alloc_like(MPI a); -void mpi_snatch(MPI w, MPI u); -MPI mpi_set(MPI w, MPI u); -MPI mpi_set_ui(MPI w, unsigned long u); -MPI mpi_alloc_set_ui(unsigned long u); -void mpi_swap_cond(MPI a, MPI b, unsigned long swap); - -/* Constants used to return constant MPIs. See mpi_init if you - * want to add more constants. - */ -#define MPI_NUMBER_OF_CONSTANTS 6 -enum gcry_mpi_constants { - MPI_C_ZERO, - MPI_C_ONE, - MPI_C_TWO, - MPI_C_THREE, - MPI_C_FOUR, - MPI_C_EIGHT -}; - -MPI mpi_const(enum gcry_mpi_constants no); /*-- mpicoder.c --*/ - -/* Different formats of external big integer representation. */ -enum gcry_mpi_format { - GCRYMPI_FMT_NONE = 0, - GCRYMPI_FMT_STD = 1, /* Twos complement stored without length. */ - GCRYMPI_FMT_PGP = 2, /* As used by OpenPGP (unsigned only). */ - GCRYMPI_FMT_SSH = 3, /* As used by SSH (like STD but with length). */ - GCRYMPI_FMT_HEX = 4, /* Hex format. */ - GCRYMPI_FMT_USG = 5, /* Like STD but unsigned. */ - GCRYMPI_FMT_OPAQUE = 8 /* Opaque format (some functions only). */ -}; - MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes); MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread); -int mpi_fromstr(MPI val, const char *str); -MPI mpi_scanval(const char *string); MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len); void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign); int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, int *sign); int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes, int *sign); -int mpi_print(enum gcry_mpi_format format, unsigned char *buffer, - size_t buflen, size_t *nwritten, MPI a); /*-- mpi-mod.c --*/ void mpi_mod(MPI rem, MPI dividend, MPI divisor); -/* Context used with Barrett reduction. */ -struct barrett_ctx_s; -typedef struct barrett_ctx_s *mpi_barrett_t; - -mpi_barrett_t mpi_barrett_init(MPI m, int copy); -void mpi_barrett_free(mpi_barrett_t ctx); -void mpi_mod_barrett(MPI r, MPI x, mpi_barrett_t ctx); -void mpi_mul_barrett(MPI w, MPI u, MPI v, mpi_barrett_t ctx); - /*-- mpi-pow.c --*/ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod); /*-- mpi-cmp.c --*/ int mpi_cmp_ui(MPI u, ulong v); int mpi_cmp(MPI u, MPI v); -int mpi_cmpabs(MPI u, MPI v); /*-- mpi-sub-ui.c --*/ int mpi_sub_ui(MPI w, MPI u, unsigned long vval); @@ -130,16 +76,9 @@ void mpi_normalize(MPI a); unsigned mpi_get_nbits(MPI a); int mpi_test_bit(MPI a, unsigned int n); void mpi_set_bit(MPI a, unsigned int n); -void mpi_set_highbit(MPI a, unsigned int n); -void mpi_clear_highbit(MPI a, unsigned int n); -void mpi_clear_bit(MPI a, unsigned int n); -void mpi_rshift_limbs(MPI a, unsigned int count); void mpi_rshift(MPI x, MPI a, unsigned int n); -void mpi_lshift_limbs(MPI a, unsigned int count); -void mpi_lshift(MPI x, MPI a, unsigned int n); /*-- mpi-add.c --*/ -void mpi_add_ui(MPI w, MPI u, unsigned long v); void mpi_add(MPI w, MPI u, MPI v); void mpi_sub(MPI w, MPI u, MPI v); void mpi_addm(MPI w, MPI u, MPI v, MPI m); @@ -152,10 +91,6 @@ void mpi_mulm(MPI w, MPI u, MPI v, MPI m); /*-- mpi-div.c --*/ void mpi_tdiv_r(MPI rem, MPI num, MPI den); void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor); -void mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor); - -/*-- mpi-inv.c --*/ -int mpi_invm(MPI x, MPI a, MPI n); /* inline functions */ -- cgit v1.2.3 From 8e3a67f2de87ee94ac11ea69beb4edc2993b17a0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 10 Aug 2024 14:20:57 +0800 Subject: crypto: lib/mpi - Add error checks to extension The remaining functions added by commit a8ea8bdd9df92a0e5db5b43900abb7a288b8a53e did not check for memory allocation errors. Add the checks and change the API to allow errors to be returned. Fixes: a8ea8bdd9df9 ("lib/mpi: Extend the MPI library") Signed-off-by: Herbert Xu --- include/linux/mpi.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mpi.h b/include/linux/mpi.h index e081428b91ef..47be46f36435 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -59,7 +59,7 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes, int *sign); /*-- mpi-mod.c --*/ -void mpi_mod(MPI rem, MPI dividend, MPI divisor); +int mpi_mod(MPI rem, MPI dividend, MPI divisor); /*-- mpi-pow.c --*/ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod); @@ -75,22 +75,22 @@ int mpi_sub_ui(MPI w, MPI u, unsigned long vval); void mpi_normalize(MPI a); unsigned mpi_get_nbits(MPI a); int mpi_test_bit(MPI a, unsigned int n); -void mpi_set_bit(MPI a, unsigned int n); -void mpi_rshift(MPI x, MPI a, unsigned int n); +int mpi_set_bit(MPI a, unsigned int n); +int mpi_rshift(MPI x, MPI a, unsigned int n); /*-- mpi-add.c --*/ -void mpi_add(MPI w, MPI u, MPI v); -void mpi_sub(MPI w, MPI u, MPI v); -void mpi_addm(MPI w, MPI u, MPI v, MPI m); -void mpi_subm(MPI w, MPI u, MPI v, MPI m); +int mpi_add(MPI w, MPI u, MPI v); +int mpi_sub(MPI w, MPI u, MPI v); +int mpi_addm(MPI w, MPI u, MPI v, MPI m); +int mpi_subm(MPI w, MPI u, MPI v, MPI m); /*-- mpi-mul.c --*/ -void mpi_mul(MPI w, MPI u, MPI v); -void mpi_mulm(MPI w, MPI u, MPI v, MPI m); +int mpi_mul(MPI w, MPI u, MPI v); +int mpi_mulm(MPI w, MPI u, MPI v, MPI m); /*-- mpi-div.c --*/ -void mpi_tdiv_r(MPI rem, MPI num, MPI den); -void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor); +int mpi_tdiv_r(MPI rem, MPI num, MPI den); +int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor); /* inline functions */ -- cgit v1.2.3 From abc158c82ae555078aa5dd2d8407c3df0f868904 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 May 2024 10:55:59 +0200 Subject: sched: Prepare generic code for delayed dequeue While most of the delayed dequeue code can be done inside the sched_class itself, there is one location where we do not have an appropriate hook, namely ttwu_runnable(). Add an ENQUEUE_DELAYED call to the on_rq path to deal with waking delayed dequeue tasks. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20240727105029.200000445@infradead.org --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c1b4ee3234f..f4a648e739e1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -544,6 +544,7 @@ struct sched_entity { struct list_head group_node; unsigned int on_rq; + unsigned int sched_delayed; u64 exec_start; u64 sum_exec_runtime; -- cgit v1.2.3 From a1c446611e31ca5363d4db51e398271da1dce0af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Jul 2024 21:30:09 +0200 Subject: sched,freezer: Mark TASK_FROZEN special The special task states are those that do not suffer spurious wakeups, TASK_FROZEN is very much one of those, mark it as such. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20240727105029.998329901@infradead.org --- include/linux/sched.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f4a648e739e1..8a3a389bd623 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -149,8 +149,9 @@ struct user_event_mm; * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ -#define is_special_task_state(state) \ - ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) +#define is_special_task_state(state) \ + ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ + TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ -- cgit v1.2.3 From 82e9d0456e06cebe2c89f3c73cdbc9e3805e9437 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 31 May 2024 15:49:40 +0200 Subject: sched/fair: Avoid re-setting virtual deadline on 'migrations' During OSPM24 Youssef noted that migrations are re-setting the virtual deadline. Notably everything that does a dequeue-enqueue, like setting nice, changing preferred numa-node, and a myriad of other random crap, will cause this to happen. This shouldn't be. Preserve the relative virtual deadline across such dequeue/enqueue cycles. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20240727105030.625119246@infradead.org --- include/linux/sched.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8a3a389bd623..d25e1cfd5766 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -544,8 +544,10 @@ struct sched_entity { u64 min_vruntime; struct list_head group_node; - unsigned int on_rq; - unsigned int sched_delayed; + unsigned char on_rq; + unsigned char sched_delayed; + unsigned char rel_deadline; + /* hole */ u64 exec_start; u64 sum_exec_runtime; -- cgit v1.2.3 From 857b158dc5e81c6de795ef6be006eed146098fc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 May 2023 13:46:30 +0200 Subject: sched/eevdf: Use sched_attr::sched_runtime to set request/slice suggestion Allow applications to directly set a suggested request/slice length using sched_attr::sched_runtime. The implementation clamps the value to: 0.1[ms] <= slice <= 100[ms] which is 1/10 the size of HZ=1000 and 10 times the size of HZ=100. Applications should strive to use their periodic runtime at a high confidence interval (95%+) as the target slice. Using a smaller slice will introduce undue preemptions, while using a larger value will increase latency. For all the following examples assume a scheduling quantum of 8, and for consistency all examples have W=4: {A,B,C,D}(w=1,r=8): ABCD... +---+---+---+--- t=0, V=1.5 t=1, V=3.5 A |------< A |------< B |------< B |------< C |------< C |------< D |------< D |------< ---+*------+-------+--- ---+--*----+-------+--- t=2, V=5.5 t=3, V=7.5 A |------< A |------< B |------< B |------< C |------< C |------< D |------< D |------< ---+----*--+-------+--- ---+------*+-------+--- Note: 4 identical tasks in FIFO order ~~~ {A,B}(w=1,r=16) C(w=2,r=16) AACCBBCC... +---+---+---+--- t=0, V=1.25 t=2, V=5.25 A |--------------< A |--------------< B |--------------< B |--------------< C |------< C |------< ---+*------+-------+--- ---+----*--+-------+--- t=4, V=8.25 t=6, V=12.25 A |--------------< A |--------------< B |--------------< B |--------------< C |------< C |------< ---+-------*-------+--- ---+-------+---*---+--- Note: 1 heavy task -- because q=8, double r such that the deadline of the w=2 task doesn't go below q. Note: observe the full schedule becomes: W*max(r_i/w_i) = 4*2q = 8q in length. Note: the period of the heavy task is half the full period at: W*(r_i/w_i) = 4*(2q/2) = 4q ~~~ {A,C,D}(w=1,r=16) B(w=1,r=8): BAACCBDD... +---+---+---+--- t=0, V=1.5 t=1, V=3.5 A |--------------< A |---------------< B |------< B |------< C |--------------< C |--------------< D |--------------< D |--------------< ---+*------+-------+--- ---+--*----+-------+--- t=3, V=7.5 t=5, V=11.5 A |---------------< A |---------------< B |------< B |------< C |--------------< C |--------------< D |--------------< D |--------------< ---+------*+-------+--- ---+-------+--*----+--- t=6, V=13.5 A |---------------< B |------< C |--------------< D |--------------< ---+-------+----*--+--- Note: 1 short task -- again double r so that the deadline of the short task won't be below q. Made B short because its not the leftmost task, but is eligible with the 0,1,2,3 spread. Note: like with the heavy task, the period of the short task observes: W*(r_i/w_i) = 4*(1q/1) = 4q ~~~ A(w=1,r=16) B(w=1,r=8) C(w=2,r=16) BCCAABCC... +---+---+---+--- t=0, V=1.25 t=1, V=3.25 A |--------------< A |--------------< B |------< B |------< C |------< C |------< ---+*------+-------+--- ---+--*----+-------+--- t=3, V=7.25 t=5, V=11.25 A |--------------< A |--------------< B |------< B |------< C |------< C |------< ---+------*+-------+--- ---+-------+--*----+--- t=6, V=13.25 A |--------------< B |------< C |------< ---+-------+----*--+--- Note: 1 heavy and 1 short task -- combine them all. Note: both the short and heavy task end up with a period of 4q ~~~ A(w=1,r=16) B(w=2,r=16) C(w=1,r=8) BBCAABBC... +---+---+---+--- t=0, V=1 t=2, V=5 A |--------------< A |--------------< B |------< B |------< C |------< C |------< ---+*------+-------+--- ---+----*--+-------+--- t=3, V=7 t=5, V=11 A |--------------< A |--------------< B |------< B |------< C |------< C |------< ---+------*+-------+--- ---+-------+--*----+--- t=7, V=15 A |--------------< B |------< C |------< ---+-------+------*+--- Note: as before but permuted ~~~ From all this it can be deduced that, for the steady state: - the total period (P) of a schedule is: W*max(r_i/w_i) - the average period of a task is: W*(r_i/w_i) - each task obtains the fair share: w_i/W of each full period P Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20240727105030.842834421@infradead.org --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d25e1cfd5766..89a3d8d94e96 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,6 +547,7 @@ struct sched_entity { unsigned char on_rq; unsigned char sched_delayed; unsigned char rel_deadline; + unsigned char custom_slice; /* hole */ u64 exec_start; -- cgit v1.2.3 From aef6987d89544d63a47753cf3741cabff0b5574c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 20 Jun 2024 13:16:49 +0200 Subject: sched/eevdf: Propagate min_slice up the cgroup hierarchy In the absence of an explicit cgroup slice configureation, make mixed slice length work with cgroups by propagating the min_slice up the hierarchy. This ensures the cgroup entity gets timely service to service its entities that have this timing constraint set on them. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20240727105030.948188417@infradead.org --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 89a3d8d94e96..3709dedbab59 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -542,6 +542,7 @@ struct sched_entity { struct rb_node run_node; u64 deadline; u64 min_vruntime; + u64 min_slice; struct list_head group_node; unsigned char on_rq; -- cgit v1.2.3 From fda1dd3c54ef3c4200c2e77634a91610da973950 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 18 Jul 2024 17:50:37 -0700 Subject: find: Switch from inline to __always_inline 'inline' keyword is only a recommendation for compiler. If it decides to not inline find_bit nodemask functions, the whole small_const_nbits() machinery doesn't work. This is how a standard GCC 11.3.0 does for my x86_64 build now. This patch replaces 'inline' directive with unconditional '__always_inline' to make sure that there's always a chance for compile-time optimization. It doesn't change size of kernel image, according to bloat-o-meter. [[ Brian: split out from: Subject: [PATCH 1/3] bitmap: switch from inline to __always_inline https://lore.kernel.org/all/20221027043810.350460-2-yury.norov@gmail.com/ But rewritten, as there were too many conflicts. ]] Co-developed-by: Brian Norris Signed-off-by: Brian Norris Reviewed-by: Kees Cook Reviewed-by: Nathan Chancellor Signed-off-by: Yury Norov --- include/linux/find.h | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/find.h b/include/linux/find.h index 5dfca4225fef..68685714bc18 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -52,7 +52,7 @@ unsigned long _find_next_bit_le(const unsigned long *addr, unsigned * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { @@ -81,7 +81,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size, * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -112,7 +112,7 @@ unsigned long find_next_and_bit(const unsigned long *addr1, * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -142,7 +142,7 @@ unsigned long find_next_andnot_bit(const unsigned long *addr1, * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -171,7 +171,7 @@ unsigned long find_next_or_bit(const unsigned long *addr1, * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ -static inline +static __always_inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { @@ -198,7 +198,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_first_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -224,7 +224,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) * Returns the bit number of the N'th set bit. * If no such, returns >= @size. */ -static inline +static __always_inline unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n) { if (n >= size) @@ -249,7 +249,7 @@ unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsign * Returns the bit number of the N'th set bit. * If no such, returns @size. */ -static inline +static __always_inline unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { @@ -276,7 +276,7 @@ unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long * * Returns the bit number of the N'th set bit. * If no such, returns @size. */ -static inline +static __always_inline unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long n) { @@ -332,7 +332,7 @@ unsigned long find_nth_and_andnot_bit(const unsigned long *addr1, * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_first_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size) @@ -357,7 +357,7 @@ unsigned long find_first_and_bit(const unsigned long *addr1, * Returns the bit number for the first set bit * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2, const unsigned long *addr3, @@ -381,7 +381,7 @@ unsigned long find_first_and_and_bit(const unsigned long *addr1, * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ -static inline +static __always_inline unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -402,7 +402,7 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) * * Returns the bit number of the last set bit, or size. */ -static inline +static __always_inline unsigned long find_last_bit(const unsigned long *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -425,7 +425,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_and_bit_wrap(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) @@ -448,7 +448,7 @@ unsigned long find_next_and_bit_wrap(const unsigned long *addr1, * Returns the bit number for the next set bit, or first set bit up to @offset * If no bits are set, returns @size. */ -static inline +static __always_inline unsigned long find_next_bit_wrap(const unsigned long *addr, unsigned long size, unsigned long offset) { @@ -465,7 +465,7 @@ unsigned long find_next_bit_wrap(const unsigned long *addr, * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing * before using it alone. */ -static inline +static __always_inline unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size, unsigned long start, unsigned long n) { @@ -506,20 +506,20 @@ extern unsigned long find_next_clump8(unsigned long *clump, #if defined(__LITTLE_ENDIAN) -static inline unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset) +static __always_inline +unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_zero_bit(addr, size, offset); } -static inline unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset) +static __always_inline +unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { return find_next_bit(addr, size, offset); } -static inline unsigned long find_first_zero_bit_le(const void *addr, - unsigned long size) +static __always_inline +unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { return find_first_zero_bit(addr, size); } @@ -527,7 +527,7 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, #elif defined(__BIG_ENDIAN) #ifndef find_next_zero_bit_le -static inline +static __always_inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { @@ -546,7 +546,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned #endif #ifndef find_first_zero_bit_le -static inline +static __always_inline unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) { if (small_const_nbits(size)) { @@ -560,7 +560,7 @@ unsigned long find_first_zero_bit_le(const void *addr, unsigned long size) #endif #ifndef find_next_bit_le -static inline +static __always_inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { -- cgit v1.2.3 From ed8cd2b3bd9f070120528fc5403a2d0b5afe07f8 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 18 Jul 2024 17:50:38 -0700 Subject: bitmap: Switch from inline to __always_inline 'inline' keyword is only a recommendation for compiler. If it decides to not inline bitmap functions, the whole small_const_nbits() machinery doesn't work. This is how a standard GCC 11.3.0 does for my x86_64 build now. This patch replaces 'inline' directive with unconditional '__always_inline' to make sure that there's always a chance for compile-time optimization. It doesn't change size of kernel image, according to bloat-o-meter. [[ Brian: split out from: Subject: [PATCH 1/3] bitmap: switch from inline to __always_inline https://lore.kernel.org/all/20221027043810.350460-2-yury.norov@gmail.com/ But rewritten, as there were too many conflicts. ]] Co-developed-by: Brian Norris Signed-off-by: Brian Norris Reviewed-by: Kees Cook Reviewed-by: Nathan Chancellor Signed-off-by: Yury Norov --- include/linux/bitmap.h | 140 +++++++++++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 8c4768c44a01..0406f48f0a6a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -203,12 +203,12 @@ unsigned long bitmap_find_next_zero_area_off(unsigned long *map, * the bit offset of all zero areas this function finds is multiples of that * power of 2. A @align_mask of 0 means no alignment is required. */ -static inline unsigned long -bitmap_find_next_zero_area(unsigned long *map, - unsigned long size, - unsigned long start, - unsigned int nr, - unsigned long align_mask) +static __always_inline +unsigned long bitmap_find_next_zero_area(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned int nr, + unsigned long align_mask) { return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); @@ -228,7 +228,7 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig, #define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) -static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) +static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = bitmap_size(nbits); @@ -238,7 +238,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) memset(dst, 0, len); } -static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) +static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { unsigned int len = bitmap_size(nbits); @@ -248,8 +248,8 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) memset(dst, 0xff, len); } -static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, - unsigned int nbits) +static __always_inline +void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits) { unsigned int len = bitmap_size(nbits); @@ -262,8 +262,8 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, /* * Copy bitmap and clear tail bits in last word. */ -static inline void bitmap_copy_clear_tail(unsigned long *dst, - const unsigned long *src, unsigned int nbits) +static __always_inline +void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits) { bitmap_copy(dst, src, nbits); if (nbits % BITS_PER_LONG) @@ -306,16 +306,18 @@ void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits); bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits)) #endif -static inline bool bitmap_and(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) +static __always_inline +bool bitmap_and(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } -static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) +static __always_inline +void bitmap_or(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; @@ -323,8 +325,9 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, __bitmap_or(dst, src1, src2, nbits); } -static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) +static __always_inline +void bitmap_xor(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; @@ -332,16 +335,17 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, __bitmap_xor(dst, src1, src2, nbits); } -static inline bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) +static __always_inline +bool bitmap_andnot(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } -static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, - unsigned int nbits) +static __always_inline +void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = ~(*src); @@ -356,8 +360,8 @@ static inline void bitmap_complement(unsigned long *dst, const unsigned long *sr #endif #define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1) -static inline bool bitmap_equal(const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) +static __always_inline +bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); @@ -376,10 +380,9 @@ static inline bool bitmap_equal(const unsigned long *src1, * * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise */ -static inline bool bitmap_or_equal(const unsigned long *src1, - const unsigned long *src2, - const unsigned long *src3, - unsigned int nbits) +static __always_inline +bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2, + const unsigned long *src3, unsigned int nbits) { if (!small_const_nbits(nbits)) return __bitmap_or_equal(src1, src2, src3, nbits); @@ -387,9 +390,8 @@ static inline bool bitmap_or_equal(const unsigned long *src1, return !(((*src1 | *src2) ^ *src3) & BITMAP_LAST_WORD_MASK(nbits)); } -static inline bool bitmap_intersects(const unsigned long *src1, - const unsigned long *src2, - unsigned int nbits) +static __always_inline +bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; @@ -397,8 +399,8 @@ static inline bool bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } -static inline bool bitmap_subset(const unsigned long *src1, - const unsigned long *src2, unsigned int nbits) +static __always_inline +bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); @@ -406,7 +408,8 @@ static inline bool bitmap_subset(const unsigned long *src1, return __bitmap_subset(src1, src2, nbits); } -static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) +static __always_inline +bool bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -414,7 +417,8 @@ static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) return find_first_bit(src, nbits) == nbits; } -static inline bool bitmap_full(const unsigned long *src, unsigned int nbits) +static __always_inline +bool bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); @@ -448,8 +452,8 @@ unsigned long bitmap_weight_andnot(const unsigned long *src1, return __bitmap_weight_andnot(src1, src2, nbits); } -static __always_inline void bitmap_set(unsigned long *map, unsigned int start, - unsigned int nbits) +static __always_inline +void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __set_bit(start, map); @@ -464,8 +468,8 @@ static __always_inline void bitmap_set(unsigned long *map, unsigned int start, __bitmap_set(map, start, nbits); } -static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, - unsigned int nbits) +static __always_inline +void bitmap_clear(unsigned long *map, unsigned int start, unsigned int nbits) { if (__builtin_constant_p(nbits) && nbits == 1) __clear_bit(start, map); @@ -480,8 +484,9 @@ static __always_inline void bitmap_clear(unsigned long *map, unsigned int start, __bitmap_clear(map, start, nbits); } -static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits) +static __always_inline +void bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift; @@ -489,8 +494,9 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *s __bitmap_shift_right(dst, src, shift, nbits); } -static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits) +static __always_inline +void bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits); @@ -498,11 +504,12 @@ static inline void bitmap_shift_left(unsigned long *dst, const unsigned long *sr __bitmap_shift_left(dst, src, shift, nbits); } -static inline void bitmap_replace(unsigned long *dst, - const unsigned long *old, - const unsigned long *new, - const unsigned long *mask, - unsigned int nbits) +static __always_inline +void bitmap_replace(unsigned long *dst, + const unsigned long *old, + const unsigned long *new, + const unsigned long *mask, + unsigned int nbits) { if (small_const_nbits(nbits)) *dst = (*old & ~(*mask)) | (*new & *mask); @@ -545,8 +552,9 @@ static inline void bitmap_replace(unsigned long *dst, * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation. * See bitmap_scatter() for details related to this relationship. */ -static inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, - const unsigned long *mask, unsigned int nbits) +static __always_inline +void bitmap_scatter(unsigned long *dst, const unsigned long *src, + const unsigned long *mask, unsigned int nbits) { unsigned int n = 0; unsigned int bit; @@ -599,8 +607,9 @@ static inline void bitmap_scatter(unsigned long *dst, const unsigned long *src, * bitmap_scatter(res, src, mask, n) and a call to * bitmap_scatter(res, result, mask, n) will lead to the same res value. */ -static inline void bitmap_gather(unsigned long *dst, const unsigned long *src, - const unsigned long *mask, unsigned int nbits) +static __always_inline +void bitmap_gather(unsigned long *dst, const unsigned long *src, + const unsigned long *mask, unsigned int nbits) { unsigned int n = 0; unsigned int bit; @@ -611,9 +620,9 @@ static inline void bitmap_gather(unsigned long *dst, const unsigned long *src, __assign_bit(n++, dst, test_bit(bit, src)); } -static inline void bitmap_next_set_region(unsigned long *bitmap, - unsigned int *rs, unsigned int *re, - unsigned int end) +static __always_inline +void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, + unsigned int *re, unsigned int end) { *rs = find_next_bit(bitmap, end, *rs); *re = find_next_zero_bit(bitmap, end, *rs + 1); @@ -628,7 +637,8 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, * This is the complement to __bitmap_find_free_region() and releases * the found region (by clearing it in the bitmap). */ -static inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) +static __always_inline +void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { bitmap_clear(bitmap, pos, BIT(order)); } @@ -644,7 +654,8 @@ static inline void bitmap_release_region(unsigned long *bitmap, unsigned int pos * Returns: 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). */ -static inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) +static __always_inline +int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { unsigned int len = BIT(order); @@ -668,7 +679,8 @@ static inline int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos * Returns: the bit offset in bitmap of the allocated region, * or -errno on failure. */ -static inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) +static __always_inline +int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) { unsigned int pos, end; /* scans bitmap by regions of size order */ @@ -722,7 +734,7 @@ static inline int bitmap_find_free_region(unsigned long *bitmap, unsigned int bi * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, * but we expect the lower 32-bits of u64. */ -static inline void bitmap_from_u64(unsigned long *dst, u64 mask) +static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask) { bitmap_from_arr64(dst, &mask, 64); } @@ -737,9 +749,8 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask) * @map memory region. For @nbits = 0 and @nbits > BITS_PER_LONG the return * value is undefined. */ -static inline unsigned long bitmap_read(const unsigned long *map, - unsigned long start, - unsigned long nbits) +static __always_inline +unsigned long bitmap_read(const unsigned long *map, unsigned long start, unsigned long nbits) { size_t index = BIT_WORD(start); unsigned long offset = start % BITS_PER_LONG; @@ -772,8 +783,9 @@ static inline unsigned long bitmap_read(const unsigned long *map, * * For @nbits == 0 and @nbits > BITS_PER_LONG no writes are performed. */ -static inline void bitmap_write(unsigned long *map, unsigned long value, - unsigned long start, unsigned long nbits) +static __always_inline +void bitmap_write(unsigned long *map, unsigned long value, + unsigned long start, unsigned long nbits) { size_t index; unsigned long offset; -- cgit v1.2.3 From ab6b1010dab68f6d4bf063517db4ce2d63554bc6 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 18 Jul 2024 17:50:39 -0700 Subject: cpumask: Switch from inline to __always_inline On recent (v6.6+) builds with Clang (based on Clang 18.0.0) and certain configurations [0], I'm finding that (lack of) inlining decisions may lead to section mismatch warnings like the following: WARNING: modpost: vmlinux.o: section mismatch in reference: cpumask_andnot (section: .text) -> cpuhp_bringup_cpus_parallel.tmp_mask (section: .init.data) ERROR: modpost: Section mismatches detected. or more confusingly: WARNING: modpost: vmlinux: section mismatch in reference: cpumask_andnot+0x5f (section: .text) -> efi_systab_phys (section: .init.data) The first warning makes a little sense, because cpuhp_bringup_cpus_parallel() (an __init function) calls cpumask_andnot() on tmp_mask (an __initdata symbol). If the compiler doesn't inline cpumask_andnot(), this may appear like a mismatch. The second warning makes less sense, but might be because efi_systab_phys and cpuhp_bringup_cpus_parallel.tmp_mask are laid out near each other, and the latter isn't a proper C symbol definition. In any case, it seems a reasonable solution to suggest more strongly to the compiler that these cpumask macros *must* be inlined, as 'inline' is just a recommendation. This change has been previously proposed in the past as: Subject: [PATCH 1/3] bitmap: switch from inline to __always_inline https://lore.kernel.org/all/20221027043810.350460-2-yury.norov@gmail.com/ But the change has been split up, to separately justify the cpumask changes (which drive my work) and the bitmap/const optimizations (that Yury separately proposed for other reasons). This ends up as somewhere between a "rebase" and "rewrite" -- I had to rewrite most of the patch. According to bloat-o-meter, vmlinux decreases minimally in size (-0.00% to -0.01%, depending on the version of GCC or Clang and .config in question) with this series of changes: gcc 13.2.0, x86_64_defconfig -3005 bytes, Before=21944501, After=21941496, chg -0.01% clang 16.0.6, x86_64_defconfig -105 bytes, Before=22571692, After=22571587, chg -0.00% gcc 9.5.0, x86_64_defconfig -1771 bytes, Before=21557598, After=21555827, chg -0.01% clang 18.0_pre516547 (ChromiumOS toolchain), x86_64_defconfig -191 bytes, Before=22615339, After=22615148, chg -0.00% clang 18.0_pre516547 (ChromiumOS toolchain), based on ChromiumOS config + gcov -979 bytes, Before=76294783, After=76293804, chg -0.00% [0] CONFIG_HOTPLUG_PARALLEL=y ('select'ed for x86 as of [1]) and CONFIG_GCOV_PROFILE_ALL. [1] commit 0c7ffa32dbd6 ("x86/smpboot/64: Implement arch_cpuhp_init_parallel_bringup() and enable it") Co-developed-by: Brian Norris Signed-off-by: Brian Norris Reviewed-by: Kees Cook Reviewed-by: Nathan Chancellor Signed-off-by: Yury Norov --- include/linux/cpumask.h | 212 +++++++++++++++++++++++++----------------------- 1 file changed, 112 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 801a7e524113..9e3f3c96607d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -30,7 +30,7 @@ extern unsigned int nr_cpu_ids; #endif -static inline void set_nr_cpu_ids(unsigned int nr) +static __always_inline void set_nr_cpu_ids(unsigned int nr) { #if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS) WARN_ON(nr != nr_cpu_ids); @@ -149,7 +149,7 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu) * * Return: >= nr_cpu_ids if no cpus set. */ -static inline unsigned int cpumask_first(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), small_cpumask_bits); } @@ -160,7 +160,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) * * Return: >= nr_cpu_ids if all cpus are set. */ -static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) { return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits); } @@ -172,7 +172,7 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) * * Return: >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ -static inline +static __always_inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); @@ -186,7 +186,7 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask * * Return: >= nr_cpu_ids if no cpus set in all. */ -static inline +static __always_inline unsigned int cpumask_first_and_and(const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) @@ -201,7 +201,7 @@ unsigned int cpumask_first_and_and(const struct cpumask *srcp1, * * Return: >= nr_cpumask_bits if no CPUs set. */ -static inline unsigned int cpumask_last(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), small_cpumask_bits); } @@ -213,7 +213,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp) * * Return: >= nr_cpu_ids if no further cpus set. */ -static inline +static __always_inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ @@ -229,7 +229,8 @@ unsigned int cpumask_next(int n, const struct cpumask *srcp) * * Return: >= nr_cpu_ids if no further cpus unset. */ -static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) +static __always_inline +unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) @@ -239,18 +240,21 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) #if NR_CPUS == 1 /* Uniprocessor: there is only one valid CPU */ -static inline unsigned int cpumask_local_spread(unsigned int i, int node) +static __always_inline +unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } -static inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, - const struct cpumask *src2p) +static __always_inline +unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, + const struct cpumask *src2p) { return cpumask_first_and(src1p, src2p); } -static inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) +static __always_inline +unsigned int cpumask_any_distribute(const struct cpumask *srcp) { return cpumask_first(srcp); } @@ -269,9 +273,9 @@ unsigned int cpumask_any_distribute(const struct cpumask *srcp); * * Return: >= nr_cpu_ids if no further cpus set in both. */ -static inline +static __always_inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, - const struct cpumask *src2p) + const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) @@ -291,7 +295,7 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p, for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) #if NR_CPUS == 1 -static inline +static __always_inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { cpumask_check(start); @@ -394,7 +398,7 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta * Often used to find any cpu but smp_processor_id() in a mask. * Return: >= nr_cpu_ids if no cpus set. */ -static inline +static __always_inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) { unsigned int i; @@ -414,7 +418,7 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) * * Returns >= nr_cpu_ids if no cpus set. */ -static inline +static __always_inline unsigned int cpumask_any_and_but(const struct cpumask *mask1, const struct cpumask *mask2, unsigned int cpu) @@ -436,7 +440,8 @@ unsigned int cpumask_any_and_but(const struct cpumask *mask1, * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ -static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) +static __always_inline +unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) { return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu)); } @@ -449,7 +454,7 @@ static inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *s * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ -static inline +static __always_inline unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -465,7 +470,7 @@ unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ -static inline +static __always_inline unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -508,12 +513,14 @@ unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ -static __always_inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +static __always_inline +void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } -static __always_inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) +static __always_inline +void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } @@ -557,7 +564,8 @@ static __always_inline void __cpumask_assign_cpu(int cpu, struct cpumask *dstp, * * Return: true if @cpu is set in @cpumask, else returns false */ -static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask) +static __always_inline +bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } @@ -571,7 +579,8 @@ static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpum * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ -static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) +static __always_inline +bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } @@ -585,7 +594,8 @@ static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cp * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ -static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) +static __always_inline +bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } @@ -594,7 +604,7 @@ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask * * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ -static inline void cpumask_setall(struct cpumask *dstp) +static __always_inline void cpumask_setall(struct cpumask *dstp) { if (small_const_nbits(small_cpumask_bits)) { cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits); @@ -607,7 +617,7 @@ static inline void cpumask_setall(struct cpumask *dstp) * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ -static inline void cpumask_clear(struct cpumask *dstp) +static __always_inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), large_cpumask_bits); } @@ -620,9 +630,9 @@ static inline void cpumask_clear(struct cpumask *dstp) * * Return: false if *@dstp is empty, else returns true */ -static inline bool cpumask_and(struct cpumask *dstp, - const struct cpumask *src1p, - const struct cpumask *src2p) +static __always_inline +bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); @@ -634,8 +644,9 @@ static inline bool cpumask_and(struct cpumask *dstp, * @src1p: the first input * @src2p: the second input */ -static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, - const struct cpumask *src2p) +static __always_inline +void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); @@ -647,9 +658,9 @@ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, * @src1p: the first input * @src2p: the second input */ -static inline void cpumask_xor(struct cpumask *dstp, - const struct cpumask *src1p, - const struct cpumask *src2p) +static __always_inline +void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); @@ -663,9 +674,9 @@ static inline void cpumask_xor(struct cpumask *dstp, * * Return: false if *@dstp is empty, else returns true */ -static inline bool cpumask_andnot(struct cpumask *dstp, - const struct cpumask *src1p, - const struct cpumask *src2p) +static __always_inline +bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); @@ -678,8 +689,8 @@ static inline bool cpumask_andnot(struct cpumask *dstp, * * Return: true if the cpumasks are equal, false if not */ -static inline bool cpumask_equal(const struct cpumask *src1p, - const struct cpumask *src2p) +static __always_inline +bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); @@ -694,9 +705,9 @@ static inline bool cpumask_equal(const struct cpumask *src1p, * Return: true if first cpumask ORed with second cpumask == third cpumask, * otherwise false */ -static inline bool cpumask_or_equal(const struct cpumask *src1p, - const struct cpumask *src2p, - const struct cpumask *src3p) +static __always_inline +bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, + const struct cpumask *src3p) { return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p), cpumask_bits(src3p), small_cpumask_bits); @@ -710,8 +721,8 @@ static inline bool cpumask_or_equal(const struct cpumask *src1p, * Return: true if first cpumask ANDed with second cpumask is non-empty, * otherwise false */ -static inline bool cpumask_intersects(const struct cpumask *src1p, - const struct cpumask *src2p) +static __always_inline +bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); @@ -724,8 +735,8 @@ static inline bool cpumask_intersects(const struct cpumask *src1p, * * Return: true if *@src1p is a subset of *@src2p, else returns false */ -static inline bool cpumask_subset(const struct cpumask *src1p, - const struct cpumask *src2p) +static __always_inline +bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); @@ -737,7 +748,7 @@ static inline bool cpumask_subset(const struct cpumask *src1p, * * Return: true if srcp is empty (has no bits set), else false */ -static inline bool cpumask_empty(const struct cpumask *srcp) +static __always_inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits); } @@ -748,7 +759,7 @@ static inline bool cpumask_empty(const struct cpumask *srcp) * * Return: true if srcp is full (has all bits set), else false */ -static inline bool cpumask_full(const struct cpumask *srcp) +static __always_inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } @@ -759,7 +770,7 @@ static inline bool cpumask_full(const struct cpumask *srcp) * * Return: count of bits set in *srcp */ -static inline unsigned int cpumask_weight(const struct cpumask *srcp) +static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits); } @@ -771,8 +782,8 @@ static inline unsigned int cpumask_weight(const struct cpumask *srcp) * * Return: count of bits set in both *srcp1 and *srcp2 */ -static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, - const struct cpumask *srcp2) +static __always_inline +unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } @@ -784,8 +795,9 @@ static inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, * * Return: count of bits set in both *srcp1 and *srcp2 */ -static inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1, - const struct cpumask *srcp2) +static __always_inline +unsigned int cpumask_weight_andnot(const struct cpumask *srcp1, + const struct cpumask *srcp2) { return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } @@ -796,8 +808,8 @@ static inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1, * @srcp: the input to shift * @n: the number of bits to shift by */ -static inline void cpumask_shift_right(struct cpumask *dstp, - const struct cpumask *srcp, int n) +static __always_inline +void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, small_cpumask_bits); @@ -809,8 +821,8 @@ static inline void cpumask_shift_right(struct cpumask *dstp, * @srcp: the input to shift * @n: the number of bits to shift by */ -static inline void cpumask_shift_left(struct cpumask *dstp, - const struct cpumask *srcp, int n) +static __always_inline +void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); @@ -821,8 +833,8 @@ static inline void cpumask_shift_left(struct cpumask *dstp, * @dstp: the result * @srcp: the input cpumask */ -static inline void cpumask_copy(struct cpumask *dstp, - const struct cpumask *srcp) +static __always_inline +void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits); } @@ -858,8 +870,8 @@ static inline void cpumask_copy(struct cpumask *dstp, * * Return: -errno, or 0 for success. */ -static inline int cpumask_parse_user(const char __user *buf, int len, - struct cpumask *dstp) +static __always_inline +int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } @@ -872,8 +884,8 @@ static inline int cpumask_parse_user(const char __user *buf, int len, * * Return: -errno, or 0 for success. */ -static inline int cpumask_parselist_user(const char __user *buf, int len, - struct cpumask *dstp) +static __always_inline +int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); @@ -886,7 +898,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, * * Return: -errno, or 0 for success. */ -static inline int cpumask_parse(const char *buf, struct cpumask *dstp) +static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } @@ -898,7 +910,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) * * Return: -errno, or 0 for success. */ -static inline int cpulist_parse(const char *buf, struct cpumask *dstp) +static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } @@ -908,7 +920,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) * * Return: size to allocate for a &struct cpumask in bytes */ -static inline unsigned int cpumask_size(void) +static __always_inline unsigned int cpumask_size(void) { return bitmap_size(large_cpumask_bits); } @@ -920,7 +932,7 @@ static inline unsigned int cpumask_size(void) bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); -static inline +static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); @@ -938,13 +950,13 @@ bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) * * Return: %true if allocation succeeded, %false if not */ -static inline +static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); } -static inline +static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var(mask, flags | __GFP_ZERO); @@ -954,7 +966,7 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); -static inline bool cpumask_available(cpumask_var_t mask) +static __always_inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } @@ -964,43 +976,43 @@ static inline bool cpumask_available(cpumask_var_t mask) #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly -static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } -static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, +static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } -static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) +static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } -static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, +static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } -static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) +static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } -static inline void free_cpumask_var(cpumask_var_t mask) +static __always_inline void free_cpumask_var(cpumask_var_t mask) { } -static inline void free_bootmem_cpumask_var(cpumask_var_t mask) +static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } -static inline bool cpumask_available(cpumask_var_t mask) +static __always_inline bool cpumask_available(cpumask_var_t mask) { return true; } @@ -1058,7 +1070,7 @@ void set_cpu_online(unsigned int cpu, bool online); ((struct cpumask *)(1 ? (bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) -static inline int __check_is_bitmap(const unsigned long *bitmap) +static __always_inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } @@ -1073,7 +1085,7 @@ static inline int __check_is_bitmap(const unsigned long *bitmap) extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; -static inline const struct cpumask *get_cpu_mask(unsigned int cpu) +static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; @@ -1100,32 +1112,32 @@ static __always_inline unsigned int num_online_cpus(void) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) -static inline bool cpu_online(unsigned int cpu) +static __always_inline bool cpu_online(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_online_mask); } -static inline bool cpu_enabled(unsigned int cpu) +static __always_inline bool cpu_enabled(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_enabled_mask); } -static inline bool cpu_possible(unsigned int cpu) +static __always_inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); } -static inline bool cpu_present(unsigned int cpu) +static __always_inline bool cpu_present(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_present_mask); } -static inline bool cpu_active(unsigned int cpu) +static __always_inline bool cpu_active(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_active_mask); } -static inline bool cpu_dying(unsigned int cpu) +static __always_inline bool cpu_dying(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_dying_mask); } @@ -1138,32 +1150,32 @@ static inline bool cpu_dying(unsigned int cpu) #define num_present_cpus() 1U #define num_active_cpus() 1U -static inline bool cpu_online(unsigned int cpu) +static __always_inline bool cpu_online(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_possible(unsigned int cpu) +static __always_inline bool cpu_possible(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_enabled(unsigned int cpu) +static __always_inline bool cpu_enabled(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_present(unsigned int cpu) +static __always_inline bool cpu_present(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_active(unsigned int cpu) +static __always_inline bool cpu_active(unsigned int cpu) { return cpu == 0; } -static inline bool cpu_dying(unsigned int cpu) +static __always_inline bool cpu_dying(unsigned int cpu) { return false; } @@ -1197,7 +1209,7 @@ static inline bool cpu_dying(unsigned int cpu) * Return: the length of the (null-terminated) @buf string, zero if * nothing is copied. */ -static inline ssize_t +static __always_inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), @@ -1220,9 +1232,9 @@ cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ -static inline ssize_t -cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, - loff_t off, size_t count) +static __always_inline +ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, + loff_t off, size_t count) { return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; @@ -1242,9 +1254,9 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ -static inline ssize_t -cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, - loff_t off, size_t count) +static __always_inline +ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, + loff_t off, size_t count) { return bitmap_print_list_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; -- cgit v1.2.3 From 54c9e0085bd1015e524d8f4d3c4e78a7bc77ffca Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 18 Jul 2024 17:50:40 -0700 Subject: nodemask: Switch from inline to __always_inline 'inline' keyword is only a recommendation for compiler. If it decides to not inline nodemask functions, the whole small_const_nbits() machinery doesn't work. This is how a standard GCC 11.3.0 does for my x86_64 build now. This patch replaces 'inline' directive with unconditional '__always_inline' to make sure that there's always a chance for compile-time optimization. It doesn't change size of kernel image, according to bloat-o-meter. [[ Brian: split out from: Subject: [PATCH 1/3] bitmap: switch from inline to __always_inline https://lore.kernel.org/all/20221027043810.350460-2-yury.norov@gmail.com/ But rewritten, as there were too many conflicts. ]] Co-developed-by: Brian Norris Signed-off-by: Brian Norris Reviewed-by: Kees Cook Reviewed-by: Nathan Chancellor Signed-off-by: Yury Norov --- include/linux/nodemask.h | 86 ++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index b61438313a73..9fd7a0ce9c1a 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -107,11 +107,11 @@ extern nodemask_t _unused_nodemask_arg_; */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) -static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) +static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } -static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) +static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } @@ -132,19 +132,19 @@ static __always_inline void __node_set(int node, volatile nodemask_t *dstp) } #define node_clear(node, dst) __node_clear((node), &(dst)) -static inline void __node_clear(int node, volatile nodemask_t *dstp) +static __always_inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) -static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) +static __always_inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) -static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) +static __always_inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } @@ -154,14 +154,14 @@ static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) -static inline bool __node_test_and_set(int node, nodemask_t *addr) +static __always_inline bool __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -169,7 +169,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -177,7 +177,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -185,7 +185,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) -static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, +static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); @@ -193,7 +193,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) -static inline void __nodes_complement(nodemask_t *dstp, +static __always_inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); @@ -201,7 +201,7 @@ static inline void __nodes_complement(nodemask_t *dstp, #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) -static inline bool __nodes_equal(const nodemask_t *src1p, +static __always_inline bool __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); @@ -209,7 +209,7 @@ static inline bool __nodes_equal(const nodemask_t *src1p, #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) -static inline bool __nodes_intersects(const nodemask_t *src1p, +static __always_inline bool __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); @@ -217,33 +217,33 @@ static inline bool __nodes_intersects(const nodemask_t *src1p, #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) -static inline bool __nodes_subset(const nodemask_t *src1p, +static __always_inline bool __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) -static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) +static __always_inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) -static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) +static __always_inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) -static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) +static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) -static inline void __nodes_shift_right(nodemask_t *dstp, +static __always_inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); @@ -251,7 +251,7 @@ static inline void __nodes_shift_right(nodemask_t *dstp, #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) -static inline void __nodes_shift_left(nodemask_t *dstp, +static __always_inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); @@ -261,13 +261,13 @@ static inline void __nodes_shift_left(nodemask_t *dstp, > MAX_NUMNODES, then the silly min_ts could be dropped. */ #define first_node(src) __first_node(&(src)) -static inline unsigned int __first_node(const nodemask_t *srcp) +static __always_inline unsigned int __first_node(const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) -static inline unsigned int __next_node(int n, const nodemask_t *srcp) +static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } @@ -277,7 +277,7 @@ static inline unsigned int __next_node(int n, const nodemask_t *srcp) * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) -static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) +static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp) { unsigned int ret = __next_node(node, srcp); @@ -286,7 +286,7 @@ static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) return ret; } -static inline void init_nodemask_of_node(nodemask_t *mask, int node) +static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); @@ -304,7 +304,7 @@ static inline void init_nodemask_of_node(nodemask_t *mask, int node) }) #define first_unset_node(mask) __first_unset_node(&(mask)) -static inline unsigned int __first_unset_node(const nodemask_t *maskp) +static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) { return min_t(unsigned int, MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); @@ -338,21 +338,21 @@ static inline unsigned int __first_unset_node(const nodemask_t *maskp) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) -static inline int __nodemask_parse_user(const char __user *buf, int len, +static __always_inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) -static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) +static __always_inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) -static inline int __node_remap(int oldbit, +static __always_inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); @@ -360,7 +360,7 @@ static inline int __node_remap(int oldbit, #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) -static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, +static __always_inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); @@ -368,7 +368,7 @@ static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) -static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, +static __always_inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); @@ -376,7 +376,7 @@ static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) -static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, +static __always_inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); @@ -418,22 +418,22 @@ enum node_states { extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 -static inline int node_state(int node, enum node_states state) +static __always_inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } -static inline void node_set_state(int node, enum node_states state) +static __always_inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } -static inline void node_clear_state(int node, enum node_states state) +static __always_inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } -static inline int num_node_state(enum node_states state) +static __always_inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } @@ -443,11 +443,11 @@ static inline int num_node_state(enum node_states state) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) -static inline unsigned int next_online_node(int nid) +static __always_inline unsigned int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } -static inline unsigned int next_memory_node(int nid) +static __always_inline unsigned int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } @@ -455,13 +455,13 @@ static inline unsigned int next_memory_node(int nid) extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; -static inline void node_set_online(int nid) +static __always_inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } -static inline void node_set_offline(int nid) +static __always_inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); @@ -469,20 +469,20 @@ static inline void node_set_offline(int nid) #else -static inline int node_state(int node, enum node_states state) +static __always_inline int node_state(int node, enum node_states state) { return node == 0; } -static inline void node_set_state(int node, enum node_states state) +static __always_inline void node_set_state(int node, enum node_states state) { } -static inline void node_clear_state(int node, enum node_states state) +static __always_inline void node_clear_state(int node, enum node_states state) { } -static inline int num_node_state(enum node_states state) +static __always_inline int num_node_state(enum node_states state) { return 1; } @@ -502,7 +502,7 @@ static inline int num_node_state(enum node_states state) #endif -static inline int node_random(const nodemask_t *maskp) +static __always_inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int w, bit; -- cgit v1.2.3 From b27404b2bbf951a11500525dea15681cc970226c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 19 Aug 2024 08:55:46 +0800 Subject: ALSA/ASoC/SoundWire: Intel: use single definition for SDW_INTEL_MAX_LINKS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The definitions are currently duplicated in intel-sdw-acpi.c and sof_sdw.c. Move the definition to the sdw_intel.h header, and change the prefix to make it Intel-specific. No functionality change in this patch. Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Signed-off-by: Bard Liao Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20240819005548.5867-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_intel.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index d537587b4499..87d82ea9a13a 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -447,4 +447,9 @@ extern const struct sdw_intel_hw_ops sdw_intel_lnl_hw_ops; #define SDW_INTEL_DEV_NUM_IDA_MIN 6 +/* + * Max number of links supported in hardware + */ +#define SDW_INTEL_MAX_LINKS 4 + #endif -- cgit v1.2.3 From d2234596be2192d9c1ae34f8c7534531191bc433 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 19 Aug 2024 08:55:47 +0800 Subject: soundwire: intel: add probe-time check on link id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In older platforms, the number of links was constant and hard-coded to 4. Newer platforms can have varying number of links, so we need to add a probe-time check to make sure the ACPI-reported information with _DSD properties is aligned with hardware capabilities reported in the SoundWire LCAP register. Acked-by: Vinod Koul Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Signed-off-by: Bard Liao Acked-by: Mark Brown Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20240819005548.5867-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_intel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 87d82ea9a13a..cb8e7396b4db 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -388,6 +388,7 @@ struct sdw_intel; /* struct intel_sdw_hw_ops - SoundWire ops for Intel platforms. * @debugfs_init: initialize all debugfs capabilities * @debugfs_exit: close and cleanup debugfs capabilities + * @get_link_count: fetch link count from hardware registers * @register_dai: read all PDI information and register DAIs * @check_clock_stop: throw error message if clock is not stopped. * @start_bus: normal start @@ -412,6 +413,8 @@ struct sdw_intel_hw_ops { void (*debugfs_init)(struct sdw_intel *sdw); void (*debugfs_exit)(struct sdw_intel *sdw); + int (*get_link_count)(struct sdw_intel *sdw); + int (*register_dai)(struct sdw_intel *sdw); void (*check_clock_stop)(struct sdw_intel *sdw); -- cgit v1.2.3 From 1f3662838a05f1ab6af89a417f6f252d91d0806b Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 19 Aug 2024 08:55:48 +0800 Subject: soundwire: intel: increase maximum number of links MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel platforms have enabled 4 links since the beginning, newer platforms now have 5 links. Update the definition accordingly. This patch will have no effect on older platforms where the number of links was hard-coded. A follow-up patch will add a dynamic check that the ACPI-reported information is aligned with hardware capabilities on newer platforms. Acked-by: Vinod Koul Signed-off-by: Pierre-Louis Bossart Reviewed-by: Péter Ujfalusi Signed-off-by: Bard Liao Acked-by: Mark Brown Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20240819005548.5867-4-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_intel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index cb8e7396b4db..37ae69365fe2 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -453,6 +453,6 @@ extern const struct sdw_intel_hw_ops sdw_intel_lnl_hw_ops; /* * Max number of links supported in hardware */ -#define SDW_INTEL_MAX_LINKS 4 +#define SDW_INTEL_MAX_LINKS 5 #endif -- cgit v1.2.3 From c393eaa85349941badf3ce5f087400dbaf3bbe02 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Fri, 26 Jul 2024 11:05:25 +0800 Subject: fs: don't flush in-flight wb switches for superblocks without cgroup writeback When deactivating any type of superblock, it had to wait for the in-flight wb switches to be completed. wb switches are executed in inode_switch_wbs_work_fn() which needs to acquire the wb_switch_rwsem and races against sync_inodes_sb(). If there are too much dirty data in the superblock, the waiting time may increase significantly. For superblocks without cgroup writeback such as tmpfs, they have nothing to do with the wb swithes, so the flushing can be avoided. Signed-off-by: Haifeng Xu Link: https://lore.kernel.org/r/20240726030525.180330-1-haifeng.xu@shopee.com Reviewed-by: Jan Kara Suggested-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/writeback.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1a54676d843a..56b85841ae4c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -217,7 +217,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); -void cgroup_writeback_umount(void); +void cgroup_writeback_umount(struct super_block *sb); bool cleanup_offline_cgwb(struct bdi_writeback *wb); /** @@ -324,7 +324,7 @@ static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, { } -static inline void cgroup_writeback_umount(void) +static inline void cgroup_writeback_umount(struct super_block *sb) { } -- cgit v1.2.3 From c01a5d89e5c8abe638107be2a4ea9e4c7fcdd7f6 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Fri, 2 Aug 2024 17:19:01 +0800 Subject: percpu-rwsem: remove the unused parameter 'read' In the function percpu_rwsem_release, the parameter `read` is unused, so remove it. Signed-off-by: Wang Long Link: https://lore.kernel.org/r/20240802091901.2546797-1-w@laoqinren.net Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- include/linux/percpu-rwsem.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index fb0426f349fc..c0286d479c9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1683,7 +1683,7 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level) #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ - percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) /** * __sb_write_started - check if sb freeze level is held diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 36b942b67b7d..c012df33a9f0 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -145,7 +145,7 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); #define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, - bool read, unsigned long ip) + unsigned long ip) { lock_release(&sem->dep_map, ip); } -- cgit v1.2.3 From 2865baf54077aa98fcdb478cefe6a42c417b9374 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Apr 2024 20:04:58 -0700 Subject: x86: support user address masking instead of non-speculative conditional The Spectre-v1 mitigations made "access_ok()" much more expensive, since it has to serialize execution with the test for a valid user address. All the normal user copy routines avoid this by just masking the user address with a data-dependent mask instead, but the fast "unsafe_user_read()" kind of patterms that were supposed to be a fast case got slowed down. This introduces a notion of using src = masked_user_access_begin(src); to do the user address sanity using a data-dependent mask instead of the more traditional conditional if (user_read_access_begin(src, len)) { model. This model only works for dense accesses that start at 'src' and on architectures that have a guard region that is guaranteed to fault in between the user space and the kernel space area. With this, the user access doesn't need to be manually checked, because a bad address is guaranteed to fault (by some architecture masking trick: on x86-64 this involves just turning an invalid user address into all ones, since we don't map the top of address space). This only converts a couple of examples for now. Example x86-64 code generation for loading two words from user space: stac mov %rax,%rcx sar $0x3f,%rcx or %rax,%rcx mov (%rcx),%r13 mov 0x8(%rcx),%r14 clac where all the error handling and -EFAULT is now purely handled out of line by the exception path. Of course, if the micro-architecture does badly at 'clac' and 'stac', the above is still pitifully slow. But at least we did as well as we could. Signed-off-by: Linus Torvalds --- include/linux/uaccess.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 3064314f4832..f18371f6cf36 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -32,6 +32,13 @@ }) #endif +#ifdef masked_user_access_begin + #define can_do_masked_user_access() 1 +#else + #define can_do_masked_user_access() 0 + #define masked_user_access_begin(src) NULL +#endif + /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and -- cgit v1.2.3 From 8dffaec34dd55473adcbc924a4c9b04aaa0d4278 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Aug 2024 12:18:23 -1000 Subject: workqueue: Fix htmldocs build warning Fix htmldocs build warning introduced by ec0a7d44b358 ("workqueue: Add interface for user-defined workqueue lockdep map"). Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell Cc: Matthew Brost --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 8ccbf510880b..04dff07a9e2b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -534,7 +534,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @lockdep_map: user-defined lockdep_map - * @args: args for @fmt + * @...: args for @fmt * * Same as alloc_ordered_workqueue but with the a user-define lockdep_map. * Useful for workqueues created with the same purpose and to avoid leaking a -- cgit v1.2.3 From 7f6287417baf57754f47687c6ea1a749a0686ab0 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 19 Aug 2024 18:28:05 +0200 Subject: bpf: Allow bpf_current_task_under_cgroup() with BPF_CGROUP_* The helper bpf_current_task_under_cgroup() currently is only allowed for tracing programs, allow its usage also in the BPF_CGROUP_* program types. Move the code from kernel/trace/bpf_trace.c to kernel/bpf/helpers.c, so it compiles also without CONFIG_BPF_EVENTS. This will be used in systemd-networkd to monitor the sysctl writes, and filter it's own writes from others: https://github.com/systemd/systemd/pull/32212 Signed-off-by: Matteo Croce Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240819162805.78235-3-technoboy85@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b9425e410bcb..f0192c173ed8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3206,6 +3206,7 @@ extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; +extern const struct bpf_func_proto bpf_current_task_under_cgroup_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; -- cgit v1.2.3 From 6ad8bc92a47702f0b5d0b96b680199910b89f688 Mon Sep 17 00:00:00 2001 From: Christian Hopps Date: Thu, 15 Aug 2024 13:21:14 -0400 Subject: net: add copy from skb_seq_state to buffer function Add an skb helper function to copy a range of bytes from within an existing skb_seq_state. Signed-off-by: Christian Hopps Signed-off-by: Steffen Klassert --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 29c3ea5b6e93..a871533b8568 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1433,6 +1433,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, unsigned int skb_seq_read(unsigned int consumed, const u8 **data, struct skb_seq_state *st); void skb_abort_seq_read(struct skb_seq_state *st); +int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len); unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, unsigned int to, struct ts_config *config); -- cgit v1.2.3 From b6cee6544d01b8ac01232d343b1b2b594d94d61c Mon Sep 17 00:00:00 2001 From: Dikshita Agarwal Date: Mon, 19 Aug 2024 15:59:09 +0530 Subject: PM: domains: add device managed version of dev_pm_domain_attach|detach_list() Add the devres-enabled version of dev_pm_domain_attach|detach_list. If client drivers use devm_pm_domain_attach_list() to attach the PM domains, devm_pm_domain_detach_list() will be invoked implicitly during remove phase. Signed-off-by: Dikshita Agarwal Link: https://lore.kernel.org/r/1724063350-11993-2-git-send-email-quic_dikshita@quicinc.com Signed-off-by: Ulf Hansson --- include/linux/pm_domain.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index b86bb52858ac..b637ec14025f 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -476,6 +476,9 @@ struct device *dev_pm_domain_attach_by_name(struct device *dev, int dev_pm_domain_attach_list(struct device *dev, const struct dev_pm_domain_attach_data *data, struct dev_pm_domain_list **list); +int devm_pm_domain_attach_list(struct device *dev, + const struct dev_pm_domain_attach_data *data, + struct dev_pm_domain_list **list); void dev_pm_domain_detach(struct device *dev, bool power_off); void dev_pm_domain_detach_list(struct dev_pm_domain_list *list); int dev_pm_domain_start(struct device *dev); @@ -502,6 +505,14 @@ static inline int dev_pm_domain_attach_list(struct device *dev, { return 0; } + +static inline int devm_pm_domain_attach_list(struct device *dev, + const struct dev_pm_domain_attach_data *data, + struct dev_pm_domain_list **list) +{ + return 0; +} + static inline void dev_pm_domain_detach(struct device *dev, bool power_off) {} static inline void dev_pm_domain_detach_list(struct dev_pm_domain_list *list) {} static inline int dev_pm_domain_start(struct device *dev) -- cgit v1.2.3 From acb0184fe9bca3bb7103dadc76ba9d38c969ca86 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:53 +0100 Subject: coresight: Move struct coresight_trace_id_map to common header The trace ID maps will need to be created and stored by the core and Perf code so move the definition up to the common header. Reviewed-by: Anshuman Khandual Reviewed-by: Mike Leach Signed-off-by: James Clark Tested-by: Leo Yan Tested-by: Ganapatrao Kulkarni Signed-off-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20240722101202.26915-12-james.clark@linaro.org --- include/linux/coresight.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index f09ace92176e..c16c61a8411d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -218,6 +218,24 @@ struct coresight_sysfs_link { const char *target_name; }; +/* architecturally we have 128 IDs some of which are reserved */ +#define CORESIGHT_TRACE_IDS_MAX 128 + +/** + * Trace ID map. + * + * @used_ids: Bitmap to register available (bit = 0) and in use (bit = 1) IDs. + * Initialised so that the reserved IDs are permanently marked as + * in use. + * @pend_rel_ids: CPU IDs that have been released by the trace source but not + * yet marked as available, to allow re-allocation to the same + * CPU during a perf session. + */ +struct coresight_trace_id_map { + DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX); + DECLARE_BITMAP(pend_rel_ids, CORESIGHT_TRACE_IDS_MAX); +}; + /** * struct coresight_device - representation of a device as used by the framework * @pdata: Platform data with device connections associated to this device. -- cgit v1.2.3 From d53c8253c7822fbc524adcd950e7c6de6a229c99 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:55 +0100 Subject: coresight: Make CPU id map a property of a trace ID map The global CPU ID mappings won't work for per-sink ID maps so move it to the ID map struct. coresight_trace_id_release_all_pending() is hard coded to operate on the default map, but once Perf sessions use their own maps the pending release mechanism will be deleted. So it doesn't need to be extended to accept a trace ID map argument at this point. Signed-off-by: James Clark Reviewed-by: Mike Leach Tested-by: Leo Yan Signed-off-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20240722101202.26915-14-james.clark@linaro.org --- include/linux/coresight.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index c16c61a8411d..7d62b88bfb5c 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -234,6 +234,7 @@ struct coresight_sysfs_link { struct coresight_trace_id_map { DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX); DECLARE_BITMAP(pend_rel_ids, CORESIGHT_TRACE_IDS_MAX); + atomic_t __percpu *cpu_map; }; /** -- cgit v1.2.3 From 5ad628a7617607baac53745f8025638b967470a2 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:56 +0100 Subject: coresight: Use per-sink trace ID maps for Perf sessions This will allow sessions with more than CORESIGHT_TRACE_IDS_MAX ETMs as long as there are fewer than that many ETMs connected to each sink. Each sink owns its own trace ID map, and any Perf session connecting to that sink will allocate from it, even if the sink is currently in use by other users. This is similar to the existing behavior where the dynamic trace IDs are constant as long as there is any concurrent Perf session active. It's not completely optimal because slightly more IDs will be used than necessary, but the optimal solution involves tracking the PIDs of each session and allocating ID maps based on the session owner. This is difficult to do with the combination of per-thread and per-cpu modes and some scheduling issues. The complexity of this isn't likely to worth it because even with multiple users they'd just see a difference in the ordering of ID allocations rather than hitting any limits (unless the hardware does have too many ETMs connected to one sink). Signed-off-by: James Clark Reviewed-by: Mike Leach Signed-off-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20240722101202.26915-15-james.clark@linaro.org --- include/linux/coresight.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 7d62b88bfb5c..9c3067e2e38b 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -290,6 +290,7 @@ struct coresight_device { bool sysfs_sink_activated; struct dev_ext_attribute *ea; struct coresight_device *def_sink; + struct coresight_trace_id_map perf_sink_id_map; /* sysfs links between components */ int nr_links; bool has_conns_grp; @@ -384,7 +385,7 @@ struct coresight_ops_link { struct coresight_ops_source { int (*cpu_id)(struct coresight_device *csdev); int (*enable)(struct coresight_device *csdev, struct perf_event *event, - enum cs_mode mode); + enum cs_mode mode, struct coresight_trace_id_map *id_map); void (*disable)(struct coresight_device *csdev, struct perf_event *event); }; -- cgit v1.2.3 From de0029fdde86092c75472c92e56a962f4edee0f6 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:57 +0100 Subject: coresight: Remove pending trace ID release mechanism Pending the release of IDs was a way of managing concurrent sysfs and Perf sessions in a single global ID map. Perf may have finished while sysfs hadn't, and Perf shouldn't release the IDs in use by sysfs and vice versa. Now that Perf uses its own exclusive ID maps, pending release doesn't result in any different behavior than just releasing all IDs when the last Perf session finishes. As part of the per-sink trace ID change, we would have still had to make the pending mechanism work on a per-sink basis, due to the overlapping ID allocations, so instead of making that more complicated, just remove it. Signed-off-by: James Clark Reviewed-by: Mike Leach Signed-off-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20240722101202.26915-16-james.clark@linaro.org --- include/linux/coresight.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 9c3067e2e38b..197949fd2c35 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -227,14 +227,12 @@ struct coresight_sysfs_link { * @used_ids: Bitmap to register available (bit = 0) and in use (bit = 1) IDs. * Initialised so that the reserved IDs are permanently marked as * in use. - * @pend_rel_ids: CPU IDs that have been released by the trace source but not - * yet marked as available, to allow re-allocation to the same - * CPU during a perf session. + * @perf_cs_etm_session_active: Number of Perf sessions using this ID map. */ struct coresight_trace_id_map { DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX); - DECLARE_BITMAP(pend_rel_ids, CORESIGHT_TRACE_IDS_MAX); atomic_t __percpu *cpu_map; + atomic_t perf_cs_etm_session_active; }; /** -- cgit v1.2.3 From 487eec8da80aef16229d30429f1b26090b1bf0eb Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:58 +0100 Subject: coresight: Emit sink ID in the HW_ID packets For Perf to be able to decode when per-sink trace IDs are used, emit the sink that's being written to for each ETM. Perf currently errors out if it sees a newer packet version so instead of bumping it, add a new minor version field. This can be used to signify new versions that have backwards compatible fields. Considering this change is only for high core count machines, it doesn't make sense to make a breaking change for everyone. Signed-off-by: James Clark Tested-by: Leo Yan Reviewed-by: Mike Leach Signed-off-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20240722101202.26915-17-james.clark@linaro.org --- include/linux/coresight-pmu.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coresight-pmu.h b/include/linux/coresight-pmu.h index 51ac441a37c3..89b0ac0014b0 100644 --- a/include/linux/coresight-pmu.h +++ b/include/linux/coresight-pmu.h @@ -49,12 +49,21 @@ * Interpretation of the PERF_RECORD_AUX_OUTPUT_HW_ID payload. * Used to associate a CPU with the CoreSight Trace ID. * [07:00] - Trace ID - uses 8 bits to make value easy to read in file. - * [59:08] - Unused (SBZ) - * [63:60] - Version + * [39:08] - Sink ID - as reported in /sys/bus/event_source/devices/cs_etm/sinks/ + * Added in minor version 1. + * [55:40] - Unused (SBZ) + * [59:56] - Minor Version - previously existing fields are compatible with + * all minor versions. + * [63:60] - Major Version - previously existing fields mean different things + * in new major versions. */ #define CS_AUX_HW_ID_TRACE_ID_MASK GENMASK_ULL(7, 0) -#define CS_AUX_HW_ID_VERSION_MASK GENMASK_ULL(63, 60) +#define CS_AUX_HW_ID_SINK_ID_MASK GENMASK_ULL(39, 8) -#define CS_AUX_HW_ID_CURR_VERSION 0 +#define CS_AUX_HW_ID_MINOR_VERSION_MASK GENMASK_ULL(59, 56) +#define CS_AUX_HW_ID_MAJOR_VERSION_MASK GENMASK_ULL(63, 60) + +#define CS_AUX_HW_ID_MAJOR_VERSION 0 +#define CS_AUX_HW_ID_MINOR_VERSION 1 #endif -- cgit v1.2.3 From 988d40a4d4e7d671305bea501562a5d1a1d479fa Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 22 Jul 2024 11:11:59 +0100 Subject: coresight: Make trace ID map spinlock local to the map Reduce contention on the lock by replacing the global lock with one for each map. Signed-off-by: James Clark Reviewed-by: Mike Leach Signed-off-by: James Clark Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20240722101202.26915-18-james.clark@linaro.org --- include/linux/coresight.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 197949fd2c35..c13342594278 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -233,6 +233,7 @@ struct coresight_trace_id_map { DECLARE_BITMAP(used_ids, CORESIGHT_TRACE_IDS_MAX); atomic_t __percpu *cpu_map; atomic_t perf_cs_etm_session_active; + spinlock_t lock; }; /** -- cgit v1.2.3 From 7c432a18ad216b4f7b08e93287586d60e12a3b7b Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Tue, 20 Aug 2024 15:27:55 +0100 Subject: firmware: arm_ffa: Update the FF-A command list with v1.2 additions Arm Firmware Framework for A-profile(FFA) v1.2 introduces register based discovery mechanism and direct messaging extensions that enables to target specific UUID within a partition. Let us add all the newly supported FF-A function IDs in the spec. Also update to the error values and associated handling. Message-Id: <20240820-ffa_v1-2-v2-2-18c0c5f3c65e@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index 89683f31ae12..b34f0c0dc2c5 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -73,6 +73,11 @@ #define FFA_FN64_MEM_PERM_GET FFA_SMC_64(0x88) #define FFA_MEM_PERM_SET FFA_SMC_32(0x89) #define FFA_FN64_MEM_PERM_SET FFA_SMC_64(0x89) +#define FFA_CONSOLE_LOG FFA_SMC_32(0x8A) +#define FFA_PARTITION_INFO_GET_REGS FFA_SMC_64(0x8B) +#define FFA_EL3_INTR_HANDLE FFA_SMC_32(0x8C) +#define FFA_MSG_SEND_DIRECT_REQ2 FFA_SMC_64(0x8D) +#define FFA_MSG_SEND_DIRECT_RESP2 FFA_SMC_64(0x8E) /* * For some calls it is necessary to use SMC64 to pass or return 64-bit values. -- cgit v1.2.3 From aaef3bc98129c86078b336f16788dd733b0728a4 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Tue, 20 Aug 2024 15:27:58 +0100 Subject: firmware: arm_ffa: Add support for FFA_MSG_SEND_DIRECT_{REQ,RESP}2 FFA_MSG_SEND_DIRECT_{REQ,RESP} supported only x3-x7 to pass implementation defined values as part of the message. This may not be sufficient sometimes and also it would be good to use all the registers supported by SMCCC v1.2 (x0-x17) for such register based communication. Also another limitation with the FFA_MSG_SEND_DIRECT_{REQ,RESP} is the ability to target a specific service within the partition based on it's UUID. In order to address both of the above limitation, FF-A v1.2 introduced FFA_MSG_SEND_DIRECT_{REQ,RESP}2 which has the ability to target the message to a specific service based on its UUID within a partition as well as utilise all the available registers(x4-x17 specifically) for the communication. This change adds support for FFA_MSG_SEND_DIRECT_REQ2 and FFA_MSG_SEND_DIRECT_RESP2. Message-Id: <20240820-ffa_v1-2-v2-5-18c0c5f3c65e@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index b34f0c0dc2c5..a28e2a6a13d0 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -270,6 +270,11 @@ struct ffa_indirect_msg_hdr { u32 size; }; +/* For use with FFA_MSG_SEND_DIRECT_{REQ,RESP}2 which pass data via registers */ +struct ffa_send_direct_data2 { + unsigned long data[14]; /* x4-x17 */ +}; + struct ffa_mem_region_addr_range { /* The base IPA of the constituent memory region, aligned to 4 kiB */ u64 address; @@ -431,6 +436,8 @@ struct ffa_msg_ops { int (*sync_send_receive)(struct ffa_device *dev, struct ffa_send_direct_data *data); int (*indirect_send)(struct ffa_device *dev, void *buf, size_t sz); + int (*sync_send_receive2)(struct ffa_device *dev, const uuid_t *uuid, + struct ffa_send_direct_data2 *data); }; struct ffa_mem_ops { -- cgit v1.2.3 From e68ac2b488495fa4d127d6105ce633849859957a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 15 Aug 2024 11:15:40 -0600 Subject: softirq: Remove unused 'action' parameter from action callback When soft interrupt actions are called, they are passed a pointer to the struct softirq action which contains the action's function pointer. This pointer isn't useful, as the action callback already knows what function it is. And since each callback handles a specific soft interrupt, the callback also knows which soft interrupt number is running. No soft interrupt action callback actually uses this parameter, so remove it from the function pointer signature. This clarifies that soft interrupt actions are global routines and makes it slightly cheaper to call them. Signed-off-by: Caleb Sander Mateos Signed-off-by: Thomas Gleixner Reviewed-by: Jens Axboe Link: https://lore.kernel.org/all/20240815171549.3260003-1-csander@purestorage.com --- include/linux/interrupt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 3f30c88e0b4c..694de61e0b38 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -594,7 +594,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; struct softirq_action { - void (*action)(struct softirq_action *); + void (*action)(void); }; asmlinkage void do_softirq(void); @@ -609,7 +609,7 @@ static inline void do_softirq_post_smp_call_flush(unsigned int unused) } #endif -extern void open_softirq(int nr, void (*action)(struct softirq_action *)); +extern void open_softirq(int nr, void (*action)(void)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); -- cgit v1.2.3 From 2fea0c26b82f304f43b3905e56d954cf98a6d0e9 Mon Sep 17 00:00:00 2001 From: Fan Wu Date: Fri, 2 Aug 2024 23:08:19 -0700 Subject: initramfs,lsm: add a security hook to do_populate_rootfs() This patch introduces a new hook to notify security system that the content of initramfs has been unpacked into the rootfs. Upon receiving this notification, the security system can activate a policy to allow only files that originated from the initramfs to execute or load into kernel during the early stages of booting. This approach is crucial for minimizing the attack surface by ensuring that only trusted files from the initramfs are operational in the critical boot phase. Signed-off-by: Fan Wu [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 520730fe2d94..22a14fc794fe 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -449,3 +449,5 @@ LSM_HOOK(int, 0, uring_override_creds, const struct cred *new) LSM_HOOK(int, 0, uring_sqpoll, void) LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) #endif /* CONFIG_IO_URING */ + +LSM_HOOK(void, LSM_RET_VOID, initramfs_populated, void) diff --git a/include/linux/security.h b/include/linux/security.h index 62233fec8ead..3298855abdbc 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2256,4 +2256,12 @@ static inline int security_uring_cmd(struct io_uring_cmd *ioucmd) #endif /* CONFIG_SECURITY */ #endif /* CONFIG_IO_URING */ +#ifdef CONFIG_SECURITY +extern void security_initramfs_populated(void); +#else +static inline void security_initramfs_populated(void) +{ +} +#endif /* CONFIG_SECURITY */ + #endif /* ! __LINUX_SECURITY_H */ -- cgit v1.2.3 From 7138679ff2a2b1674f16618558d6cabea6ab2c53 Mon Sep 17 00:00:00 2001 From: Fan Wu Date: Fri, 2 Aug 2024 23:08:21 -0700 Subject: lsm: add new securityfs delete function When deleting a directory in the security file system, the existing securityfs_remove requires the directory to be empty, otherwise it will do nothing. This leads to a potential risk that the security file system might be in an unclean state when the intended deletion did not happen. This commit introduces a new function securityfs_recursive_remove to recursively delete a directory without leaving an unclean state. Co-developed-by: Christian Brauner (Microsoft) Signed-off-by: Fan Wu [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 3298855abdbc..f6d2bc69cfa6 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2090,6 +2090,7 @@ struct dentry *securityfs_create_symlink(const char *name, const char *target, const struct inode_operations *iops); extern void securityfs_remove(struct dentry *dentry); +extern void securityfs_recursive_remove(struct dentry *dentry); #else /* CONFIG_SECURITYFS */ -- cgit v1.2.3 From b55d26bd1891423a3cdccf816b386aec8bbefc87 Mon Sep 17 00:00:00 2001 From: Deven Bowers Date: Fri, 2 Aug 2024 23:08:25 -0700 Subject: block,lsm: add LSM blob and new LSM hooks for block devices This patch introduces a new LSM blob to the block_device structure, enabling the security subsystem to store security-sensitive data related to block devices. Currently, for a device mapper's mapped device containing a dm-verity target, critical security information such as the roothash and its signing state are not readily accessible. Specifically, while the dm-verity volume creation process passes the dm-verity roothash and its signature from userspace to the kernel, the roothash is stored privately within the dm-verity target, and its signature is discarded post-verification. This makes it extremely hard for the security subsystem to utilize these data. With the addition of the LSM blob to the block_device structure, the security subsystem can now retain and manage important security metadata such as the roothash and the signing state of a dm-verity by storing them inside the blob. Access decisions can then be based on these stored data. The implementation follows the same approach used for security blobs in other structures like struct file, struct inode, and struct superblock. The initialization of the security blob occurs after the creation of the struct block_device, performed by the security subsystem. Similarly, the security blob is freed by the security subsystem before the struct block_device is deallocated or freed. This patch also introduces a new hook security_bdev_setintegrity() to save block device's integrity data to the new LSM blob. For example, for dm-verity, it can use this hook to expose its roothash and signing state to LSMs, then LSMs can save these data into the LSM blob. Please note that the new hook should be invoked every time the security information is updated to keep these data current. For example, in dm-verity, if the mapping table is reloaded and configured to use a different dm-verity target with a new roothash and signing information, the previously stored data in the LSM blob will become obsolete. It is crucial to re-invoke the hook to refresh these data and ensure they are up to date. This necessity arises from the design of device-mapper, where a device-mapper device is first created, and then targets are subsequently loaded into it. These targets can be modified multiple times during the device's lifetime. Therefore, while the LSM blob is allocated during the creation of the block device, its actual contents are not initialized at this stage and can change substantially over time. This includes alterations from data that the LSM 'trusts' to those it does not, making it essential to handle these changes correctly. Failure to address this dynamic aspect could potentially allow for bypassing LSM checks. Signed-off-by: Deven Bowers Signed-off-by: Fan Wu [PM: merge fuzz, subject line tweaks] Signed-off-by: Paul Moore --- include/linux/blk_types.h | 3 +++ include/linux/lsm_hook_defs.h | 5 +++++ include/linux/lsm_hooks.h | 1 + include/linux/security.h | 26 ++++++++++++++++++++++++++ 4 files changed, 35 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 36ed96133217..413ebdff974b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -71,6 +71,9 @@ struct block_device { struct partition_meta_info *bd_meta_info; int bd_writers; +#ifdef CONFIG_SECURITY + void *bd_security; +#endif /* * keep this out-of-line as it's both big and not needed in the fast * path diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 22a14fc794fe..860821f3bf6f 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -451,3 +451,8 @@ LSM_HOOK(int, 0, uring_cmd, struct io_uring_cmd *ioucmd) #endif /* CONFIG_IO_URING */ LSM_HOOK(void, LSM_RET_VOID, initramfs_populated, void) + +LSM_HOOK(int, 0, bdev_alloc_security, struct block_device *bdev) +LSM_HOOK(void, LSM_RET_VOID, bdev_free_security, struct block_device *bdev) +LSM_HOOK(int, 0, bdev_setintegrity, struct block_device *bdev, + enum lsm_integrity_type type, const void *value, size_t size) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 11ea0063228f..4687985b9175 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -83,6 +83,7 @@ struct lsm_blob_sizes { int lbs_task; int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ int lbs_tun_dev; + int lbs_bdev; }; /* diff --git a/include/linux/security.h b/include/linux/security.h index f6d2bc69cfa6..d7cab2d5002f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -83,6 +83,10 @@ enum lsm_event { LSM_POLICY_CHANGE, }; +enum lsm_integrity_type { + __LSM_INT_MAX +}; + /* * These are reasons that can be passed to the security_locked_down() * LSM hook. Lockdown reasons that protect kernel integrity (ie, the @@ -509,6 +513,11 @@ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen); int security_locked_down(enum lockdown_reason what); int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len, void *val, size_t val_len, u64 id, u64 flags); +int security_bdev_alloc(struct block_device *bdev); +void security_bdev_free(struct block_device *bdev); +int security_bdev_setintegrity(struct block_device *bdev, + enum lsm_integrity_type type, const void *value, + size_t size); #else /* CONFIG_SECURITY */ static inline int call_blocking_lsm_notifier(enum lsm_event event, void *data) @@ -1483,6 +1492,23 @@ static inline int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, { return -EOPNOTSUPP; } + +static inline int security_bdev_alloc(struct block_device *bdev) +{ + return 0; +} + +static inline void security_bdev_free(struct block_device *bdev) +{ +} + +static inline int security_bdev_setintegrity(struct block_device *bdev, + enum lsm_integrity_type type, + const void *value, size_t size) +{ + return 0; +} + #endif /* CONFIG_SECURITY */ #if defined(CONFIG_SECURITY) && defined(CONFIG_WATCH_QUEUE) -- cgit v1.2.3 From a6af7bc3d72ff52c5526a392144347fcb3094149 Mon Sep 17 00:00:00 2001 From: Deven Bowers Date: Fri, 2 Aug 2024 23:08:26 -0700 Subject: dm-verity: expose root hash digest and signature data to LSMs dm-verity provides a strong guarantee of a block device's integrity. As a generic way to check the integrity of a block device, it provides those integrity guarantees to its higher layers, including the filesystem level. However, critical security metadata like the dm-verity roothash and its signing information are not easily accessible to the LSMs. To address this limitation, this patch introduces a mechanism to store and manage these essential security details within a newly added LSM blob in the block_device structure. This addition allows LSMs to make access control decisions on the integrity data stored within the block_device, enabling more flexible security policies. For instance, LSMs can now revoke access to dm-verity devices based on their roothashes, ensuring that only authorized and verified content is accessible. Additionally, LSMs can enforce policies to only allow files from dm-verity devices that have a valid digital signature to execute, effectively blocking any unsigned files from execution, thus enhancing security against unauthorized modifications. The patch includes new hook calls, `security_bdev_setintegrity()`, in dm-verity to expose the dm-verity roothash and the roothash signature to LSMs via preresume() callback. By using the preresume() callback, it ensures that the security metadata is consistently in sync with the metadata of the dm-verity target in the current active mapping table. The hook calls are depended on CONFIG_SECURITY. Signed-off-by: Deven Bowers Signed-off-by: Fan Wu Reviewed-by: Mikulas Patocka [PM: moved sig_size field as discussed] Signed-off-by: Paul Moore --- include/linux/security.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index d7cab2d5002f..e383022467db 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -83,8 +83,15 @@ enum lsm_event { LSM_POLICY_CHANGE, }; +struct dm_verity_digest { + const char *alg; + const u8 *digest; + size_t digest_len; +}; + enum lsm_integrity_type { - __LSM_INT_MAX + LSM_INT_DMVERITY_SIG_VALID, + LSM_INT_DMVERITY_ROOTHASH, }; /* -- cgit v1.2.3 From fb55e177d5936fb80fb2586036d195c57e7f6892 Mon Sep 17 00:00:00 2001 From: Fan Wu Date: Fri, 2 Aug 2024 23:08:28 -0700 Subject: lsm: add security_inode_setintegrity() hook This patch introduces a new hook to save inode's integrity data. For example, for fsverity enabled files, LSMs can use this hook to save the existence of verified fsverity builtin signature into the inode's security blob, and LSMs can make access decisions based on this data. Signed-off-by: Fan Wu [PM: subject line tweak, removed changelog] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 ++ include/linux/security.h | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 860821f3bf6f..1d59513bf230 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -180,6 +180,8 @@ LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid) LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new) LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, struct dentry *src, const char *name) +LSM_HOOK(int, 0, inode_setintegrity, const struct inode *inode, + enum lsm_integrity_type type, const void *value, size_t size) LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir, struct kernfs_node *kn) LSM_HOOK(int, 0, file_permission, struct file *file, int mask) diff --git a/include/linux/security.h b/include/linux/security.h index e383022467db..97b7c57e6560 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -410,6 +410,9 @@ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer void security_inode_getsecid(struct inode *inode, u32 *secid); int security_inode_copy_up(struct dentry *src, struct cred **new); int security_inode_copy_up_xattr(struct dentry *src, const char *name); +int security_inode_setintegrity(const struct inode *inode, + enum lsm_integrity_type type, const void *value, + size_t size); int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn); int security_file_permission(struct file *file, int mask); @@ -1026,6 +1029,13 @@ static inline int security_inode_copy_up(struct dentry *src, struct cred **new) return 0; } +static inline int security_inode_setintegrity(const struct inode *inode, + enum lsm_integrity_type type, + const void *value, size_t size) +{ + return 0; +} + static inline int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { -- cgit v1.2.3 From 7c373e4f1445263728d3eeab7e33e932c8f4a288 Mon Sep 17 00:00:00 2001 From: Fan Wu Date: Fri, 2 Aug 2024 23:08:29 -0700 Subject: fsverity: expose verified fsverity built-in signatures to LSMs This patch enhances fsverity's capabilities to support both integrity and authenticity protection by introducing the exposure of built-in signatures through a new LSM hook. This functionality allows LSMs, e.g. IPE, to enforce policies based on the authenticity and integrity of files, specifically focusing on built-in fsverity signatures. It enables a policy enforcement layer within LSMs for fsverity, offering granular control over the usage of authenticity claims. For instance, a policy could be established to only permit the execution of all files with verified built-in fsverity signatures. The introduction of a security_inode_setintegrity() hook call within fsverity's workflow ensures that the verified built-in signature of a file is exposed to LSMs. This enables LSMs to recognize and label fsverity files that contain a verified built-in fsverity signature. This hook is invoked subsequent to the fsverity_verify_signature() process, guaranteeing the signature's verification against fsverity's keyring. This mechanism is crucial for maintaining system security, as it operates in kernel space, effectively thwarting attempts by malicious binaries to bypass user space stack interactions. The second to last commit in this patch set will add a link to the IPE documentation in fsverity.rst. Signed-off-by: Deven Bowers Signed-off-by: Fan Wu Acked-by: Eric Biggers Signed-off-by: Paul Moore --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 97b7c57e6560..c37c32ebbdcd 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -92,6 +92,7 @@ struct dm_verity_digest { enum lsm_integrity_type { LSM_INT_DMVERITY_SIG_VALID, LSM_INT_DMVERITY_ROOTHASH, + LSM_INT_FSVERITY_BUILTINSIG_VALID, }; /* -- cgit v1.2.3 From 7cff549daa67c7549c5a8688674cea2df8c2ec80 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 16 Aug 2024 17:43:05 +0200 Subject: kernel: Add helper macros for loop unrolling This helps in easily initializing blocks of code (e.g. static calls and keys). UNROLL(N, MACRO, __VA_ARGS__) calls MACRO N times with the first argument as the index of the iteration. This allows string pasting to create unique tokens for variable names, function calls etc. As an example: #include #define MACRO(N, a, b) \ int add_##N(int a, int b) \ { \ return a + b + N; \ } UNROLL(2, MACRO, x, y) expands to: int add_0(int x, int y) { return x + y + 0; } int add_1(int x, int y) { return x + y + 1; } Tested-by: Guenter Roeck Reviewed-by: Kees Cook Reviewed-by: Casey Schaufler Reviewed-by: John Johansen Acked-by: Jiri Olsa Acked-by: Song Liu Acked-by: Andrii Nakryiko Acked-by: Casey Schaufler Nacked-by: Tetsuo Handa Signed-off-by: KP Singh Signed-off-by: Paul Moore --- include/linux/unroll.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/linux/unroll.h (limited to 'include/linux') diff --git a/include/linux/unroll.h b/include/linux/unroll.h new file mode 100644 index 000000000000..d42fd6366373 --- /dev/null +++ b/include/linux/unroll.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2023 Google LLC. + */ + +#ifndef __UNROLL_H +#define __UNROLL_H + +#include + +#define UNROLL(N, MACRO, args...) CONCATENATE(__UNROLL_, N)(MACRO, args) + +#define __UNROLL_0(MACRO, args...) +#define __UNROLL_1(MACRO, args...) __UNROLL_0(MACRO, args) MACRO(0, args) +#define __UNROLL_2(MACRO, args...) __UNROLL_1(MACRO, args) MACRO(1, args) +#define __UNROLL_3(MACRO, args...) __UNROLL_2(MACRO, args) MACRO(2, args) +#define __UNROLL_4(MACRO, args...) __UNROLL_3(MACRO, args) MACRO(3, args) +#define __UNROLL_5(MACRO, args...) __UNROLL_4(MACRO, args) MACRO(4, args) +#define __UNROLL_6(MACRO, args...) __UNROLL_5(MACRO, args) MACRO(5, args) +#define __UNROLL_7(MACRO, args...) __UNROLL_6(MACRO, args) MACRO(6, args) +#define __UNROLL_8(MACRO, args...) __UNROLL_7(MACRO, args) MACRO(7, args) +#define __UNROLL_9(MACRO, args...) __UNROLL_8(MACRO, args) MACRO(8, args) +#define __UNROLL_10(MACRO, args...) __UNROLL_9(MACRO, args) MACRO(9, args) +#define __UNROLL_11(MACRO, args...) __UNROLL_10(MACRO, args) MACRO(10, args) +#define __UNROLL_12(MACRO, args...) __UNROLL_11(MACRO, args) MACRO(11, args) +#define __UNROLL_13(MACRO, args...) __UNROLL_12(MACRO, args) MACRO(12, args) +#define __UNROLL_14(MACRO, args...) __UNROLL_13(MACRO, args) MACRO(13, args) +#define __UNROLL_15(MACRO, args...) __UNROLL_14(MACRO, args) MACRO(14, args) +#define __UNROLL_16(MACRO, args...) __UNROLL_15(MACRO, args) MACRO(15, args) +#define __UNROLL_17(MACRO, args...) __UNROLL_16(MACRO, args) MACRO(16, args) +#define __UNROLL_18(MACRO, args...) __UNROLL_17(MACRO, args) MACRO(17, args) +#define __UNROLL_19(MACRO, args...) __UNROLL_18(MACRO, args) MACRO(18, args) +#define __UNROLL_20(MACRO, args...) __UNROLL_19(MACRO, args) MACRO(19, args) + +#endif /* __UNROLL_H */ -- cgit v1.2.3 From 9b59a85a84dc37ca4f2c54df5e06aff4c1eae5d3 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Tue, 20 Aug 2024 12:38:08 -0700 Subject: workqueue: Don't call va_start / va_end twice Calling va_start / va_end multiple times is undefined and causes problems with certain compiler / platforms. Change alloc_ordered_workqueue_lockdep_map to a macro and updated __alloc_workqueue to take a va_list argument. Cc: Sergey Senozhatsky Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Matthew Brost Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 04dff07a9e2b..f3cc15940b4d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -543,20 +543,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -__printf(1, 4) static inline struct workqueue_struct * -alloc_ordered_workqueue_lockdep_map(const char *fmt, unsigned int flags, - struct lockdep_map *lockdep_map, ...) -{ - struct workqueue_struct *wq; - va_list args; - - va_start(args, lockdep_map); - wq = alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | flags, - 1, lockdep_map, args); - va_end(args); - - return wq; -} +#define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...) \ + alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, lockdep_map, ##args) #endif /** -- cgit v1.2.3 From 8fb9f31984bdc0aaa1b0f7c7e7a27bd3137f8744 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Thu, 1 Aug 2024 09:33:51 +0800 Subject: f2fs: clean up val{>>,<<}F2FS_BLKSIZE_BITS Use F2FS_BYTES_TO_BLK(bytes) and F2FS_BLK_TO_BYTES(blk) for cleanup Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 01bee2b289c2..f17f89a259e2 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -19,7 +19,6 @@ #define F2FS_BLKSIZE_BITS PAGE_SHIFT /* bits for F2FS_BLKSIZE */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ #define F2FS_EXTENSION_LEN 8 /* max size of extension */ -#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ @@ -28,6 +27,7 @@ #define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS) #define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS) #define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1) +#define F2FS_BLK_ALIGN(x) (F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1)) /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 -- cgit v1.2.3 From 013f510d1fce563f37f420bf231b065885a16f21 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 1 Aug 2024 21:31:21 +0800 Subject: dm: Remove unused declaration dm_get_rq_mapinfo() Commit ae6ad75e5c3c ("dm: remove unused dm_get_rq_mapinfo()") removed the implementation but leave declaration. Signed-off-by: Yue Haibing Reviewed-by: Damien Le Moal Signed-off-by: Mikulas Patocka --- include/linux/device-mapper.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 53ca3a913d06..8321f65897f3 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -524,7 +524,6 @@ int dm_post_suspending(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors); void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone); -union map_info *dm_get_rq_mapinfo(struct request *rq); #ifdef CONFIG_BLK_DEV_ZONED struct dm_report_zones_args { -- cgit v1.2.3 From 0e1d5731d3c1e2214249ef36dcd13ad51ad304cf Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 20 Aug 2024 08:35:30 +0206 Subject: printk: Check printk_deferred_enter()/_exit() usage Add validation that printk_deferred_enter()/_exit() are called in non-migration contexts. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-5-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/printk.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index b937cefcb31c..eee8e97da681 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -161,15 +161,16 @@ int _printk(const char *fmt, ...); */ __printf(1, 2) __cold int _printk_deferred(const char *fmt, ...); -extern void __printk_safe_enter(void); -extern void __printk_safe_exit(void); +extern void __printk_deferred_enter(void); +extern void __printk_deferred_exit(void); + /* * The printk_deferred_enter/exit macros are available only as a hack for * some code paths that need to defer all printk console printing. Interrupts * must be disabled for the deferred duration. */ -#define printk_deferred_enter __printk_safe_enter -#define printk_deferred_exit __printk_safe_exit +#define printk_deferred_enter() __printk_deferred_enter() +#define printk_deferred_exit() __printk_deferred_exit() /* * Please don't use printk_ratelimit(), because it shares ratelimiting state -- cgit v1.2.3 From b7049d88c1763d5f133b0e93e39da8e89a9f944b Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:32 +0206 Subject: printk: nbcon: Remove return value for write_atomic() The return value of write_atomic() does not provide any useful information. On the contrary, it makes things more complicated for the caller to appropriately deal with the information. Change write_atomic() to not have a return value. If the message did not get printed due to loss of ownership, the caller will notice this on its own. If ownership was not lost, it will be assumed that the driver successfully printed the message and the sequence number for that console will be incremented. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-7-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/console.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 31a8f5b85f5d..577b157fe4fb 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -345,7 +345,7 @@ struct console { struct hlist_node node; /* nbcon console specific members */ - bool (*write_atomic)(struct console *con, + void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; -- cgit v1.2.3 From 7d56936c1e5208b5eeea2a9a2f70e85248f6da49 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:33 +0206 Subject: printk: nbcon: Add detailed doc for write_atomic() The write_atomic() callback has special requirements and is allowed to use special helper functions. Provide detailed documentation of the callback so that a developer has a chance of implementing it correctly. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-8-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/console.h | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 577b157fe4fb..35c64ee3827b 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -303,7 +303,7 @@ struct nbcon_write_context { /** * struct console - The console descriptor structure * @name: The name of the console driver - * @write: Write callback to output messages (Optional) + * @write: Legacy write callback to output messages (Optional) * @read: Read callback for console input (Optional) * @device: The underlying TTY device driver (Optional) * @unblank: Callback to unblank the console (Optional) @@ -320,7 +320,6 @@ struct nbcon_write_context { * @data: Driver private data * @node: hlist node for the console list * - * @write_atomic: Write callback for atomic context * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print * @pbufs: Pointer to nbcon private buffer @@ -345,8 +344,34 @@ struct console { struct hlist_node node; /* nbcon console specific members */ - void (*write_atomic)(struct console *con, - struct nbcon_write_context *wctxt); + + /** + * @write_atomic: + * + * NBCON callback to write out text in any context. (Optional) + * + * This callback is called with the console already acquired. However, + * a higher priority context is allowed to take it over by default. + * + * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe() + * around any code where the takeover is not safe, for example, when + * manipulating the serial port registers. + * + * nbcon_enter_unsafe() will fail if the context has lost the console + * ownership in the meantime. In this case, the callback is no longer + * allowed to go forward. It must back out immediately and carefully. + * The buffer content is also no longer trusted since it no longer + * belongs to the context. + * + * The callback should allow the takeover whenever it is safe. It + * increases the chance to see messages when the system is in trouble. + * + * The callback can be called from any context (including NMI). + * Therefore it must avoid usage of any locking and instead rely + * on the console ownership for synchronization. + */ + void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); + atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct printk_buffers *pbufs; -- cgit v1.2.3 From 7a16a771890746b4060333d665ebe674945a3cfa Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:34 +0206 Subject: printk: nbcon: Add callbacks to synchronize with driver Console drivers typically must deal with access to the hardware via user input/output (such as an interactive login shell) and output of kernel messages via printk() calls. To provide the necessary synchronization, usually some driver-specific locking mechanism is used (for example, the port spinlock for uart serial consoles). Until now, usage of this driver-specific locking has been hidden from the printk-subsystem and implemented within the various console callbacks. However, nbcon consoles would need to use it even in the generic code. Add device_lock() and device_unlock() callback which will need to get implemented by nbcon consoles. The callbacks will use whatever synchronization mechanism the driver is using for itself. The minimum requirement is to prevent CPU migration. It would allow a context friendly acquiring of nbcon console ownership in non-emergency and non-panic context. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-9-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/console.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 35c64ee3827b..46b3c210b931 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -372,6 +372,49 @@ struct console { */ void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); + /** + * @device_lock: + * + * NBCON callback to begin synchronization with driver code. + * + * Console drivers typically must deal with access to the hardware + * via user input/output (such as an interactive login shell) and + * output of kernel messages via printk() calls. This callback is + * called by the printk-subsystem whenever it needs to synchronize + * with hardware access by the driver. It should be implemented to + * use whatever synchronization mechanism the driver is using for + * itself (for example, the port lock for uart serial consoles). + * + * The callback is always called from task context. It may use any + * synchronization method required by the driver. + * + * IMPORTANT: The callback MUST disable migration. The console driver + * may be using a synchronization mechanism that already takes + * care of this (such as spinlocks). Otherwise this function must + * explicitly call migrate_disable(). + * + * The flags argument is provided as a convenience to the driver. It + * will be passed again to device_unlock(). It can be ignored if the + * driver does not need it. + */ + void (*device_lock)(struct console *con, unsigned long *flags); + + /** + * @device_unlock: + * + * NBCON callback to finish synchronization with driver code. + * + * It is the counterpart to device_lock(). + * + * This callback is always called from task context. It must + * appropriately re-enable migration (depending on how device_lock() + * disabled migration). + * + * The flags argument is the value of the same variable that was + * passed to device_lock(). + */ + void (*device_unlock)(struct console *con, unsigned long flags); + atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct printk_buffers *pbufs; -- cgit v1.2.3 From 77e73c0687f5bc884fa1b0cb97aca912bcc01957 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:36 +0206 Subject: serial: core: Provide low-level functions to lock port It will be necessary at times for the uart nbcon console drivers to acquire the port lock directly (without the additional nbcon functionality of the port lock wrappers). These are special cases such as the implementation of the device_lock()/device_unlock() callbacks or for internal port lock wrapper synchronization. Provide low-level variants __uart_port_lock_irqsave() and __uart_port_unlock_irqrestore() for this purpose. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240820063001.36405-11-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/serial_core.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index aea25eef9a1a..8872cd21e70a 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -590,6 +590,24 @@ struct uart_port { void *private_data; /* generic platform data pointer */ }; +/* + * Only for console->device_lock()/_unlock() callbacks and internal + * port lock wrapper synchronization. + */ +static inline void __uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) +{ + spin_lock_irqsave(&up->lock, *flags); +} + +/* + * Only for console->device_lock()/_unlock() callbacks and internal + * port lock wrapper synchronization. + */ +static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) +{ + spin_unlock_irqrestore(&up->lock, flags); +} + /** * uart_port_lock - Lock the UART port * @up: Pointer to UART port structure -- cgit v1.2.3 From eabd4600dafacf9895184259373f2445ff748654 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:37 +0206 Subject: serial: core: Introduce wrapper to set @uart_port->cons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce uart_port_set_cons() as a wrapper to set @cons of a uart_port. The wrapper sets @cons under the port lock in order to prevent @cons from disappearing while another context is holding the port lock. This is necessary for a follow-up commit relating to the port lock wrappers, which rely on @cons not changing between lock and unlock. Signed-off-by: John Ogness Tested-by: Théo Lebrun # EyeQ5, AMBA-PL011 Acked-by: Greg Kroah-Hartman Reviewed-by: Petr Mladek Reviewed-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20240820063001.36405-12-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/serial_core.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 8872cd21e70a..2cf03ff2056a 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -608,6 +608,23 @@ static inline void __uart_port_unlock_irqrestore(struct uart_port *up, unsigned spin_unlock_irqrestore(&up->lock, flags); } +/** + * uart_port_set_cons - Safely set the @cons field for a uart + * @up: The uart port to set + * @con: The new console to set to + * + * This function must be used to set @up->cons. It uses the port lock to + * synchronize with the port lock wrappers in order to ensure that the console + * cannot change or disappear while another context is holding the port lock. + */ +static inline void uart_port_set_cons(struct uart_port *up, struct console *con) +{ + unsigned long flags; + + __uart_port_lock_irqsave(up, &flags); + up->cons = con; + __uart_port_unlock_irqrestore(up, flags); +} /** * uart_port_lock - Lock the UART port * @up: Pointer to UART port structure -- cgit v1.2.3 From dc219d8d858d95549543aa7a41b6e64a0da9e406 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:38 +0206 Subject: console: Improve console_srcu_read_flags() comments It was not clear when exactly console_srcu_read_flags() must be used vs. directly reading @console->flags. Refactor and clarify that console_srcu_read_flags() is only needed if the console is registered or the caller is in a context where the registration status of the console may change (due to another context). The function requires the caller holds @console_srcu, which will ensure that the caller sees an appropriate @flags value for the registered console and that exit/cleanup routines will not run if the console is in the process of unregistration. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-13-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/console.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 46b3c210b931..aafe3121b74e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -446,28 +446,34 @@ extern void console_list_unlock(void) __releases(console_mutex); extern struct hlist_head console_list; /** - * console_srcu_read_flags - Locklessly read the console flags + * console_srcu_read_flags - Locklessly read flags of a possibly registered + * console * @con: struct console pointer of console to read flags from * - * This function provides the necessary READ_ONCE() and data_race() - * notation for locklessly reading the console flags. The READ_ONCE() - * in this function matches the WRITE_ONCE() when @flags are modified - * for registered consoles with console_srcu_write_flags(). + * Locklessly reading @con->flags provides a consistent read value because + * there is at most one CPU modifying @con->flags and that CPU is using only + * read-modify-write operations to do so. * - * Only use this function to read console flags when locklessly - * iterating the console list via srcu. + * Requires console_srcu_read_lock to be held, which implies that @con might + * be a registered console. The purpose of holding console_srcu_read_lock is + * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED) + * and that no exit/cleanup routines will run if the console is currently + * undergoing unregistration. + * + * If the caller is holding the console_list_lock or it is _certain_ that + * @con is not and will not become registered, the caller may read + * @con->flags directly instead. * * Context: Any context. + * Return: The current value of the @con->flags field. */ static inline short console_srcu_read_flags(const struct console *con) { WARN_ON_ONCE(!console_srcu_read_lock_is_held()); /* - * Locklessly reading console->flags provides a consistent - * read value because there is at most one CPU modifying - * console->flags and that CPU is using only read-modify-write - * operations to do so. + * The READ_ONCE() matches the WRITE_ONCE() when @flags are modified + * for registered consoles with console_srcu_write_flags(). */ return data_race(READ_ONCE(con->flags)); } -- cgit v1.2.3 From adf6f37d142e14896115929f3892bbc0a86e25bf Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:39 +0206 Subject: nbcon: Add API to acquire context for non-printing operations Provide functions nbcon_device_try_acquire() and nbcon_device_release() which will try to acquire the nbcon console ownership with NBCON_PRIO_NORMAL and mark it unsafe for handover/takeover. These functions are to be used together with the device-specific locking when performing non-printing activities on the console device. They will allow synchronization against the atomic_write() callback which will be serialized, for higher priority contexts, only by acquiring the console context ownership. Pitfalls: The API requires to be called in a context with migration disabled because it uses per-CPU variables internally. The context is set unsafe for a takeover all the time. It guarantees full serialization against any atomic_write() caller except for the final flush in panic() which might try an unsafe takeover. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-14-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/console.h | 2 ++ include/linux/printk.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index aafe3121b74e..3706f944de46 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -322,6 +322,7 @@ struct nbcon_write_context { * * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print + * @nbcon_device_ctxt: Context available for non-printing operations * @pbufs: Pointer to nbcon private buffer */ struct console { @@ -417,6 +418,7 @@ struct console { atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; + struct nbcon_context __private nbcon_device_ctxt; struct printk_buffers *pbufs; }; diff --git a/include/linux/printk.h b/include/linux/printk.h index eee8e97da681..9687089f5ace 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -9,6 +9,8 @@ #include #include +struct console; + extern const char linux_banner[]; extern const char linux_proc_banner[]; @@ -198,6 +200,8 @@ extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; void printk_trigger_flush(void); void console_try_replay_all(void); +extern bool nbcon_device_try_acquire(struct console *con); +extern void nbcon_device_release(struct console *con); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -280,6 +284,16 @@ static inline void printk_trigger_flush(void) static inline void console_try_replay_all(void) { } + +static inline bool nbcon_device_try_acquire(struct console *con) +{ + return false; +} + +static inline void nbcon_device_release(struct console *con) +{ +} + #endif bool this_cpu_in_panic(void); -- cgit v1.2.3 From 4126f149c48b2ae757d11ea24675f7071ab22ebf Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:40 +0206 Subject: serial: core: Acquire nbcon context in port->lock wrapper Currently the port->lock wrappers uart_port_lock(), uart_port_unlock() (and their variants) only lock/unlock the spin_lock. If the port is an nbcon console that has implemented the write_atomic() callback, the wrappers must also acquire/release the console context and mark the region as unsafe. This allows general port->lock synchronization to be synchronized against the nbcon write_atomic() callback. Note that __uart_port_using_nbcon() relies on the port->lock being held while a console is added and removed from the console list (i.e. all uart nbcon drivers *must* take the port->lock in their device_lock() callbacks). Signed-off-by: John Ogness Acked-by: Greg Kroah-Hartman Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-15-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/serial_core.h | 82 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 2cf03ff2056a..4ab65874a850 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -625,6 +627,60 @@ static inline void uart_port_set_cons(struct uart_port *up, struct console *con) up->cons = con; __uart_port_unlock_irqrestore(up, flags); } + +/* Only for internal port lock wrapper usage. */ +static inline bool __uart_port_using_nbcon(struct uart_port *up) +{ + lockdep_assert_held_once(&up->lock); + + if (likely(!uart_console(up))) + return false; + + /* + * @up->cons is only modified under the port lock. Therefore it is + * certain that it cannot disappear here. + * + * @up->cons->node is added/removed from the console list under the + * port lock. Therefore it is certain that the registration status + * cannot change here, thus @up->cons->flags can be read directly. + */ + if (hlist_unhashed_lockless(&up->cons->node) || + !(up->cons->flags & CON_NBCON) || + !up->cons->write_atomic) { + return false; + } + + return true; +} + +/* Only for internal port lock wrapper usage. */ +static inline bool __uart_port_nbcon_try_acquire(struct uart_port *up) +{ + if (!__uart_port_using_nbcon(up)) + return true; + + return nbcon_device_try_acquire(up->cons); +} + +/* Only for internal port lock wrapper usage. */ +static inline void __uart_port_nbcon_acquire(struct uart_port *up) +{ + if (!__uart_port_using_nbcon(up)) + return; + + while (!nbcon_device_try_acquire(up->cons)) + cpu_relax(); +} + +/* Only for internal port lock wrapper usage. */ +static inline void __uart_port_nbcon_release(struct uart_port *up) +{ + if (!__uart_port_using_nbcon(up)) + return; + + nbcon_device_release(up->cons); +} + /** * uart_port_lock - Lock the UART port * @up: Pointer to UART port structure @@ -632,6 +688,7 @@ static inline void uart_port_set_cons(struct uart_port *up, struct console *con) static inline void uart_port_lock(struct uart_port *up) { spin_lock(&up->lock); + __uart_port_nbcon_acquire(up); } /** @@ -641,6 +698,7 @@ static inline void uart_port_lock(struct uart_port *up) static inline void uart_port_lock_irq(struct uart_port *up) { spin_lock_irq(&up->lock); + __uart_port_nbcon_acquire(up); } /** @@ -651,6 +709,7 @@ static inline void uart_port_lock_irq(struct uart_port *up) static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *flags) { spin_lock_irqsave(&up->lock, *flags); + __uart_port_nbcon_acquire(up); } /** @@ -661,7 +720,15 @@ static inline void uart_port_lock_irqsave(struct uart_port *up, unsigned long *f */ static inline bool uart_port_trylock(struct uart_port *up) { - return spin_trylock(&up->lock); + if (!spin_trylock(&up->lock)) + return false; + + if (!__uart_port_nbcon_try_acquire(up)) { + spin_unlock(&up->lock); + return false; + } + + return true; } /** @@ -673,7 +740,15 @@ static inline bool uart_port_trylock(struct uart_port *up) */ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long *flags) { - return spin_trylock_irqsave(&up->lock, *flags); + if (!spin_trylock_irqsave(&up->lock, *flags)) + return false; + + if (!__uart_port_nbcon_try_acquire(up)) { + spin_unlock_irqrestore(&up->lock, *flags); + return false; + } + + return true; } /** @@ -682,6 +757,7 @@ static inline bool uart_port_trylock_irqsave(struct uart_port *up, unsigned long */ static inline void uart_port_unlock(struct uart_port *up) { + __uart_port_nbcon_release(up); spin_unlock(&up->lock); } @@ -691,6 +767,7 @@ static inline void uart_port_unlock(struct uart_port *up) */ static inline void uart_port_unlock_irq(struct uart_port *up) { + __uart_port_nbcon_release(up); spin_unlock_irq(&up->lock); } @@ -701,6 +778,7 @@ static inline void uart_port_unlock_irq(struct uart_port *up) */ static inline void uart_port_unlock_irqrestore(struct uart_port *up, unsigned long flags) { + __uart_port_nbcon_release(up); spin_unlock_irqrestore(&up->lock, flags); } -- cgit v1.2.3 From 5dde3b7354133846079fa51e55e74ef90a836759 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:52 +0206 Subject: printk: nbcon: Add unsafe flushing on panic Add nbcon_atomic_flush_unsafe() to flush all nbcon consoles using the write_atomic() callback and allowing unsafe hostile takeovers. Call this at the end of panic() as a final attempt to flush any pending messages. Note that legacy consoles use unsafe methods for flushing from the beginning of panic (see bust_spinlocks()). Therefore, systems using both legacy and nbcon consoles may still fail to see panic messages due to unsafe legacy console usage. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-27-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/printk.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 9687089f5ace..2e083f01f8a3 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -202,6 +202,7 @@ void printk_trigger_flush(void); void console_try_replay_all(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); +void nbcon_atomic_flush_unsafe(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -294,6 +295,10 @@ static inline void nbcon_device_release(struct console *con) { } +static inline void nbcon_atomic_flush_unsafe(void) +{ +} + #endif bool this_cpu_in_panic(void); -- cgit v1.2.3 From e35a8884270bae11196eedf3b0a5bf22619f11f2 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Tue, 20 Aug 2024 08:35:55 +0206 Subject: printk: Coordinate direct printing in panic If legacy and nbcon consoles are registered and the nbcon consoles are allowed to flush (i.e. no boot consoles registered), the legacy consoles will no longer perform direct printing on the panic CPU until after the backtrace has been stored. This will give the safe nbcon consoles a chance to print the panic messages before allowing the unsafe legacy consoles to print. If no nbcon consoles are registered or they are not allowed to flush because boot consoles are registered, there is no change in behavior (i.e. legacy consoles will always attempt to print from the printk() caller context). Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-30-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/printk.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 2e083f01f8a3..eca9bb2ee637 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -200,6 +200,7 @@ extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; void printk_trigger_flush(void); void console_try_replay_all(void); +void printk_legacy_allow_panic_sync(void); extern bool nbcon_device_try_acquire(struct console *con); extern void nbcon_device_release(struct console *con); void nbcon_atomic_flush_unsafe(void); @@ -286,6 +287,10 @@ static inline void console_try_replay_all(void) { } +static inline void printk_legacy_allow_panic_sync(void) +{ +} + static inline bool nbcon_device_try_acquire(struct console *con) { return false; -- cgit v1.2.3 From ecb5e1aa82c86642ec1eaafefd4e317dfba3a238 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Aug 2024 08:35:57 +0206 Subject: printk: nbcon: Implement emergency sections In emergency situations (something has gone wrong but the system continues to operate), usually important information (such as a backtrace) is generated via printk(). This information should be pushed out to the consoles ASAP. Add per-CPU emergency nesting tracking because an emergency can arise while in an emergency situation. Add functions to mark the beginning and end of emergency sections where the urgent messages are generated. Perform direct console flushing at the emergency priority if the current CPU is in an emergency state and it is safe to do so. Note that the emergency state is not system-wide. While one CPU is in an emergency state, another CPU may attempt to print console messages at normal priority. Also note that printk() already attempts to flush consoles in the caller context for normal priority. However, follow-up changes will introduce printing kthreads, in which case the normal priority printk() calls will offload to the kthreads. Co-developed-by: John Ogness Signed-off-by: John Ogness Signed-off-by: Thomas Gleixner (Intel) Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240820063001.36405-32-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/console.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 3706f944de46..9a13f91b0c43 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -553,10 +553,14 @@ static inline bool console_is_registered(const struct console *con) hlist_for_each_entry(con, &console_list, node) #ifdef CONFIG_PRINTK +extern void nbcon_cpu_emergency_enter(void); +extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); #else +static inline void nbcon_cpu_emergency_enter(void) { } +static inline void nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } -- cgit v1.2.3 From d156263e247c334a8872578118e0709ed63c4806 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 21 Aug 2024 06:37:39 -1000 Subject: workqueue: Fix another htmldocs build warning Fix htmldocs build warning introduced by 9b59a85a84dc ("workqueue: Don't call va_start / va_end twice"). Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell Cc: Matthew Brost --- include/linux/workqueue.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index f3cc15940b4d..59c2695e12e7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -534,7 +534,7 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @lockdep_map: user-defined lockdep_map - * @...: args for @fmt + * @args: args for @fmt * * Same as alloc_ordered_workqueue but with the a user-define lockdep_map. * Useful for workqueues created with the same purpose and to avoid leaking a @@ -544,7 +544,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...) \ - alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, lockdep_map, ##args) + alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), \ + 1, lockdep_map, ##args) #endif /** -- cgit v1.2.3 From 496ddd19a0fad22f250fc7a7b7a8000155418934 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 8 Aug 2024 16:22:28 -0700 Subject: bpf: extract iterator argument type and name validation logic Verifier enforces that all iterator structs are named `bpf_iter_` and that whenever iterator is passed to a kfunc it's passed as a valid PTR -> STRUCT chain (with potentially const modifiers in between). We'll need this check for upcoming changes, so instead of duplicating the logic, extract it into a helper function. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240808232230.2848712-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index cffb43133c68..b8a583194c4a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -580,6 +580,7 @@ bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); +int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -654,6 +655,10 @@ static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, { return false; } +static inline int btf_check_iter_arg(struct btf *btf, const struct btf_type *func, int arg_idx) +{ + return -EOPNOTSUPP; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) -- cgit v1.2.3 From 74b1e94e94ea369923e5656eeb10b26b16591327 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 20 Aug 2024 13:51:19 -0700 Subject: net: repack struct netdev_queue Adding the NAPI pointer to struct netdev_queue made it grow into another cacheline, even though there was 44 bytes of padding available. The struct was historically grouped as follows: /* read-mostly stuff (align) */ /* ... random control path fields ... */ /* write-mostly stuff (align) */ /* ... 40 byte hole ... */ /* struct dql (align) */ It seems that people want to add control path fields after the read only fields. struct dql looks pretty innocent but it forces its own alignment and nothing indicates that there is a lot of empty space above it. Move dql above the xmit_lock. This shifts the empty space to the end of the struct rather than in the middle of it. Move two example fields there to set an example. Hopefully people will now add new fields at the end of the struct. A lot of the read-only stuff is also control path-only, but if we move it all we'll have another hole in the middle. Before: /* size: 384, cachelines: 6, members: 16 */ /* sum members: 284, holes: 3, sum holes: 100 */ After: /* size: 320, cachelines: 5, members: 16 */ /* sum members: 284, holes: 1, sum holes: 8 */ Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240820205119.1321322-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0ef3eaa23f4b..614ec5d3d75b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -644,9 +644,6 @@ struct netdev_queue { struct Qdisc __rcu *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; -#endif -#if defined(CONFIG_XPS) && defined(CONFIG_NUMA) - int numa_node; #endif unsigned long tx_maxrate; /* @@ -660,13 +657,13 @@ struct netdev_queue { #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif - /* NAPI instance for the queue - * Readers and writers must hold RTNL - */ - struct napi_struct *napi; + /* * write-mostly part */ +#ifdef CONFIG_BQL + struct dql dql; +#endif spinlock_t _xmit_lock ____cacheline_aligned_in_smp; int xmit_lock_owner; /* @@ -676,8 +673,16 @@ struct netdev_queue { unsigned long state; -#ifdef CONFIG_BQL - struct dql dql; +/* + * slow- / control-path part + */ + /* NAPI instance for the queue + * Readers and writers must hold RTNL + */ + struct napi_struct *napi; + +#if defined(CONFIG_XPS) && defined(CONFIG_NUMA) + int numa_node; #endif } ____cacheline_aligned_in_smp; -- cgit v1.2.3 From ba0fb44aed47693cc2482427f63ba6cd19051327 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Sun, 11 Aug 2024 10:09:35 +0300 Subject: dma-mapping: replace zone_dma_bits by zone_dma_limit The hardware DMA limit might not be power of 2. When RAM range starts above 0, say 4GB, DMA limit of 30 bits should end at 5GB. A single high bit can not encode this limit. Use a plain address for the DMA zone limit instead. Since the DMA zone can now potentially span beyond 4GB physical limit of DMA32, make sure to use DMA zone for GFP_DMA32 allocations in that case. Signed-off-by: Catalin Marinas Co-developed-by: Baruch Siach Signed-off-by: Baruch Siach Reviewed-by: Catalin Marinas Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig --- include/linux/dma-direct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index edbe13d00776..d7e30d4f7503 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -12,7 +12,7 @@ #include #include -extern unsigned int zone_dma_bits; +extern u64 zone_dma_limit; /* * Record the mapping of CPU physical to DMA addresses for a given region. -- cgit v1.2.3 From b5c58b2fdc427e7958412ecb2de2804a1f7c1572 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 24 Jul 2024 21:04:49 +0300 Subject: dma-mapping: direct calls for dma-iommu Directly call into dma-iommu just like we have been doing for dma-direct for a while. This avoids the indirect call overhead for IOMMU ops and removes the need to have DMA ops entirely for many common configurations. Signed-off-by: Leon Romanovsky Signed-off-by: Leon Romanovsky Acked-by: Greg Kroah-Hartman Acked-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/device.h | 5 ++ include/linux/dma-map-ops.h | 13 ---- include/linux/iommu-dma.h | 147 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 13 deletions(-) create mode 100644 include/linux/iommu-dma.h (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 34eb20f5966f..1c5280d28bc3 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -707,6 +707,8 @@ struct device_physical_location { * for dma allocations. This flag is managed by the dma ops * instance from ->dma_supported. * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers. + * @dma_iommu: Device is using default IOMMU implementation for DMA and + * doesn't rely on dma_ops structure. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information @@ -822,6 +824,9 @@ struct device { #ifdef CONFIG_DMA_NEED_SYNC bool dma_skip_sync:1; #endif +#ifdef CONFIG_IOMMU_DMA + bool dma_iommu:1; +#endif }; /** diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 02a1c825896b..077b15c93bb8 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -13,20 +13,7 @@ struct cma; struct iommu_ops; -/* - * Values for struct dma_map_ops.flags: - * - * DMA_F_PCI_P2PDMA_SUPPORTED: Indicates the dma_map_ops implementation can - * handle PCI P2PDMA pages in the map_sg/unmap_sg operation. - * DMA_F_CAN_SKIP_SYNC: DMA sync operations can be skipped if the device is - * coherent and it's not an SWIOTLB buffer. - */ -#define DMA_F_PCI_P2PDMA_SUPPORTED (1 << 0) -#define DMA_F_CAN_SKIP_SYNC (1 << 1) - struct dma_map_ops { - unsigned int flags; - void *(*alloc)(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h new file mode 100644 index 000000000000..d30a58bf00fd --- /dev/null +++ b/include/linux/iommu-dma.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + * + * DMA operations that map physical memory through IOMMU. + */ +#ifndef _LINUX_IOMMU_DMA_H +#define _LINUX_IOMMU_DMA_H + +#include + +#ifdef CONFIG_IOMMU_DMA +dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs); +void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, unsigned long attrs); +int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs); +void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, unsigned long attrs); +void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, + gfp_t gfp, unsigned long attrs); +int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +int iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); +unsigned long iommu_dma_get_merge_boundary(struct device *dev); +size_t iommu_dma_opt_mapping_size(void); +size_t iommu_dma_max_mapping_size(struct device *dev); +void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t handle, unsigned long attrs); +dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs); +void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, + size_t size, enum dma_data_direction dir, unsigned long attrs); +struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); +void iommu_dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir); +void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir); +void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir); +void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir); +void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir); +#else +static inline dma_addr_t iommu_dma_map_page(struct device *dev, + struct page *page, unsigned long offset, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + return DMA_MAPPING_ERROR; +} +static inline void iommu_dma_unmap_page(struct device *dev, + dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + return -EINVAL; +} +static inline void iommu_dma_unmap_sg(struct device *dev, + struct scatterlist *sg, int nents, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline void *iommu_dma_alloc(struct device *dev, size_t size, + dma_addr_t *handle, gfp_t gfp, unsigned long attrs) +{ + return NULL; +} +static inline int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + return -EINVAL; +} +static inline int iommu_dma_get_sgtable(struct device *dev, + struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, + size_t size, unsigned long attrs) +{ + return -EINVAL; +} +static inline unsigned long iommu_dma_get_merge_boundary(struct device *dev) +{ + return 0; +} +static inline size_t iommu_dma_opt_mapping_size(void) +{ + return 0; +} +static inline size_t iommu_dma_max_mapping_size(struct device *dev) +{ + return 0; +} +static inline void iommu_dma_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t handle, unsigned long attrs) +{ +} +static inline dma_addr_t iommu_dma_map_resource(struct device *dev, + phys_addr_t phys, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + return DMA_MAPPING_ERROR; +} +static inline void iommu_dma_unmap_resource(struct device *dev, + dma_addr_t handle, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ +} +static inline struct sg_table * +iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) +{ + return NULL; +} +static inline void iommu_dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ +} +static inline void iommu_dma_sync_single_for_cpu(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction dir) +{ +} +static inline void iommu_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) +{ +} +static inline void iommu_dma_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nelems, + enum dma_data_direction dir) +{ +} +static inline void iommu_dma_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nelems, + enum dma_data_direction dir) +{ +} +#endif /* CONFIG_IOMMU_DMA */ +#endif /* _LINUX_IOMMU_DMA_H */ -- cgit v1.2.3 From ae010757a55b57c8b82628e8ea9b7da2269131d9 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 22 Aug 2024 01:41:07 -0700 Subject: bpf: rename nocsr -> bpf_fastcall in verifier Attribute used by LLVM implementation of the feature had been changed from no_caller_saved_registers to bpf_fastcall (see [1]). This commit replaces references to nocsr by references to bpf_fastcall to keep LLVM and Kernel parts in sync. [1] https://github.com/llvm/llvm-project/pull/105417 Acked-by: Yonghong Song Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240822084112.3257995-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++--- include/linux/bpf_verifier.h | 18 +++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f0192c173ed8..00dc4dd28cbd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -808,12 +808,12 @@ struct bpf_func_proto { bool gpl_only; bool pkt_access; bool might_sleep; - /* set to true if helper follows contract for gcc/llvm - * attribute no_caller_saved_registers: + /* set to true if helper follows contract for llvm + * attribute bpf_fastcall: * - void functions do not scratch r0 * - functions taking N arguments scratch only registers r1-rN */ - bool allow_nocsr; + bool allow_fastcall; enum bpf_return_type ret_type; union { struct { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5cea15c81b8a..634a302a39e3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -577,13 +577,13 @@ struct bpf_insn_aux_data { bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ /* true if STX or LDX instruction is a part of a spill/fill - * pattern for a no_caller_saved_registers call. + * pattern for a bpf_fastcall call. */ - u8 nocsr_pattern:1; + u8 fastcall_pattern:1; /* for CALL instructions, a number of spill/fill pairs in the - * no_caller_saved_registers pattern. + * bpf_fastcall pattern. */ - u8 nocsr_spills_num:3; + u8 fastcall_spills_num:3; /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ @@ -653,10 +653,10 @@ struct bpf_subprog_info { u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; - /* offsets in range [stack_depth .. nocsr_stack_off) - * are used for no_caller_saved_registers spills and fills. + /* offsets in range [stack_depth .. fastcall_stack_off) + * are used for bpf_fastcall spills and fills. */ - s16 nocsr_stack_off; + s16 fastcall_stack_off; bool has_tail_call: 1; bool tail_call_reachable: 1; bool has_ld_abs: 1; @@ -664,8 +664,8 @@ struct bpf_subprog_info { bool is_async_cb: 1; bool is_exception_cb: 1; bool args_cached: 1; - /* true if nocsr stack region is used by functions that can't be inlined */ - bool keep_nocsr_stack: 1; + /* true if bpf_fastcall stack region is used by functions that can't be inlined */ + bool keep_fastcall_stack: 1; u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; -- cgit v1.2.3 From aa35e56a5217b86f9c05420c36c908baf3b2df5f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 19 Aug 2024 18:00:19 +0200 Subject: thermal: core: Introduce .should_bind() thermal zone callback The current design of the code binding cooling devices to trip points in thermal zones is convoluted and hard to follow. Namely, a driver that registers a thermal zone can provide .bind() and .unbind() operations for it, which are required to call either thermal_bind_cdev_to_trip() and thermal_unbind_cdev_from_trip(), respectively, or thermal_zone_bind_cooling_device() and thermal_zone_unbind_cooling_device(), respectively, for every relevant trip point and the given cooling device. Moreover, if .bind() is provided and .unbind() is not, the cleanup necessary during the removal of a thermal zone or a cooling device may not be carried out. In other words, the core relies on the thermal zone owners to do the right thing, which is error prone and far from obvious, even though all of that is not really necessary. Specifically, if the core could ask the thermal zone owner, through a special thermal zone callback, whether or not a given cooling device should be bound to a given trip point in the given thermal zone, it might as well carry out all of the binding and unbinding by itself. In particular, the unbinding can be done automatically without involving the thermal zone owner at all because all of the thermal instances associated with a thermal zone or cooling device going away must be deleted regardless. Accordingly, introduce a new thermal zone operation, .should_bind(), that can be invoked by the thermal core for a given thermal zone, trip point and cooling device combination in order to check whether or not the cooling device should be bound to the trip point at hand. It takes an additional cooling_spec argument allowing the thermal zone owner to specify the highest and lowest cooling states of the cooling device and its weight for the given trip point binding. Make the thermal core use this operation, if present, in the absence of .bind() and .unbind(). Note that .should_bind() will be called under the thermal zone lock. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Zhang Rui Acked-by: Huisong Li Reviewed-by: Daniel Lezcano Link: https://patch.msgid.link/9334403.CDJkKcVGEf@rjwysocki.net --- include/linux/thermal.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index ba6d74c11620..9c5d1f14224f 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -85,11 +85,21 @@ struct thermal_trip { struct thermal_zone_device; +struct cooling_spec { + unsigned long upper; /* Highest cooling state */ + unsigned long lower; /* Lowest cooling state */ + unsigned int weight; /* Cooling device weight */ +}; + struct thermal_zone_device_ops { int (*bind) (struct thermal_zone_device *, struct thermal_cooling_device *); int (*unbind) (struct thermal_zone_device *, struct thermal_cooling_device *); + bool (*should_bind) (struct thermal_zone_device *, + const struct thermal_trip *, + struct thermal_cooling_device *, + struct cooling_spec *); int (*get_temp) (struct thermal_zone_device *, int *); int (*set_trips) (struct thermal_zone_device *, int, int); int (*change_mode) (struct thermal_zone_device *, -- cgit v1.2.3 From 2fb85633641770d55b50c934db6b4fb02411a1fe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 19 Aug 2024 18:05:00 +0200 Subject: thermal: core: Unexport thermal_bind_cdev_to_trip() and thermal_unbind_cdev_from_trip() Since thermal_bind_cdev_to_trip() and thermal_unbind_cdev_from_trip() are only called locally in the thermal core now, they can be static, so change their definitions accordingly and drop their headers from the global thermal header file. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Zhang Rui Acked-by: Huisong Li Reviewed-by: Daniel Lezcano Link: https://patch.msgid.link/3512161.QJadu78ljV@rjwysocki.net --- include/linux/thermal.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 9c5d1f14224f..6599a26847f7 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -247,18 +247,10 @@ const char *thermal_zone_device_type(struct thermal_zone_device *tzd); int thermal_zone_device_id(struct thermal_zone_device *tzd); struct device *thermal_zone_device(struct thermal_zone_device *tzd); -int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz, - const struct thermal_trip *trip, - struct thermal_cooling_device *cdev, - unsigned long upper, unsigned long lower, - unsigned int weight); int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *, unsigned long, unsigned long, unsigned int); -int thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz, - const struct thermal_trip *trip, - struct thermal_cooling_device *cdev); int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *, -- cgit v1.2.3 From d51e783c17bab0c139bf78d6bd9d1f66673f7903 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 16 Aug 2024 17:43:06 +0200 Subject: lsm: count the LSMs enabled at compile time These macros are a clever trick to determine a count of the number of LSMs that are enabled in the config to ascertain the maximum number of static calls that need to be configured per LSM hook. Without this one would need to generate static calls for the total number of LSMs in the kernel (even if they are not compiled) times the number of LSM hooks which ends up being quite wasteful. Tested-by: Guenter Roeck Suggested-by: Kui-Feng Lee Suggested-by: Andrii Nakryiko Reviewed-by: Kees Cook Reviewed-by: Casey Schaufler Reviewed-by: John Johansen Acked-by: Casey Schaufler Acked-by: Song Liu Acked-by: Andrii Nakryiko Nacked-by: Tetsuo Handa Signed-off-by: KP Singh [PM: added IPE to the count during merge] Signed-off-by: Paul Moore --- include/linux/args.h | 6 +-- include/linux/lsm_count.h | 135 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 include/linux/lsm_count.h (limited to 'include/linux') diff --git a/include/linux/args.h b/include/linux/args.h index 8ff60a54eb7d..2e8e65d975c7 100644 --- a/include/linux/args.h +++ b/include/linux/args.h @@ -17,9 +17,9 @@ * that as _n. */ -/* This counts to 12. Any more, it will return 13th argument. */ -#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n -#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) +/* This counts to 15. Any more, it will return 16th argument. */ +#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _n, X...) _n +#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) /* Concatenate two parameters, but allow them to be expanded beforehand. */ #define __CONCAT(a, b) a ## b diff --git a/include/linux/lsm_count.h b/include/linux/lsm_count.h new file mode 100644 index 000000000000..16eb49761b25 --- /dev/null +++ b/include/linux/lsm_count.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2023 Google LLC. + */ + +#ifndef __LINUX_LSM_COUNT_H +#define __LINUX_LSM_COUNT_H + +#include + +#ifdef CONFIG_SECURITY + +/* + * Macros to count the number of LSMs enabled in the kernel at compile time. + */ + +/* + * Capabilities is enabled when CONFIG_SECURITY is enabled. + */ +#if IS_ENABLED(CONFIG_SECURITY) +#define CAPABILITIES_ENABLED 1, +#else +#define CAPABILITIES_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_SELINUX) +#define SELINUX_ENABLED 1, +#else +#define SELINUX_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_SMACK) +#define SMACK_ENABLED 1, +#else +#define SMACK_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_APPARMOR) +#define APPARMOR_ENABLED 1, +#else +#define APPARMOR_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_TOMOYO) +#define TOMOYO_ENABLED 1, +#else +#define TOMOYO_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_YAMA) +#define YAMA_ENABLED 1, +#else +#define YAMA_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_LOADPIN) +#define LOADPIN_ENABLED 1, +#else +#define LOADPIN_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_LOCKDOWN_LSM) +#define LOCKDOWN_ENABLED 1, +#else +#define LOCKDOWN_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_SAFESETID) +#define SAFESETID_ENABLED 1, +#else +#define SAFESETID_ENABLED +#endif + +#if IS_ENABLED(CONFIG_BPF_LSM) +#define BPF_LSM_ENABLED 1, +#else +#define BPF_LSM_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_LANDLOCK) +#define LANDLOCK_ENABLED 1, +#else +#define LANDLOCK_ENABLED +#endif + +#if IS_ENABLED(CONFIG_IMA) +#define IMA_ENABLED 1, +#else +#define IMA_ENABLED +#endif + +#if IS_ENABLED(CONFIG_EVM) +#define EVM_ENABLED 1, +#else +#define EVM_ENABLED +#endif + +#if IS_ENABLED(CONFIG_SECURITY_IPE) +#define IPE_ENABLED 1, +#else +#define IPE_ENABLED +#endif + +/* + * There is a trailing comma that we need to be accounted for. This is done by + * using a skipped argument in __COUNT_LSMS + */ +#define __COUNT_LSMS(skipped_arg, args...) COUNT_ARGS(args...) +#define COUNT_LSMS(args...) __COUNT_LSMS(args) + +#define MAX_LSM_COUNT \ + COUNT_LSMS( \ + CAPABILITIES_ENABLED \ + SELINUX_ENABLED \ + SMACK_ENABLED \ + APPARMOR_ENABLED \ + TOMOYO_ENABLED \ + YAMA_ENABLED \ + LOADPIN_ENABLED \ + LOCKDOWN_ENABLED \ + SAFESETID_ENABLED \ + BPF_LSM_ENABLED \ + LANDLOCK_ENABLED \ + IMA_ENABLED \ + EVM_ENABLED \ + IPE_ENABLED) + +#else + +#define MAX_LSM_COUNT 0 + +#endif /* CONFIG_SECURITY */ + +#endif /* __LINUX_LSM_COUNT_H */ -- cgit v1.2.3 From 417c5643cd67a55f424b203b492082035d0236c3 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 16 Aug 2024 17:43:07 +0200 Subject: lsm: replace indirect LSM hook calls with static calls LSM hooks are currently invoked from a linked list as indirect calls which are invoked using retpolines as a mitigation for speculative attacks (Branch History / Target injection) and add extra overhead which is especially bad in kernel hot paths: security_file_ioctl: 0xff...0320 <+0>: endbr64 0xff...0324 <+4>: push %rbp 0xff...0325 <+5>: push %r15 0xff...0327 <+7>: push %r14 0xff...0329 <+9>: push %rbx 0xff...032a <+10>: mov %rdx,%rbx 0xff...032d <+13>: mov %esi,%ebp 0xff...032f <+15>: mov %rdi,%r14 0xff...0332 <+18>: mov $0xff...7030,%r15 0xff...0339 <+25>: mov (%r15),%r15 0xff...033c <+28>: test %r15,%r15 0xff...033f <+31>: je 0xff...0358 0xff...0341 <+33>: mov 0x18(%r15),%r11 0xff...0345 <+37>: mov %r14,%rdi 0xff...0348 <+40>: mov %ebp,%esi 0xff...034a <+42>: mov %rbx,%rdx 0xff...034d <+45>: call 0xff...2e0 <__x86_indirect_thunk_array+352> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Indirect calls that use retpolines leading to overhead, not just due to extra instruction but also branch misses. 0xff...0352 <+50>: test %eax,%eax 0xff...0354 <+52>: je 0xff...0339 0xff...0356 <+54>: jmp 0xff...035a 0xff...0358 <+56>: xor %eax,%eax 0xff...035a <+58>: pop %rbx 0xff...035b <+59>: pop %r14 0xff...035d <+61>: pop %r15 0xff...035f <+63>: pop %rbp 0xff...0360 <+64>: jmp 0xff...47c4 <__x86_return_thunk> The indirect calls are not really needed as one knows the addresses of enabled LSM callbacks at boot time and only the order can possibly change at boot time with the lsm= kernel command line parameter. An array of static calls is defined per LSM hook and the static calls are updated at boot time once the order has been determined. With the hook now exposed as a static call, one can see that the retpolines are no longer there and the LSM callbacks are invoked directly: security_file_ioctl: 0xff...0ca0 <+0>: endbr64 0xff...0ca4 <+4>: nopl 0x0(%rax,%rax,1) 0xff...0ca9 <+9>: push %rbp 0xff...0caa <+10>: push %r14 0xff...0cac <+12>: push %rbx 0xff...0cad <+13>: mov %rdx,%rbx 0xff...0cb0 <+16>: mov %esi,%ebp 0xff...0cb2 <+18>: mov %rdi,%r14 0xff...0cb5 <+21>: jmp 0xff...0cc7 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Static key enabled for SELinux 0xffffffff818f0cb7 <+23>: jmp 0xff...0cde ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Static key enabled for BPF LSM. This is something that is changed to default to false to avoid the existing side effect issues of BPF LSM [1] in a subsequent patch. 0xff...0cb9 <+25>: xor %eax,%eax 0xff...0cbb <+27>: xchg %ax,%ax 0xff...0cbd <+29>: pop %rbx 0xff...0cbe <+30>: pop %r14 0xff...0cc0 <+32>: pop %rbp 0xff...0cc1 <+33>: cs jmp 0xff...0000 <__x86_return_thunk> 0xff...0cc7 <+39>: endbr64 0xff...0ccb <+43>: mov %r14,%rdi 0xff...0cce <+46>: mov %ebp,%esi 0xff...0cd0 <+48>: mov %rbx,%rdx 0xff...0cd3 <+51>: call 0xff...3230 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Direct call to SELinux. 0xff...0cd8 <+56>: test %eax,%eax 0xff...0cda <+58>: jne 0xff...0cbd 0xff...0cdc <+60>: jmp 0xff...0cb7 0xff...0cde <+62>: endbr64 0xff...0ce2 <+66>: mov %r14,%rdi 0xff...0ce5 <+69>: mov %ebp,%esi 0xff...0ce7 <+71>: mov %rbx,%rdx 0xff...0cea <+74>: call 0xff...e220 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Direct call to BPF LSM. 0xff...0cef <+79>: test %eax,%eax 0xff...0cf1 <+81>: jne 0xff...0cbd 0xff...0cf3 <+83>: jmp 0xff...0cb9 0xff...0cf5 <+85>: endbr64 0xff...0cf9 <+89>: mov %r14,%rdi 0xff...0cfc <+92>: mov %ebp,%esi 0xff...0cfe <+94>: mov %rbx,%rdx 0xff...0d01 <+97>: pop %rbx 0xff...0d02 <+98>: pop %r14 0xff...0d04 <+100>: pop %rbp 0xff...0d05 <+101>: ret 0xff...0d06 <+102>: int3 0xff...0d07 <+103>: int3 0xff...0d08 <+104>: int3 0xff...0d09 <+105>: int3 While this patch uses static_branch_unlikely indicating that an LSM hook is likely to be not present. In most cases this is still a better choice as even when an LSM with one hook is added, empty slots are created for all LSM hooks (especially when many LSMs that do not initialize most hooks are present on the system). There are some hooks that don't use the call_int_hook or call_void_hook. These hooks are updated to use a new macro called lsm_for_each_hook where the lsm_callback is directly invoked as an indirect call. Below are results of the relevant Unixbench system benchmarks with BPF LSM and SELinux enabled with default policies enabled with and without these patches. Benchmark Delta(%): (+ is better) ========================================================================== Execl Throughput +1.9356 File Write 1024 bufsize 2000 maxblocks +6.5953 Pipe Throughput +9.5499 Pipe-based Context Switching +3.0209 Process Creation +2.3246 Shell Scripts (1 concurrent) +1.4975 System Call Overhead +2.7815 System Benchmarks Index Score (Partial Only): +3.4859 In the best case, some syscalls like eventfd_create benefitted to about ~10%. Tested-by: Guenter Roeck Reviewed-by: Casey Schaufler Reviewed-by: Kees Cook Acked-by: Song Liu Acked-by: Andrii Nakryiko Signed-off-by: KP Singh Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 52 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 4687985b9175..090d1d3e19fe 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -30,19 +30,47 @@ #include #include #include +#include +#include +#include +#include union security_list_options { #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__); #include "lsm_hook_defs.h" #undef LSM_HOOK + void *lsm_func_addr; }; -struct security_hook_heads { - #define LSM_HOOK(RET, DEFAULT, NAME, ...) struct hlist_head NAME; - #include "lsm_hook_defs.h" - #undef LSM_HOOK +/* + * @key: static call key as defined by STATIC_CALL_KEY + * @trampoline: static call trampoline as defined by STATIC_CALL_TRAMP + * @hl: The security_hook_list as initialized by the owning LSM. + * @active: Enabled when the static call has an LSM hook associated. + */ +struct lsm_static_call { + struct static_call_key *key; + void *trampoline; + struct security_hook_list *hl; + /* this needs to be true or false based on what the key defaults to */ + struct static_key_false *active; } __randomize_layout; +/* + * Table of the static calls for each LSM hook. + * Once the LSMs are initialized, their callbacks will be copied to these + * tables such that the calls are filled backwards (from last to first). + * This way, we can jump directly to the first used static call, and execute + * all of them after. This essentially makes the entry point + * dynamic to adapt the number of static calls to the number of callbacks. + */ +struct lsm_static_calls_table { + #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ + struct lsm_static_call NAME[MAX_LSM_COUNT]; + #include + #undef LSM_HOOK +} __packed __randomize_layout; + /** * struct lsm_id - Identify a Linux Security Module. * @lsm: name of the LSM, must be approved by the LSM maintainers @@ -58,10 +86,14 @@ struct lsm_id { /* * Security module hook list structure. * For use with generic list macros for common operations. + * + * struct security_hook_list - Contents of a cacheable, mappable object. + * @scalls: The beginning of the array of static calls assigned to this hook. + * @hook: The callback for the hook. + * @lsm: The name of the lsm that owns this hook. */ struct security_hook_list { - struct hlist_node list; - struct hlist_head *head; + struct lsm_static_call *scalls; union security_list_options hook; const struct lsm_id *lsmid; } __randomize_layout; @@ -98,8 +130,11 @@ struct lsm_blob_sizes { * care of the common case and reduces the amount of * text involved. */ -#define LSM_HOOK_INIT(HEAD, HOOK) \ - { .head = &security_hook_heads.HEAD, .hook = { .HEAD = HOOK } } +#define LSM_HOOK_INIT(NAME, HOOK) \ + { \ + .scalls = static_calls_table.NAME, \ + .hook = { .NAME = HOOK } \ + } extern void security_add_hooks(struct security_hook_list *hooks, int count, const struct lsm_id *lsmid); @@ -134,7 +169,6 @@ struct lsm_info { /* DO NOT tamper with these variables outside of the LSM framework */ extern char *lsm_names; -extern struct security_hook_heads security_hook_heads; extern struct lsm_static_calls_table static_calls_table __ro_after_init; extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; -- cgit v1.2.3 From 6c65914a40d33e96ceedc757be0129139cdf7852 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 16 Aug 2024 11:54:23 -0700 Subject: Input: keypad-nomadik-ske - remove the driver The users of this driver were removed in 2013 in commit 28633c54bda6 ("ARM: ux500: Rip out keypad initialisation which is no longer used"). Remove the driver as well. Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/Zr-gX0dfN4te_8VG@google.com Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/keypad-nomadik-ske.h | 50 ------------------------ 1 file changed, 50 deletions(-) delete mode 100644 include/linux/platform_data/keypad-nomadik-ske.h (limited to 'include/linux') diff --git a/include/linux/platform_data/keypad-nomadik-ske.h b/include/linux/platform_data/keypad-nomadik-ske.h deleted file mode 100644 index 7efabbca1dca..000000000000 --- a/include/linux/platform_data/keypad-nomadik-ske.h +++ /dev/null @@ -1,50 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson SA 2010 - * - * Author: Naveen Kumar Gaddipati - * - * ux500 Scroll key and Keypad Encoder (SKE) header - */ - -#ifndef __SKE_H -#define __SKE_H - -#include - -/* register definitions for SKE peripheral */ -#define SKE_CR 0x00 -#define SKE_VAL0 0x04 -#define SKE_VAL1 0x08 -#define SKE_DBCR 0x0C -#define SKE_IMSC 0x10 -#define SKE_RIS 0x14 -#define SKE_MIS 0x18 -#define SKE_ICR 0x1C - -/* - * Keypad module - */ - -/** - * struct keypad_platform_data - structure for platform specific data - * @init: pointer to keypad init function - * @exit: pointer to keypad deinitialisation function - * @keymap_data: matrix scan code table for keycodes - * @krow: maximum number of rows - * @kcol: maximum number of columns - * @debounce_ms: platform specific debounce time - * @no_autorepeat: flag for auto repetition - * @wakeup_enable: allow waking up the system - */ -struct ske_keypad_platform_data { - int (*init)(void); - int (*exit)(void); - const struct matrix_keymap_data *keymap_data; - u8 krow; - u8 kcol; - u8 debounce_ms; - bool no_autorepeat; - bool wakeup_enable; -}; -#endif /*__SKE_KPD_H*/ -- cgit v1.2.3 From 5bc46b49c828a6dfaab80b71ecb63fe76a1096d2 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 22 Jul 2024 14:02:20 +0200 Subject: nvme-tcp: check for invalidated or revoked key key_lookup() will always return a key, even if that key is revoked or invalidated. So check for invalid keys before continuing. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme-keyring.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h index e10333d78dbb..19d2b256180f 100644 --- a/include/linux/nvme-keyring.h +++ b/include/linux/nvme-keyring.h @@ -12,7 +12,7 @@ key_serial_t nvme_tls_psk_default(struct key *keyring, const char *hostnqn, const char *subnqn); key_serial_t nvme_keyring_id(void); - +struct key *nvme_tls_key_lookup(key_serial_t key_id); #else static inline key_serial_t nvme_tls_psk_default(struct key *keyring, @@ -24,5 +24,9 @@ static inline key_serial_t nvme_keyring_id(void) { return 0; } +static inline struct key *nvme_tls_key_lookup(key_serial_t key_id) +{ + return ERR_PTR(-ENOTSUPP); +} #endif /* !CONFIG_NVME_KEYRING */ #endif /* _NVME_KEYRING_H */ -- cgit v1.2.3 From 559048d156ff3391c4b793779a824c9193e20442 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 5 Aug 2024 14:43:44 -0700 Subject: string: Check for "nonstring" attribute on strscpy() arguments GCC already checks for arguments that are marked with the "nonstring"[1] attribute when used on standard C String API functions (e.g. strcpy). Gain this compile-time checking also for the kernel's primary string copying function, strscpy(). Note that Clang has neither "nonstring" nor __builtin_has_attribute(). Link: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-nonstring-variable-attribute [1] Reviewed-by: Miguel Ojeda Tested-by: Miguel Ojeda Link: https://lore.kernel.org/r/20240805214340.work.339-kees@kernel.org Signed-off-by: Kees Cook --- include/linux/compiler.h | 3 +++ include/linux/compiler_types.h | 7 +++++++ include/linux/string.h | 12 ++++++++---- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 2df665fa2964..ec55bcce4146 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -242,6 +242,9 @@ static inline void *offset_to_ptr(const int *off) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */ +#define __must_be_cstr(p) BUILD_BUG_ON_ZERO(__annotated(p, nonstring)) + /* * This returns a constant expression while determining if an argument is * a constant expression, most importantly without evaluating the argument. diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index f14c275950b5..1a957ea2f4fe 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -421,6 +421,13 @@ struct ftrace_likely_data { #define __member_size(p) __builtin_object_size(p, 1) #endif +/* Determine if an attribute has been applied to a variable. */ +#if __has_builtin(__builtin_has_attribute) +#define __annotated(var, attr) __builtin_has_attribute(var, attr) +#else +#define __annotated(var, attr) (false) +#endif + /* * Some versions of gcc do not mark 'asm goto' volatile: * diff --git a/include/linux/string.h b/include/linux/string.h index 9edace076ddb..95b3fc308f4f 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -76,12 +76,16 @@ ssize_t sized_strscpy(char *, const char *, size_t); * known size. */ #define __strscpy0(dst, src, ...) \ - sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst)) -#define __strscpy1(dst, src, size) sized_strscpy(dst, src, size) + sized_strscpy(dst, src, sizeof(dst) + __must_be_array(dst) + \ + __must_be_cstr(dst) + __must_be_cstr(src)) +#define __strscpy1(dst, src, size) \ + sized_strscpy(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src)) #define __strscpy_pad0(dst, src, ...) \ - sized_strscpy_pad(dst, src, sizeof(dst) + __must_be_array(dst)) -#define __strscpy_pad1(dst, src, size) sized_strscpy_pad(dst, src, size) + sized_strscpy_pad(dst, src, sizeof(dst) + __must_be_array(dst) + \ + __must_be_cstr(dst) + __must_be_cstr(src)) +#define __strscpy_pad1(dst, src, size) \ + sized_strscpy_pad(dst, src, size + __must_be_cstr(dst) + __must_be_cstr(src)) /** * strscpy - Copy a C-string into a sized buffer -- cgit v1.2.3 From 84429b675bcfd2a518ae167ee4661cdf7539aa7d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 22 Aug 2024 15:50:09 +0200 Subject: fs: Allow fine-grained control of folio sizes We need filesystems to be able to communicate acceptable folio sizes to the pagecache for a variety of uses (e.g. large block sizes). Support a range of folio sizes between order-0 and order-31. Signed-off-by: Matthew Wilcox (Oracle) Co-developed-by: Pankaj Raghav Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-2-kernel@pankajraghav.com Tested-by: David Howells Reviewed-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Reviewed-by: Daniel Gomez Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 90 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 76 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d9c7edb6422b..c60025bb584c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -204,14 +204,21 @@ enum mapping_flags { AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, - AS_LARGE_FOLIO_SUPPORT = 6, - AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ - AS_STABLE_WRITES, /* must wait for writeback before modifying + AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ + AS_STABLE_WRITES = 7, /* must wait for writeback before modifying folio contents */ - AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping, - including to move the mapping */ + AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ + /* Bits 16-25 are used for FOLIO_ORDER */ + AS_FOLIO_ORDER_BITS = 5, + AS_FOLIO_ORDER_MIN = 16, + AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, }; +#define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) +#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) +#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) +#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) + /** * mapping_set_error - record a writeback error in the address_space * @mapping: the mapping in which an error should be set @@ -367,9 +374,51 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) +/* + * mapping_set_folio_order_range() - Set the orders supported by a file. + * @mapping: The address space of the file. + * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). + * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). + * + * The filesystem should call this function in its inode constructor to + * indicate which base size (min) and maximum size (max) of folio the VFS + * can use to cache the contents of the file. This should only be used + * if the filesystem needs special handling of folio sizes (ie there is + * something the core cannot know). + * Do not tune it based on, eg, i_size. + * + * Context: This should not be called while the inode is active as it + * is non-atomic. + */ +static inline void mapping_set_folio_order_range(struct address_space *mapping, + unsigned int min, + unsigned int max) +{ + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return; + + if (min > MAX_PAGECACHE_ORDER) + min = MAX_PAGECACHE_ORDER; + + if (max > MAX_PAGECACHE_ORDER) + max = MAX_PAGECACHE_ORDER; + + if (max < min) + max = min; + + mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | + (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); +} + +static inline void mapping_set_folio_min_order(struct address_space *mapping, + unsigned int min) +{ + mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); +} + /** * mapping_set_large_folios() - Indicate the file supports large folios. - * @mapping: The file. + * @mapping: The address space of the file. * * The filesystem should call this function in its inode constructor to * indicate that the VFS can use large folios to cache the contents of @@ -380,7 +429,23 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) */ static inline void mapping_set_large_folios(struct address_space *mapping) { - __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); + mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); +} + +static inline unsigned int +mapping_max_folio_order(const struct address_space *mapping) +{ + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return 0; + return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; +} + +static inline unsigned int +mapping_min_folio_order(const struct address_space *mapping) +{ + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return 0; + return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; } /* @@ -389,20 +454,17 @@ static inline void mapping_set_large_folios(struct address_space *mapping) */ static inline bool mapping_large_folio_support(struct address_space *mapping) { - /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ + /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, "Anonymous mapping always supports large folio"); - return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); + return mapping_max_folio_order(mapping) > 0; } /* Return the maximum folio size for this pagecache mapping, in bytes. */ -static inline size_t mapping_max_folio_size(struct address_space *mapping) +static inline size_t mapping_max_folio_size(const struct address_space *mapping) { - if (mapping_large_folio_support(mapping)) - return PAGE_SIZE << MAX_PAGECACHE_ORDER; - return PAGE_SIZE; + return PAGE_SIZE << mapping_max_folio_order(mapping); } static inline int filemap_nr_thps(struct address_space *mapping) -- cgit v1.2.3 From ab95d23bab220ef845c0d422f49452a475330eaf Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:10 +0200 Subject: filemap: allocate mapping_min_order folios in the page cache filemap_create_folio() and do_read_cache_folio() were always allocating folio of order 0. __filemap_get_folio was trying to allocate higher order folios when fgp_flags had higher order hint set but it will default to order 0 folio if higher order memory allocation fails. Supporting mapping_min_order implies that we guarantee each folio in the page cache has at least an order of mapping_min_order. When adding new folios to the page cache we must also ensure the index used is aligned to the mapping_min_order as the page cache requires the index to be aligned to the order of the folio. Co-developed-by: Luis Chamberlain Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240822135018.1931258-3-kernel@pankajraghav.com Tested-by: David Howells Reviewed-by: Hannes Reinecke Reviewed-by: Darrick J. Wong Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Daniel Gomez Reviewed-by: Dave Chinner Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c60025bb584c..4cc170949e9c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -448,6 +448,26 @@ mapping_min_folio_order(const struct address_space *mapping) return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; } +static inline unsigned long +mapping_min_folio_nrpages(struct address_space *mapping) +{ + return 1UL << mapping_min_folio_order(mapping); +} + +/** + * mapping_align_index() - Align index for this mapping. + * @mapping: The address_space. + * + * The index of a folio must be naturally aligned. If you are adding a + * new folio to the page cache and need to know what index to give it, + * call this function. + */ +static inline pgoff_t mapping_align_index(struct address_space *mapping, + pgoff_t index) +{ + return round_down(index, mapping_min_folio_nrpages(mapping)); +} + /* * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. -- cgit v1.2.3 From 3849687869092094003ba009dc00e2e0237a3b8a Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 21 Aug 2024 17:09:55 +0200 Subject: net: phy: Introduce ethernet link topology representation Link topologies containing multiple network PHYs attached to the same net_device can be found when using a PHY as a media converter for use with an SFP connector, on which an SFP transceiver containing a PHY can be used. With the current model, the transceiver's PHY can't be used for operations such as cable testing, timestamping, macsec offload, etc. The reason being that most of the logic for these configuration, coming from either ethtool netlink or ioctls tend to use netdev->phydev, which in multi-phy systems will reference the PHY closest to the MAC. Introduce a numbering scheme allowing to enumerate PHY devices that belong to any netdev, which can in turn allow userspace to take more precise decisions with regard to each PHY's configuration. The numbering is maintained per-netdev, in a phy_device_list. The numbering works similarly to a netdevice's ifindex, with identifiers that are only recycled once INT_MAX has been reached. This prevents races that could occur between PHY listing and SFP transceiver removal/insertion. The identifiers are assigned at phy_attach time, as the numbering depends on the netdevice the phy is attached to. The PHY index can be re-used for PHYs that are persistent. Signed-off-by: Maxime Chevallier Reviewed-by: Christophe Leroy Tested-by: Christophe Leroy Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +- include/linux/phy.h | 4 ++ include/linux/phy_link_topology.h | 82 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 include/linux/phy_link_topology.h (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 614ec5d3d75b..289a6133769c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -40,7 +40,6 @@ #include #endif #include - #include #include #include @@ -81,6 +80,7 @@ struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; struct ethtool_netdev_state; +struct phy_link_topology; typedef u32 xdp_features_t; @@ -1951,6 +1951,7 @@ enum netdev_reg_state { * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one + * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. @@ -2342,6 +2343,7 @@ struct net_device { #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif + struct phy_link_topology *link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; diff --git a/include/linux/phy.h b/include/linux/phy.h index 6b7d40d49129..3b0f4bf80650 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -554,6 +554,9 @@ struct macsec_ops; * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. + * @phyindex: Unique id across the phy's parent tree of phys to address the PHY + * from userspace, similar to ifindex. A zero index means the PHY + * wasn't assigned an id yet. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -656,6 +659,7 @@ struct phy_device { struct device_link *devlink; + u32 phyindex; u32 phy_id; struct phy_c45_device_ids c45_ids; diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h new file mode 100644 index 000000000000..68a59e25821c --- /dev/null +++ b/include/linux/phy_link_topology.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PHY device list allow maintaining a list of PHY devices that are + * part of a netdevice's link topology. PHYs can for example be chained, + * as is the case when using a PHY that exposes an SFP module, on which an + * SFP transceiver that embeds a PHY is connected. + * + * This list can then be used by userspace to leverage individual PHY + * capabilities. + */ +#ifndef __PHY_LINK_TOPOLOGY_H +#define __PHY_LINK_TOPOLOGY_H + +#include +#include + +struct xarray; +struct phy_device; +struct sfp_bus; + +struct phy_link_topology { + struct xarray phys; + u32 next_phy_index; +}; + +struct phy_device_node { + enum phy_upstream upstream_type; + + union { + struct net_device *netdev; + struct phy_device *phydev; + } upstream; + + struct sfp_bus *parent_sfp_bus; + + struct phy_device *phy; +}; + +#if IS_ENABLED(CONFIG_PHYLIB) +int phy_link_topo_add_phy(struct net_device *dev, + struct phy_device *phy, + enum phy_upstream upt, void *upstream); + +void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy); + +static inline struct phy_device * +phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) +{ + struct phy_link_topology *topo = dev->link_topo; + struct phy_device_node *pdn; + + if (!topo) + return NULL; + + pdn = xa_load(&topo->phys, phyindex); + if (pdn) + return pdn->phy; + + return NULL; +} + +#else +static inline int phy_link_topo_add_phy(struct net_device *dev, + struct phy_device *phy, + enum phy_upstream upt, void *upstream) +{ + return 0; +} + +static inline void phy_link_topo_del_phy(struct net_device *dev, + struct phy_device *phy) +{ +} + +static inline struct phy_device * +phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) +{ + return NULL; +} +#endif + +#endif /* __PHY_LINK_TOPOLOGY_H */ -- cgit v1.2.3 From 4d76f115ab9121f4458330da962ae9ef5430e60b Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 21 Aug 2024 17:09:56 +0200 Subject: net: sfp: pass the phy_device when disconnecting an sfp module's PHY Pass the phy_device as a parameter to the sfp upstream .disconnect_phy operation. This is preparatory work to help track phy devices across a net_device's link. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Reviewed-by: Christophe Leroy Tested-by: Christophe Leroy Signed-off-by: David S. Miller --- include/linux/sfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index b14be59550e3..54abb4d22b2e 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -550,7 +550,7 @@ struct sfp_upstream_ops { void (*link_down)(void *priv); void (*link_up)(void *priv); int (*connect_phy)(void *priv, struct phy_device *); - void (*disconnect_phy)(void *priv); + void (*disconnect_phy)(void *priv, struct phy_device *); }; #if IS_ENABLED(CONFIG_SFP) -- cgit v1.2.3 From b2db6f4ace72e71fa09b8d1354f8ac9854140d74 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 21 Aug 2024 17:09:57 +0200 Subject: net: phy: add helpers to handle sfp phy connect/disconnect There are a few PHY drivers that can handle SFP modules through their sfp_upstream_ops. Introduce Phylib helpers to keep track of connected SFP PHYs in a netdevice's namespace, by adding the SFP PHY to the upstream PHY's netdev's namespace. By doing so, these SFP PHYs can be enumerated and exposed to users, which will be able to use their capabilities. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Reviewed-by: Christophe Leroy Tested-by: Christophe Leroy Signed-off-by: David S. Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3b0f4bf80650..a98bc91a0cde 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1781,6 +1781,8 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); +int phy_sfp_connect_phy(void *upstream, struct phy_device *phy); +void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, -- cgit v1.2.3 From 0a2f7de0f3b96c4a30e56d2c79f8d812f89599a1 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 21 Aug 2024 17:09:58 +0200 Subject: net: sfp: Add helper to return the SFP bus name Knowing the bus name is helpful when we want to expose the link topology to userspace, add a helper to return the SFP bus name. This call will always be made while holding the RTNL which ensures that the SFP driver won't unbind from the device. The returned pointer to the bus name will only be used while RTNL is held. Signed-off-by: Maxime Chevallier Suggested-by: "Russell King (Oracle)" Reviewed-by: Christophe Leroy Tested-by: Christophe Leroy Signed-off-by: David S. Miller --- include/linux/sfp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 54abb4d22b2e..60c65cea74f6 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -576,6 +576,7 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode); int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, const struct sfp_upstream_ops *ops); void sfp_bus_del_upstream(struct sfp_bus *bus); +const char *sfp_get_name(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, @@ -654,6 +655,11 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, static inline void sfp_bus_del_upstream(struct sfp_bus *bus) { } + +static inline const char *sfp_get_name(struct sfp_bus *bus) +{ + return NULL; +} #endif #endif -- cgit v1.2.3 From c579286a514d88a3f0d3bdabfd4d88737b33cb17 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 19 Aug 2024 18:31:33 +0200 Subject: thermal: core: Drop unused bind/unbind functions and callbacks There are no more callers of thermal_zone_bind_cooling_device() and thermal_zone_unbind_cooling_device(), so drop them along with all of the corresponding headers, code and documentation. Moreover, because the .bind() and .unbind() thermal zone callbacks would only be used when the above functions, respectively, were called, drop them as well along with all of the code related to them. Signed-off-by: Rafael J. Wysocki Reviewed-by: Zhang Rui Acked-by: Huisong Li Reviewed-by: Daniel Lezcano Link: https://patch.msgid.link/4251116.1IzOArtZ34@rjwysocki.net --- include/linux/thermal.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 6599a26847f7..25ea8fe2313e 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -92,10 +92,6 @@ struct cooling_spec { }; struct thermal_zone_device_ops { - int (*bind) (struct thermal_zone_device *, - struct thermal_cooling_device *); - int (*unbind) (struct thermal_zone_device *, - struct thermal_cooling_device *); bool (*should_bind) (struct thermal_zone_device *, const struct thermal_trip *, struct thermal_cooling_device *, @@ -247,12 +243,6 @@ const char *thermal_zone_device_type(struct thermal_zone_device *tzd); int thermal_zone_device_id(struct thermal_zone_device *tzd); struct device *thermal_zone_device(struct thermal_zone_device *tzd); -int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, - struct thermal_cooling_device *, - unsigned long, unsigned long, - unsigned int); -int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int, - struct thermal_cooling_device *); void thermal_zone_device_update(struct thermal_zone_device *, enum thermal_notify_event); -- cgit v1.2.3 From ccce71013406a0a3c81850dab940f07b112349d3 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 14 Aug 2024 14:21:18 +0200 Subject: mtd: rawnand: davinci: make platform_data private There are no longer any users of the platform data for davinci rawnand in board files. We can remove the public pdata headers and move the structures that are still used into the driver compilation unit while removing the rest. Signed-off-by: Bartosz Golaszewski Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20240814122120.13975-1-brgl@bgdev.pl --- include/linux/platform_data/mtd-davinci-aemif.h | 36 ---------- include/linux/platform_data/mtd-davinci.h | 88 ------------------------- 2 files changed, 124 deletions(-) delete mode 100644 include/linux/platform_data/mtd-davinci-aemif.h delete mode 100644 include/linux/platform_data/mtd-davinci.h (limited to 'include/linux') diff --git a/include/linux/platform_data/mtd-davinci-aemif.h b/include/linux/platform_data/mtd-davinci-aemif.h deleted file mode 100644 index a49826214a39..000000000000 --- a/include/linux/platform_data/mtd-davinci-aemif.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * TI DaVinci AEMIF support - * - * Copyright 2010 (C) Texas Instruments, Inc. https://www.ti.com/ - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. - */ -#ifndef _MACH_DAVINCI_AEMIF_H -#define _MACH_DAVINCI_AEMIF_H - -#include - -#define NRCSR_OFFSET 0x00 -#define AWCCR_OFFSET 0x04 -#define A1CR_OFFSET 0x10 - -#define ACR_ASIZE_MASK 0x3 -#define ACR_EW_MASK BIT(30) -#define ACR_SS_MASK BIT(31) - -/* All timings in nanoseconds */ -struct davinci_aemif_timing { - u8 wsetup; - u8 wstrobe; - u8 whold; - - u8 rsetup; - u8 rstrobe; - u8 rhold; - - u8 ta; -}; - -#endif diff --git a/include/linux/platform_data/mtd-davinci.h b/include/linux/platform_data/mtd-davinci.h deleted file mode 100644 index dd474dd44848..000000000000 --- a/include/linux/platform_data/mtd-davinci.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * mach-davinci/nand.h - * - * Copyright © 2006 Texas Instruments. - * - * Ported to 2.6.23 Copyright © 2008 by - * Sander Huijsen - * Troy Kisky - * Dirk Behme - * - * -------------------------------------------------------------------------- - */ - -#ifndef __ARCH_ARM_DAVINCI_NAND_H -#define __ARCH_ARM_DAVINCI_NAND_H - -#include - -#define NANDFCR_OFFSET 0x60 -#define NANDFSR_OFFSET 0x64 -#define NANDF1ECC_OFFSET 0x70 - -/* 4-bit ECC syndrome registers */ -#define NAND_4BIT_ECC_LOAD_OFFSET 0xbc -#define NAND_4BIT_ECC1_OFFSET 0xc0 -#define NAND_4BIT_ECC2_OFFSET 0xc4 -#define NAND_4BIT_ECC3_OFFSET 0xc8 -#define NAND_4BIT_ECC4_OFFSET 0xcc -#define NAND_ERR_ADD1_OFFSET 0xd0 -#define NAND_ERR_ADD2_OFFSET 0xd4 -#define NAND_ERR_ERRVAL1_OFFSET 0xd8 -#define NAND_ERR_ERRVAL2_OFFSET 0xdc - -/* NOTE: boards don't need to use these address bits - * for ALE/CLE unless they support booting from NAND. - * They're used unless platform data overrides them. - */ -#define MASK_ALE 0x08 -#define MASK_CLE 0x10 - -struct davinci_nand_pdata { /* platform_data */ - uint32_t mask_ale; - uint32_t mask_cle; - - /* - * 0-indexed chip-select number of the asynchronous - * interface to which the NAND device has been connected. - * - * So, if you have NAND connected to CS3 of DA850, you - * will pass '1' here. Since the asynchronous interface - * on DA850 starts from CS2. - */ - uint32_t core_chipsel; - - /* for packages using two chipselects */ - uint32_t mask_chipsel; - - /* board's default static partition info */ - struct mtd_partition *parts; - unsigned nr_parts; - - /* none == NAND_ECC_ENGINE_TYPE_NONE (strongly *not* advised!!) - * soft == NAND_ECC_ENGINE_TYPE_SOFT - * else == NAND_ECC_ENGINE_TYPE_ON_HOST, according to ecc_bits - * - * All DaVinci-family chips support 1-bit hardware ECC. - * Newer ones also support 4-bit ECC, but are awkward - * using it with large page chips. - */ - enum nand_ecc_engine_type engine_type; - enum nand_ecc_placement ecc_placement; - u8 ecc_bits; - - /* e.g. NAND_BUSWIDTH_16 */ - unsigned options; - /* e.g. NAND_BBT_USE_FLASH */ - unsigned bbt_options; - - /* Main and mirror bbt descriptor overrides */ - struct nand_bbt_descr *bbt_td; - struct nand_bbt_descr *bbt_md; - - /* Access timings */ - struct davinci_aemif_timing *timing; -}; - -#endif /* __ARCH_ARM_DAVINCI_NAND_H */ -- cgit v1.2.3 From 8a48281cfa7f380707d42b649acda3ea36348697 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Fri, 23 Aug 2024 15:42:01 +0800 Subject: PCI: Make pci_bus_type constant Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the pci_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Link: https://lore.kernel.org/r/20240823074202.139265-1-kunwu.chan@linux.dev Suggested-by: Greg Kroah-Hartman Signed-off-by: Kunwu Chan Signed-off-by: Bjorn Helgaas Cc: Greg Kroah-Hartman --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4cf89a4b4cbc..197292b336a5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1098,7 +1098,7 @@ enum pcie_bus_config_types { extern enum pcie_bus_config_types pcie_bus_config; -extern struct bus_type pci_bus_type; +extern const struct bus_type pci_bus_type; /* Do NOT directly access these two variables, unless you are arch-specific PCI * code, or PCI core code. */ -- cgit v1.2.3 From 9246b487ab3c3b5993aae7552b7a4c541cc14a49 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Fri, 23 Aug 2024 17:57:08 +0800 Subject: PCI: Add function 0 DMA alias quirk for Glenfly Arise chip Add DMA support for audio function of Glenfly Arise chip, which uses Requester ID of function 0. Link: https://lore.kernel.org/r/CA2BBD087345B6D1+20240823095708.3237375-1-wangyuli@uniontech.com Signed-off-by: SiyuLi Signed-off-by: WangYuli [bhelgaas: lower-case hex to match local code, drop unused Device IDs] Signed-off-by: Bjorn Helgaas Reviewed-by: Takashi Iwai --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index e388c8b1cbc2..2c94d4004dd5 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2661,6 +2661,8 @@ #define PCI_DEVICE_ID_DCI_PCCOM8 0x0002 #define PCI_DEVICE_ID_DCI_PCCOM2 0x0004 +#define PCI_VENDOR_ID_GLENFLY 0x6766 + #define PCI_VENDOR_ID_INTEL 0x8086 #define PCI_DEVICE_ID_INTEL_EESSC 0x0008 #define PCI_DEVICE_ID_INTEL_HDA_CML_LP 0x02c8 -- cgit v1.2.3 From d59232afb0344e33e9399f308d9b4a03876e7676 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 13 Aug 2024 21:24:22 +0000 Subject: bpf: Rename ARG_PTR_TO_KPTR -> ARG_KPTR_XCHG_DEST ARG_PTR_TO_KPTR is currently only used by the bpf_kptr_xchg helper. Although it limits reg types for that helper's first arg to PTR_TO_MAP_VALUE, any arbitrary mapval won't do: further custom verification logic ensures that the mapval reg being xchgd-into is pointing to a kptr field. If this is not the case, it's not safe to xchg into that reg's pointee. Let's rename the bpf_arg_type to more accurately describe the fairly specific expectations that this arg type encodes. This is a nonfunctional change. Acked-by: Martin KaFai Lau Signed-off-by: Dave Marchevsky Signed-off-by: Amery Hung Link: https://lore.kernel.org/r/20240813212424.2871455-4-amery.hung@bytedance.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 00dc4dd28cbd..dc63083f76b7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -744,7 +744,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ - ARG_PTR_TO_KPTR, /* pointer to referenced kptr */ + ARG_KPTR_XCHG_DEST, /* pointer to destination that kptrs are bpf_kptr_xchg'd into */ ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */ __BPF_ARG_TYPE_MAX, -- cgit v1.2.3 From 9e83dd3ebb14fadccb936308b7b101c75da76324 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 23 Aug 2024 18:39:34 +0800 Subject: irqchip/loongson-eiointc: Rename CPUHP_AP_IRQ_LOONGARCH_STARTING Rename CPUHP_AP_IRQ_LOONGARCH_STARTING to CPUHP_AP_IRQ_EIOINTC_STARTING because the upcoming AVECINTC irqchip driver will introduce a new state and so both are clearly identifiable. Signed-off-by: Huacai Chen Signed-off-by: Tianyang Zhang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240823103936.25092-3-zhangtianyang@loongson.cn --- include/linux/cpuhotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 51ba681b915a..e49807f7805c 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -145,7 +145,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_IRQ_MIPS_GIC_STARTING, - CPUHP_AP_IRQ_LOONGARCH_STARTING, + CPUHP_AP_IRQ_EIOINTC_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, CPUHP_AP_IRQ_RISCV_IMSIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, -- cgit v1.2.3 From ae16f05c928a1336d5d9d19fd805d7bf29c3f0c8 Mon Sep 17 00:00:00 2001 From: Tianyang Zhang Date: Fri, 23 Aug 2024 18:43:37 +0800 Subject: irqchip/loongarch-avec: Add AVEC irqchip support Introduce the advanced extended interrupt controllers (AVECINTC). This feature will allow each core to have 256 independent interrupt vectors and MSI interrupts can be independently routed to any vector on any CPU. The whole topology of irqchips in LoongArch machines looks like this if AVECINTC is supported: +-----+ +-----------------------+ +-------+ | IPI | --> | CPUINTC | <-- | Timer | +-----+ +-----------------------+ +-------+ ^ ^ ^ | | | +---------+ +----------+ +---------+ +-------+ | EIOINTC | | AVECINTC | | LIOINTC | <-- | UARTs | +---------+ +----------+ +---------+ +-------+ ^ ^ | | +---------+ +---------+ | PCH-PIC | | PCH-MSI | +---------+ +---------+ ^ ^ ^ | | | +---------+ +---------+ +---------+ | Devices | | PCH-LPC | | Devices | +---------+ +---------+ +---------+ ^ | +---------+ | Devices | +---------+ Co-developed-by: Jianmin Lv Signed-off-by: Jianmin Lv Co-developed-by: Liupu Wang Signed-off-by: Liupu Wang Co-developed-by: Huacai Chen Signed-off-by: Huacai Chen Signed-off-by: Tianyang Zhang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240823104337.25577-2-zhangtianyang@loongson.cn --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index e49807f7805c..55a726d317d4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -146,6 +146,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_IRQ_MIPS_GIC_STARTING, CPUHP_AP_IRQ_EIOINTC_STARTING, + CPUHP_AP_IRQ_AVECINTC_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, CPUHP_AP_IRQ_RISCV_IMSIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, -- cgit v1.2.3 From 17e28a9aeae40d2de3c1ea3b94819ed94bfd6392 Mon Sep 17 00:00:00 2001 From: Costa Shulyupin Date: Thu, 22 Aug 2024 15:31:58 +0300 Subject: genirq: Fix typo in struct comment Remove redundant "e" in "assign(e)ments". Signed-off-by: Costa Shulyupin Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240822123205.2186221-1-costa.shul@redhat.com --- include/linux/interrupt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 694de61e0b38..457151f9f263 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -276,7 +276,7 @@ struct irq_affinity_notify { #define IRQ_AFFINITY_MAX_SETS 4 /** - * struct irq_affinity - Description for automatic irq affinity assignements + * struct irq_affinity - Description for automatic irq affinity assignments * @pre_vectors: Don't apply affinity to @pre_vectors at beginning of * the MSI(-X) vector space * @post_vectors: Don't apply affinity to @post_vectors at end of -- cgit v1.2.3 From 7d3aed652d090508990d245f9d80dcc481910d02 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 22 Aug 2024 05:51:54 +0000 Subject: net: refactor ->ndo_bpf calls into dev_xdp_propagate When net devices propagate xdp configurations to slave devices, we will need to perform a memory provider check to ensure we're not binding xdp to a device using unreadable netmem. Currently the ->ndo_bpf calls in a few places. Adding checks to all these places would not be ideal. Refactor all the ->ndo_bpf calls into one place where we can add this check in the future. Suggested-by: Jakub Kicinski Signed-off-by: Mina Almasry Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 289a6133769c..7dfa836d9dbf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3925,6 +3925,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u8 dev_xdp_prog_count(struct net_device *dev); +int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); -- cgit v1.2.3 From 2b8e976b984278edbeab3251d370e76d237699f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Aug 2024 15:18:14 +0100 Subject: io_uring: user registered clockid for wait timeouts Add a new registration opcode IORING_REGISTER_CLOCK, which allows the user to select which clock id it wants to use with CQ waiting timeouts. It only allows a subset of all posix clocks and currently supports CLOCK_MONOTONIC and CLOCK_BOOTTIME. Suggested-by: Lewis Baker Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/98f2bc8a3c36cdf8f0e6a275245e81e903459703.1723039801.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3315005df117..4b9ba523978d 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -239,6 +239,9 @@ struct io_ring_ctx { struct io_rings *rings; struct percpu_ref refs; + clockid_t clockid; + enum tk_offsets clock_offset; + enum task_work_notify_mode notify_method; unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 1e9046e3a154608f63ce79edcb01e6afd6b10c7c Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Wed, 14 Aug 2024 17:35:55 +0200 Subject: rpmb: add Replay Protected Memory Block (RPMB) subsystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A number of storage technologies support a specialised hardware partition designed to be resistant to replay attacks. The underlying HW protocols differ but the operations are common. The RPMB partition cannot be accessed via standard block layer, but by a set of specific RPMB commands. Such a partition provides authenticated and replay protected access, hence suitable as a secure storage. The initial aim of this patch is to provide a simple RPMB driver interface which can be accessed by the optee driver to facilitate early RPMB access to OP-TEE OS (secure OS) during the boot time. A TEE device driver can claim the RPMB interface, for example, via rpmb_interface_register() or rpmb_dev_find_device(). The RPMB driver provides a callback to route RPMB frames to the RPMB device accessible via rpmb_route_frames(). The detailed operation of implementing the access is left to the TEE device driver itself. Signed-off-by: Tomas Winkler Signed-off-by: Alex Bennée Signed-off-by: Shyam Saini Signed-off-by: Jens Wiklander Reviewed-by: Linus Walleij Tested-by: Manuel Traut Reviewed-by: Ulf Hansson Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240814153558.708365-2-jens.wiklander@linaro.org Signed-off-by: Ulf Hansson --- include/linux/rpmb.h | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 include/linux/rpmb.h (limited to 'include/linux') diff --git a/include/linux/rpmb.h b/include/linux/rpmb.h new file mode 100644 index 000000000000..cccda73eea4d --- /dev/null +++ b/include/linux/rpmb.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2015-2019 Intel Corp. All rights reserved + * Copyright (C) 2021-2022 Linaro Ltd + */ +#ifndef __RPMB_H__ +#define __RPMB_H__ + +#include +#include + +/** + * enum rpmb_type - type of underlying storage technology + * + * @RPMB_TYPE_EMMC : emmc (JESD84-B50.1) + * @RPMB_TYPE_UFS : UFS (JESD220) + * @RPMB_TYPE_NVME : NVM Express + */ +enum rpmb_type { + RPMB_TYPE_EMMC, + RPMB_TYPE_UFS, + RPMB_TYPE_NVME, +}; + +/** + * struct rpmb_descr - RPMB description provided by the underlying block device + * + * @type : block device type + * @route_frames : routes frames to and from the RPMB device + * @dev_id : unique device identifier read from the hardware + * @dev_id_len : length of unique device identifier + * @reliable_wr_count: number of sectors that can be written in one access + * @capacity : capacity of the device in units of 128K + * + * @dev_id is intended to be used as input when deriving the authenticaion key. + */ +struct rpmb_descr { + enum rpmb_type type; + int (*route_frames)(struct device *dev, u8 *req, unsigned int req_len, + u8 *resp, unsigned int resp_len); + u8 *dev_id; + size_t dev_id_len; + u16 reliable_wr_count; + u16 capacity; +}; + +/** + * struct rpmb_dev - device which can support RPMB partition + * + * @dev : device + * @id : device_id + * @list_node : linked list node + * @descr : RPMB description + */ +struct rpmb_dev { + struct device dev; + int id; + struct list_head list_node; + struct rpmb_descr descr; +}; + +#define to_rpmb_dev(x) container_of((x), struct rpmb_dev, dev) + +#if IS_ENABLED(CONFIG_RPMB) +struct rpmb_dev *rpmb_dev_get(struct rpmb_dev *rdev); +void rpmb_dev_put(struct rpmb_dev *rdev); +struct rpmb_dev *rpmb_dev_find_device(const void *data, + const struct rpmb_dev *start, + int (*match)(struct device *dev, + const void *data)); +int rpmb_interface_register(struct class_interface *intf); +void rpmb_interface_unregister(struct class_interface *intf); +struct rpmb_dev *rpmb_dev_register(struct device *dev, + struct rpmb_descr *descr); +int rpmb_dev_unregister(struct rpmb_dev *rdev); + +int rpmb_route_frames(struct rpmb_dev *rdev, u8 *req, + unsigned int req_len, u8 *resp, unsigned int resp_len); + +#else +static inline struct rpmb_dev *rpmb_dev_get(struct rpmb_dev *rdev) +{ + return NULL; +} + +static inline void rpmb_dev_put(struct rpmb_dev *rdev) { } + +static inline struct rpmb_dev * +rpmb_dev_find_device(const void *data, const struct rpmb_dev *start, + int (*match)(struct device *dev, const void *data)) +{ + return NULL; +} + +static inline int rpmb_interface_register(struct class_interface *intf) +{ + return -EOPNOTSUPP; +} + +static inline void rpmb_interface_unregister(struct class_interface *intf) +{ +} + +static inline struct rpmb_dev * +rpmb_dev_register(struct device *dev, struct rpmb_descr *descr) +{ + return NULL; +} + +static inline int rpmb_dev_unregister(struct rpmb_dev *dev) +{ + return 0; +} + +static inline int rpmb_route_frames(struct rpmb_dev *rdev, u8 *req, + unsigned int req_len, u8 *resp, + unsigned int resp_len) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_RPMB */ + +#endif /* __RPMB_H__ */ -- cgit v1.2.3 From c30b855e814d9094e369a19fbd86c9bb5badc154 Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Wed, 14 Aug 2024 17:35:57 +0200 Subject: tee: add tee_device_set_dev_groups() Add tee_device_set_dev_groups() to TEE drivers to supply driver specific attribute groups. The class specific attributes are from now on added via the tee_class, which currently only consist of implementation_id. Signed-off-by: Jens Wiklander Reviewed-by: Sumit Garg Link: https://lore.kernel.org/r/20240814153558.708365-4-jens.wiklander@linaro.org Signed-off-by: Ulf Hansson --- include/linux/tee_core.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index efd16ed52315..a38494d6b5f4 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -154,6 +154,18 @@ int tee_device_register(struct tee_device *teedev); */ void tee_device_unregister(struct tee_device *teedev); +/** + * tee_device_set_dev_groups() - Set device attribute groups + * @teedev: Device to register + * @dev_groups: Attribute groups + * + * Assigns the provided @dev_groups to the @teedev to be registered later + * with tee_device_register(). Calling this function is optional, but if + * it's called it must be called before tee_device_register(). + */ +void tee_device_set_dev_groups(struct tee_device *teedev, + const struct attribute_group **dev_groups); + /** * tee_session_calc_client_uuid() - Calculates client UUID for session * @uuid: Resulting UUID -- cgit v1.2.3 From d3596c73011d1b0e9b43ae12f1e0f491d6bb96eb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 21 Aug 2024 15:24:06 +0200 Subject: mmc: core: Remove struct mmc_context_info The 'mmc_context_info' structure is unused. It has been introduced in: - commit 2220eedfd7ae ("mmc: fix async request mechanism for sequential read scenarios") in 2013-02 and its usages have been removed in: - commit 126b62700386 ("mmc: core: Remove code no longer needed after the switch to blk-mq") - commit 0fbfd1251830 ("mmc: block: Remove code no longer needed after the switch to blk-mq") in 2017-12. Now remove this unused structure. Signed-off-by: Christophe JAILLET Reviewed-by: Vladimir Zapolskiy Link: https://lore.kernel.org/r/232106a8a6a374dee25feea9b94498361568c10b.1724246389.git.christophe.jaillet@wanadoo.fr Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 88c6a76042ee..f85df7d045ca 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -291,20 +291,6 @@ struct mmc_slot { void *handler_priv; }; -/** - * mmc_context_info - synchronization details for mmc context - * @is_done_rcv wake up reason was done request - * @is_new_req wake up reason was new request - * @is_waiting_last_req mmc context waiting for single running request - * @wait wait queue - */ -struct mmc_context_info { - bool is_done_rcv; - bool is_new_req; - bool is_waiting_last_req; - wait_queue_head_t wait; -}; - struct regulator; struct mmc_pwrseq; -- cgit v1.2.3 From f5e1638bf3c785600e2ab53e5532007af5296581 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Sat, 24 Aug 2024 01:59:17 +0300 Subject: mmc: core: remove left-over data structure declarations The last users of 'enum mmc_blk_status' and 'struct mmc_async_req' were removed by commit 126b62700386 ("mmc: core: Remove code no longer needed after the switch to blk-mq") in 2017, remove these two left-over data structures. Signed-off-by: Vladimir Zapolskiy Link: https://lore.kernel.org/r/20240823225917.2826156-1-vladimir.zapolskiy@linaro.org Signed-off-by: Ulf Hansson --- include/linux/mmc/core.h | 12 ------------ include/linux/mmc/host.h | 10 ---------- 2 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index 2c7928a50907..f0ac2e469b32 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -11,18 +11,6 @@ struct mmc_data; struct mmc_request; -enum mmc_blk_status { - MMC_BLK_SUCCESS = 0, - MMC_BLK_PARTIAL, - MMC_BLK_CMD_ERR, - MMC_BLK_RETRY, - MMC_BLK_ABORT, - MMC_BLK_DATA_ERR, - MMC_BLK_ECC_ERR, - MMC_BLK_NOMEDIUM, - MMC_BLK_NEW_REQUEST, -}; - struct mmc_command { u32 opcode; u32 arg; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index f85df7d045ca..ac01cd1622ef 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -264,16 +264,6 @@ struct mmc_cqe_ops { void (*cqe_recovery_finish)(struct mmc_host *host); }; -struct mmc_async_req { - /* active mmc request */ - struct mmc_request *mrq; - /* - * Check error status of completed mmc request. - * Returns 0 if success otherwise non zero. - */ - enum mmc_blk_status (*err_check)(struct mmc_card *, struct mmc_async_req *); -}; - /** * struct mmc_slot - MMC slot functions * -- cgit v1.2.3 From 8d3cefaf659265aa82b0373a563fdb9d16a2b947 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 15 Aug 2024 21:44:50 +0200 Subject: i2c: core: Lock address during client device instantiation Krzysztof reported an issue [0] which is caused by parallel attempts to instantiate the same I2C client device. This can happen if driver supports auto-detection, but certain devices are also instantiated explicitly. The original change isn't actually wrong, it just revealed that I2C core isn't prepared yet to handle this scenario. Calls to i2c_new_client_device() can be nested, therefore we can't use a simple mutex here. Parallel instantiation of devices at different addresses is ok, so we just have to prevent parallel instantiation at the same address. We can use a bitmap with one bit per 7-bit I2C client address, and atomic bit operations to set/check/clear bits. Now a parallel attempt to instantiate a device at the same address will result in -EBUSY being returned, avoiding the "sysfs: cannot create duplicate filename" splash. Note: This patch version includes small cosmetic changes to the Tested-by version, only functional change is that address locking is supported for slave addresses too. [0] https://lore.kernel.org/linux-i2c/9479fe4e-eb0c-407e-84c0-bd60c15baf74@ans.pl/T/#m12706546e8e2414d8f1a0dc61c53393f731685cc Fixes: caba40ec3531 ("eeprom: at24: Probe for DDR3 thermal sensor in the SPD case") Cc: stable@vger.kernel.org Tested-by: Krzysztof Piotr Oledzki Signed-off-by: Heiner Kallweit Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 7eedd0c662da..8d79440cd8ce 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -761,6 +761,9 @@ struct i2c_adapter { struct regulator *bus_regulator; struct dentry *debugfs; + + /* 7bit address space */ + DECLARE_BITMAP(addrs_in_instantiation, 1 << 7); }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) -- cgit v1.2.3 From af8a6e65f42c6c89414017cc4d78403e83056172 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 3 May 2024 12:10:52 +0200 Subject: USB: serial: set driver owner when registering drivers Modules registering drivers with usb_serial_register_drivers() might forget to set .owner field. The field is used by some of other kernel parts for reference counting (try_module_get()), so it is expected that drivers will set it. Solve the problem by moving this task away from the drivers to the core USB serial code, just like we did for platform_driver in commit 9447057eaff8 ("platform_device: use a macro instead of platform_driver_register"). Signed-off-by: Krzysztof Kozlowski [ johan: amend commit summary ] Signed-off-by: Johan Hovold --- include/linux/usb/serial.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 1a0a4dc87980..75b2b763f1ba 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -311,8 +311,11 @@ struct usb_serial_driver { #define to_usb_serial_driver(d) \ container_of(d, struct usb_serial_driver, driver) -int usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[], - const char *name, const struct usb_device_id *id_table); +#define usb_serial_register_drivers(serial_drivers, name, id_table) \ + __usb_serial_register_drivers(serial_drivers, THIS_MODULE, name, id_table) +int __usb_serial_register_drivers(struct usb_serial_driver *const serial_drivers[], + struct module *owner, const char *name, + const struct usb_device_id *id_table); void usb_serial_deregister_drivers(struct usb_serial_driver *const serial_drivers[]); void usb_serial_port_softint(struct usb_serial_port *port); -- cgit v1.2.3 From 1d4684fbe88dc28e2bf79f5e94a432f0469d2dac Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 2 Aug 2024 17:32:02 -0700 Subject: iommufd: Reorder include files Reorder include files to alphabetic order to simplify maintenance, and separate local headers and global headers with a blank line. No functional change intended. Link: https://patch.msgid.link/r/7524b037cc05afe19db3c18f863253e1d1554fa2.1722644866.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index ffc3a949f837..c2f2f6b9148e 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -6,9 +6,9 @@ #ifndef __LINUX_IOMMUFD_H #define __LINUX_IOMMUFD_H -#include -#include #include +#include +#include struct device; struct iommufd_device; -- cgit v1.2.3 From 03c3d7c74371a46d967fbf41628874ec04ddda96 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 15 Aug 2024 22:11:31 +0200 Subject: nvme-rdma: send cntlid in the RDMA_CM_REQUEST Private Data When sending a RDMA_CM_REQUEST, the NVMe RDMA Transport Specification allows you to populate the cntlid field in the RDMA_CM_REQUEST Private Data. The cntlid is returned by the target on completion of the first RDMA_CM_REQUEST command (which creates the admin queue). The cntlid field can then be populated by the host when the I/O queues are created (using additional RDMA_CM_REQUEST commands), such that the target can perform extra validation for additional RDMA_CM_REQUEST commands. This additional error code and error message is also added, such that nvme_rdma_cm_msg() will display the proper error message if the target fails the RDMA_CM_REQUEST command because of this extra validation. Signed-off-by: Niklas Cassel Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme-rdma.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index eb2f04d636c8..97c5f00b9aa3 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -25,6 +25,7 @@ enum nvme_rdma_cm_status { NVME_RDMA_CM_NO_RSC = 0x06, NVME_RDMA_CM_INVALID_IRD = 0x07, NVME_RDMA_CM_INVALID_ORD = 0x08, + NVME_RDMA_CM_INVALID_CNTLID = 0x09, }; static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) @@ -46,6 +47,8 @@ static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) return "invalid IRD"; case NVME_RDMA_CM_INVALID_ORD: return "Invalid ORD"; + case NVME_RDMA_CM_INVALID_CNTLID: + return "invalid controller ID"; default: return "unrecognized reason"; } @@ -64,7 +67,8 @@ struct nvme_rdma_cm_req { __le16 qid; __le16 hrqsize; __le16 hsqsize; - u8 rsvd[24]; + __le16 cntlid; + u8 rsvd[22]; }; /** -- cgit v1.2.3 From 4715d87e11ac805ab95a75adfbf93d67dfbdbe81 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 22 Aug 2024 14:07:02 +0200 Subject: ethtool: Add support for specifying information source in cable test results Enhance the ethtool cable test interface by introducing the ability to specify the source of the diagnostic information for cable test results. This is particularly useful for PHYs that offer multiple diagnostic methods, such as Time Domain Reflectometry (TDR) and Active Link Cable Diagnostic (ALCD). Key changes: - Added `ethnl_cable_test_result_with_src` and `ethnl_cable_test_fault_length_with_src` functions to allow specifying the information source when reporting cable test results. - Updated existing `ethnl_cable_test_result` and `ethnl_cable_test_fault_length` functions to use TDR as the default source, ensuring backward compatibility. - Modified the UAPI to support these new attributes, enabling drivers to provide more detailed diagnostic information. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20240822120703.1393130-3-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/ethtool_netlink.h | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index fae0dfb9a9c8..aba91335273a 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -23,8 +23,10 @@ struct phy_device; int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd); void ethnl_cable_test_free(struct phy_device *phydev); void ethnl_cable_test_finished(struct phy_device *phydev); -int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result); -int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm); +int ethnl_cable_test_result_with_src(struct phy_device *phydev, u8 pair, + u8 result, u32 src); +int ethnl_cable_test_fault_length_with_src(struct phy_device *phydev, u8 pair, + u32 cm, u32 src); int ethnl_cable_test_amplitude(struct phy_device *phydev, u8 pair, s16 mV); int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV); int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last, @@ -54,14 +56,14 @@ static inline void ethnl_cable_test_free(struct phy_device *phydev) static inline void ethnl_cable_test_finished(struct phy_device *phydev) { } -static inline int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, - u8 result) +static inline int ethnl_cable_test_result_with_src(struct phy_device *phydev, + u8 pair, u8 result, u32 src) { return -EOPNOTSUPP; } -static inline int ethnl_cable_test_fault_length(struct phy_device *phydev, - u8 pair, u32 cm) +static inline int ethnl_cable_test_fault_length_with_src(struct phy_device *phydev, + u8 pair, u32 cm, u32 src) { return -EOPNOTSUPP; } @@ -119,4 +121,19 @@ static inline bool ethtool_dev_mm_supported(struct net_device *dev) } #endif /* IS_ENABLED(CONFIG_ETHTOOL_NETLINK) */ + +static inline int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, + u8 result) +{ + return ethnl_cable_test_result_with_src(phydev, pair, result, + ETHTOOL_A_CABLE_INF_SRC_TDR); +} + +static inline int ethnl_cable_test_fault_length(struct phy_device *phydev, + u8 pair, u32 cm) +{ + return ethnl_cable_test_fault_length_with_src(phydev, pair, cm, + ETHTOOL_A_CABLE_INF_SRC_TDR); +} + #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ -- cgit v1.2.3 From 19f1f11c9a8e92a2211a3854bd2cfbd4bb6303f1 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 22 Aug 2024 13:57:27 +0100 Subject: net: qualcomm: rmnet: Correct spelling in if_rmnet.h Correct spelling in if_rmnet.h As reported by codespell. Cc: Sean Tranchetti Signed-off-by: Simon Horman Reviewed-by: Subash Abhinov Kasiviswanathan Link: https://patch.msgid.link/20240822-net-spell-v1-6-3a98971ce2d2@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/if_rmnet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_rmnet.h b/include/linux/if_rmnet.h index 839d1e48b85e..c44bf6e80ecb 100644 --- a/include/linux/if_rmnet.h +++ b/include/linux/if_rmnet.h @@ -42,7 +42,7 @@ struct rmnet_map_ul_csum_header { /* csum_info field: * OFFSET: where (offset in bytes) to insert computed checksum - * UDP: 1 = UDP checksum (zero checkum means no checksum) + * UDP: 1 = UDP checksum (zero checksum means no checksum) * ENABLED: 1 = checksum computation requested */ #define MAP_CSUM_UL_OFFSET_MASK GENMASK(13, 0) -- cgit v1.2.3 From 70d0bb45fae87a3b08970a318e15f317446a1956 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 22 Aug 2024 13:57:33 +0100 Subject: net: Correct spelling in headers Correct spelling in Networking headers. As reported by codespell. Signed-off-by: Simon Horman Link: https://patch.msgid.link/20240822-net-spell-v1-12-3a98971ce2d2@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/etherdevice.h | 2 +- include/linux/netdevice.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 0ed47d00549b..30114c25ad12 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -645,7 +645,7 @@ static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) } /** - * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame + * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame * @skb: Buffer to pad * * An Ethernet frame should have a minimum size of 60 bytes. This function diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7dfa836d9dbf..fce70990b209 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1237,7 +1237,7 @@ struct netdev_net_notifier { * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid) - * Deletes the FDB entry from dev coresponding to addr. + * Deletes the FDB entry from dev corresponding to addr. * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, @@ -3514,7 +3514,7 @@ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, dql_completed(&dev_queue->dql, bytes); /* - * Without the memory barrier there is a small possiblity that + * Without the memory barrier there is a small possibility that * netdev_tx_sent_queue will miss the update and cause the queue to * be stopped forever */ @@ -4583,7 +4583,7 @@ void dev_uc_flush(struct net_device *dev); void dev_uc_init(struct net_device *dev); /** - * __dev_uc_sync - Synchonize device's unicast list + * __dev_uc_sync - Synchronize device's unicast list * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed @@ -4627,7 +4627,7 @@ void dev_mc_flush(struct net_device *dev); void dev_mc_init(struct net_device *dev); /** - * __dev_mc_sync - Synchonize device's multicast list + * __dev_mc_sync - Synchronize device's multicast list * @dev: device to sync * @sync: function to call if address should be added * @unsync: function to call if address should be removed -- cgit v1.2.3 From cead0b8991713bdeb5b947758dd62287fcf8da40 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Mon, 26 Aug 2024 16:09:43 +0530 Subject: nvme: rename apptag and appmask to lbat and lbatm Rename apptag and appmask to lbat and lbatm so that it matches the field names used in NVMe spec. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Suggested-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- include/linux/nvme.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 7b2ae2e43544..b58d9405d65e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -987,8 +987,8 @@ struct nvme_rw_command { __le16 control; __le32 dsmgmt; __le32 reftag; - __le16 apptag; - __le16 appmask; + __le16 lbat; + __le16 lbatm; }; enum { @@ -1057,8 +1057,8 @@ struct nvme_write_zeroes_cmd { __le16 control; __le32 dsmgmt; __le32 reftag; - __le16 apptag; - __le16 appmask; + __le16 lbat; + __le16 lbatm; }; enum nvme_zone_mgmt_action { -- cgit v1.2.3 From 50c6dbdfd16e312382842198a7919341ad480e05 Mon Sep 17 00:00:00 2001 From: Max Ramanouski Date: Sun, 25 Aug 2024 01:01:11 +0300 Subject: x86/ioremap: Improve iounmap() address range checks Allowing iounmap() on memory that was not ioremap()'d in the first place is obviously a bad idea. There is currently a feeble attempt to avoid errant iounmap()s by checking to see if the address is below "high_memory". But that's imprecise at best because there are plenty of high addresses that are also invalid to call iounmap() on. Thankfully, there is a more precise helper: is_ioremap_addr(). x86 just does not use it in iounmap(). Restrict iounmap() to addresses in the ioremap region, by using is_ioremap_addr(). This aligns x86 closer to the generic iounmap() implementation. Additionally, add a warning in case there is an attempt to iounmap() invalid memory. This replaces an existing silent return and will help alert folks to any incorrect usage of iounmap(). Due to VMALLOC_START on i386 not being present in asm/pgtable.h, include for asm/vmalloc.h had to be added to include/linux/ioremap.h. [ dhansen: tweak subject and changelog ] Signed-off-by: Max Ramanouski Signed-off-by: Dave Hansen Reviewed-by: Christoph Hellwig Reviewed-by: Alistair Popple Link: https://lore.kernel.org/all/20240824220111.84441-1-max8rr8%40gmail.com --- include/linux/ioremap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h index f0e99fc7dd8b..2bd1661fe9ad 100644 --- a/include/linux/ioremap.h +++ b/include/linux/ioremap.h @@ -4,6 +4,7 @@ #include #include +#include #if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP) /* -- cgit v1.2.3 From 1bf8012fc6997f2117f6919369cde16659db60e0 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 19 Aug 2024 22:59:45 +0800 Subject: pstore: replace spinlock_t by raw_spinlock_t pstore_dump() is called when both preemption and local IRQ are disabled, and a spinlock is obtained, which is problematic for the RT kernel because in this configuration, spinlocks are sleep locks. Replace the spinlock_t with raw_spinlock_t to avoid sleeping in atomic context. Signed-off-by: Wen Yang Cc: Kees Cook Cc: Tony Luck Cc: Guilherme G. Piccoli Cc: linux-hardening@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/20240819145945.61274-1-wen.yang@linux.dev Signed-off-by: Kees Cook --- include/linux/pstore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 638507a3c8ff..fed601053c51 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -182,7 +182,7 @@ struct pstore_info { struct module *owner; const char *name; - spinlock_t buf_lock; + raw_spinlock_t buf_lock; char *buf; size_t bufsize; -- cgit v1.2.3 From 13acf2b74803a9a9ab3475c306d5814cf3cefbd8 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 26 Aug 2024 14:32:42 +0800 Subject: ata: libata: Remove obsolete function declarations The function ata_schedule_scsi_eh() was removed with commit f8bbfc247efb ("[PATCH] SCSI: make scsi_implement_eh() generic API for SCSI transports"), and the function ata_sff_irq_clear() was removed with commit 37f65b8bc262("libata-sff: ata_sff_irq_clear() is BMDMA specific"). Remove the now useless declarations of these functions in drivers/ata/libata.h and include/linux/libata.h. Signed-off-by: Gaosheng Cui Signed-off-by: Damien Le Moal --- include/linux/libata.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 0279c0a6302f..6552e90753ae 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -2006,7 +2006,6 @@ extern unsigned int ata_sff_data_xfer(struct ata_queued_cmd *qc, extern unsigned int ata_sff_data_xfer32(struct ata_queued_cmd *qc, unsigned char *buf, unsigned int buflen, int rw); extern void ata_sff_irq_on(struct ata_port *ap); -extern void ata_sff_irq_clear(struct ata_port *ap); extern int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc, u8 status, int in_wq); extern void ata_sff_queue_work(struct work_struct *work); -- cgit v1.2.3 From cda1fba15cb2282b3c364805c9767698f11c3b0e Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Fri, 23 Aug 2024 00:25:12 +0200 Subject: dpll: add Embedded SYNC feature for a pin Implement and document new pin attributes for providing Embedded SYNC capabilities to the DPLL subsystem users through a netlink pin-get do/dump messages. Allow the user to set Embedded SYNC frequency with pin-set do netlink message. Reviewed-by: Aleksandr Loktionov Signed-off-by: Arkadiusz Kubalewski Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20240822222513.255179-2-arkadiusz.kubalewski@intel.com Signed-off-by: Jakub Kicinski --- include/linux/dpll.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index d275736230b3..81f7b623d0ba 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -15,6 +15,7 @@ struct dpll_device; struct dpll_pin; +struct dpll_pin_esync; struct dpll_device_ops { int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv, @@ -83,6 +84,13 @@ struct dpll_pin_ops { int (*ffo_get)(const struct dpll_pin *pin, void *pin_priv, const struct dpll_device *dpll, void *dpll_priv, s64 *ffo, struct netlink_ext_ack *extack); + int (*esync_set)(const struct dpll_pin *pin, void *pin_priv, + const struct dpll_device *dpll, void *dpll_priv, + u64 freq, struct netlink_ext_ack *extack); + int (*esync_get)(const struct dpll_pin *pin, void *pin_priv, + const struct dpll_device *dpll, void *dpll_priv, + struct dpll_pin_esync *esync, + struct netlink_ext_ack *extack); }; struct dpll_pin_frequency { @@ -111,6 +119,13 @@ struct dpll_pin_phase_adjust_range { s32 max; }; +struct dpll_pin_esync { + u64 freq; + const struct dpll_pin_frequency *range; + u8 range_num; + u8 pulse; +}; + struct dpll_pin_properties { const char *board_label; const char *panel_label; -- cgit v1.2.3 From 7e8fb2eda9885ea2d13179a4c0bbf810f900ef25 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 22 Jul 2024 22:16:47 -0700 Subject: jbd2: fix kernel-doc for j_transaction_overhead_buffers Use the correct struct member name in the kernel-doc notation to prevent a kernel-doc build warning. include/linux/jbd2.h:1303: warning: Function parameter or struct member 'j_transaction_overhead_buffers' not described in 'journal_s' include/linux/jbd2.h:1303: warning: Excess struct member 'j_transaction_overhead' description in 'journal_s' Fixes: e3a00a23781c ("jbd2: precompute number of transaction descriptor blocks") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20240710182252.4c281445@canb.auug.org.au/ Signed-off-by: Randy Dunlap Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240723051647.3053491-1-rdunlap@infradead.org Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5157d92b6f23..17662eae408f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1086,7 +1086,7 @@ struct journal_s int j_revoke_records_per_block; /** - * @j_transaction_overhead: + * @j_transaction_overhead_buffers: * * Number of blocks each transaction needs for its own bookkeeping */ -- cgit v1.2.3 From fa10db138d205362026daecc8ea65d4e339ade00 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 1 Aug 2024 09:38:10 +0800 Subject: jbd2: remove unused return value of jbd2_fc_release_bufs Remove unused return value of jbd2_fc_release_bufs. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240801013815.2393869-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 17662eae408f..8aef9bb6ad57 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1675,7 +1675,7 @@ int jbd2_fc_get_buf(journal_t *journal, struct buffer_head **bh_out); int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode); int jbd2_fc_wait_bufs(journal_t *journal, int num_blks); -int jbd2_fc_release_bufs(journal_t *journal); +void jbd2_fc_release_bufs(journal_t *journal); /* * is_journal_abort -- cgit v1.2.3 From 9a948c0c8e741776ee6d938b49d34d22034fbb7d Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 14 Aug 2024 11:34:15 +0800 Subject: ceph: Remove unused declarations These functions is never implemented and used. Signed-off-by: Yue Haibing Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index f66f6aac74f6..d7941478158c 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -449,8 +449,6 @@ extern int ceph_osdc_init(struct ceph_osd_client *osdc, extern void ceph_osdc_stop(struct ceph_osd_client *osdc); extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); -extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, - struct ceph_msg *msg); extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); -- cgit v1.2.3 From 373d3f8dcbb1c0d764123653ca4574be636b8d86 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Sun, 11 Aug 2024 12:47:26 +0800 Subject: wifi: rfkill: Correct parameter type for rfkill_set_hw_state_reason() Change type of parameter @reason to enum rfkill_hard_block_reasons for API rfkill_set_hw_state_reason() according to its comments, and all kernel callers have invoked the API with enum type actually. Signed-off-by: Zijun Hu Link: https://patch.msgid.link/20240811-rfkill_fix-v2-1-9050760336f4@quicinc.com Signed-off-by: Johannes Berg --- include/linux/rfkill.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 373003ace639..997b34197385 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -147,7 +147,8 @@ void rfkill_destroy(struct rfkill *rfkill); * Prefer to use rfkill_set_hw_state if you don't need any special reason. */ bool rfkill_set_hw_state_reason(struct rfkill *rfkill, - bool blocked, unsigned long reason); + bool blocked, + enum rfkill_hard_block_reasons reason); /** * rfkill_set_hw_state - Set the internal rfkill hardware block state * @rfkill: pointer to the rfkill class to modify. @@ -280,7 +281,7 @@ static inline void rfkill_destroy(struct rfkill *rfkill) static inline bool rfkill_set_hw_state_reason(struct rfkill *rfkill, bool blocked, - unsigned long reason) + enum rfkill_hard_block_reasons reason) { return blocked; } -- cgit v1.2.3 From 21b91d40575fb64f3f280f6c3af586e32a704a92 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 8 Aug 2024 22:05:54 +0800 Subject: efi: Remove unused declaration efi_initialize_iomem_resources() Since commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture"), this is not used anymore. Signed-off-by: Yue Haibing Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 6bf3c4fe8511..e28d88066033 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -764,8 +764,6 @@ extern int efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md); extern int __efi_mem_desc_lookup(u64 phys_addr, efi_memory_desc_t *out_md); extern void efi_mem_reserve(phys_addr_t addr, u64 size); extern int efi_mem_reserve_persistent(phys_addr_t addr, u64 size); -extern void efi_initialize_iomem_resources(struct resource *code_resource, - struct resource *data_resource, struct resource *bss_resource); extern u64 efi_get_fdt_params(struct efi_memory_map_data *data); extern struct kobject *efi_kobj; -- cgit v1.2.3 From 2b55d6a42d14c8675e38d6d9adca3014fdf01951 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 20 Aug 2024 17:59:35 +0200 Subject: rcu/kvfree: Add kvfree_rcu_barrier() API Add a kvfree_rcu_barrier() function. It waits until all in-flight pointers are freed over RCU machinery. It does not wait any GP completion and it is within its right to return immediately if there are no outstanding pointers. This function is useful when there is a need to guarantee that a memory is fully freed before destroying memory caches. For example, during unloading a kernel module. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Vlastimil Babka --- include/linux/rcutiny.h | 5 +++++ include/linux/rcutree.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index d9ac7b136aea..522123050ff8 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -111,6 +111,11 @@ static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr) kvfree(ptr); } +static inline void kvfree_rcu_barrier(void) +{ + rcu_barrier(); +} + #ifdef CONFIG_KASAN_GENERIC void kvfree_call_rcu(struct rcu_head *head, void *ptr); #else diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 254244202ea9..58e7db80f3a8 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -35,6 +35,7 @@ static inline void rcu_virt_note_context_switch(void) void synchronize_rcu_expedited(void); void kvfree_call_rcu(struct rcu_head *head, void *ptr); +void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_dyntick_idle(void); -- cgit v1.2.3 From b3c34245756adada8a50bdaedbb3965b071c7b0a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 9 Aug 2024 17:36:55 +0200 Subject: kasan: catch invalid free before SLUB reinitializes the object Currently, when KASAN is combined with init-on-free behavior, the initialization happens before KASAN's "invalid free" checks. More importantly, a subsequent commit will want to RCU-delay the actual SLUB freeing of an object, and we'd like KASAN to still validate synchronously that freeing the object is permitted. (Otherwise this change will make the existing testcase kmem_cache_invalid_free fail.) So add a new KASAN hook that allows KASAN to pre-validate a kmem_cache_free() operation before SLUB actually starts modifying the object or its metadata. Inside KASAN, this: - moves checks from poison_slab_object() into check_slab_allocation() - moves kasan_arch_is_ready() up into callers of poison_slab_object() - removes "ip" argument of poison_slab_object() and __kasan_slab_free() (since those functions no longer do any reporting) Acked-by: Vlastimil Babka #slub Reviewed-by: Andrey Konovalov Signed-off-by: Jann Horn Signed-off-by: Vlastimil Babka --- include/linux/kasan.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 70d6a8f6e25d..1570c7191176 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -175,13 +175,55 @@ static __always_inline void * __must_check kasan_init_slab_obj( return (void *)object; } -bool __kasan_slab_free(struct kmem_cache *s, void *object, - unsigned long ip, bool init); +bool __kasan_slab_pre_free(struct kmem_cache *s, void *object, + unsigned long ip); +/** + * kasan_slab_pre_free - Check whether freeing a slab object is safe. + * @object: Object to be freed. + * + * This function checks whether freeing the given object is safe. It may + * check for double-free and invalid-free bugs and report them. + * + * This function is intended only for use by the slab allocator. + * + * @Return true if freeing the object is unsafe; false otherwise. + */ +static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, + void *object) +{ + if (kasan_enabled()) + return __kasan_slab_pre_free(s, object, _RET_IP_); + return false; +} + +bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init); +/** + * kasan_slab_free - Poison, initialize, and quarantine a slab object. + * @object: Object to be freed. + * @init: Whether to initialize the object. + * + * This function informs that a slab object has been freed and is not + * supposed to be accessed anymore, except for objects in + * SLAB_TYPESAFE_BY_RCU caches. + * + * For KASAN modes that have integrated memory initialization + * (kasan_has_integrated_init() == true), this function also initializes + * the object's memory. For other modes, the @init argument is ignored. + * + * This function might also take ownership of the object to quarantine it. + * When this happens, KASAN will defer freeing the object to a later + * stage and handle it internally until then. The return value indicates + * whether KASAN took ownership of the object. + * + * This function is intended only for use by the slab allocator. + * + * @Return true if KASAN took ownership of the object; false otherwise. + */ static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { if (kasan_enabled()) - return __kasan_slab_free(s, object, _RET_IP_, init); + return __kasan_slab_free(s, object, init); return false; } @@ -371,6 +413,12 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache, { return (void *)object; } + +static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) +{ + return false; +} + static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) { return false; -- cgit v1.2.3 From b8c8ba73c68bb3c3e9dad22f488b86c540c839f9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 9 Aug 2024 17:36:56 +0200 Subject: slub: Introduce CONFIG_SLUB_RCU_DEBUG Currently, KASAN is unable to catch use-after-free in SLAB_TYPESAFE_BY_RCU slabs because use-after-free is allowed within the RCU grace period by design. Add a SLUB debugging feature which RCU-delays every individual kmem_cache_free() before either actually freeing the object or handing it off to KASAN, and change KASAN to poison freed objects as normal when this option is enabled. For now I've configured Kconfig.debug to default-enable this feature in the KASAN GENERIC and SW_TAGS modes; I'm not enabling it by default in HW_TAGS mode because I'm not sure if it might have unwanted performance degradation effects there. Note that this is mostly useful with KASAN in the quarantine-based GENERIC mode; SLAB_TYPESAFE_BY_RCU slabs are basically always also slabs with a ->ctor, and KASAN's assign_tag() currently has to assign fixed tags for those, reducing the effectiveness of SW_TAGS/HW_TAGS mode. (A possible future extension of this work would be to also let SLUB call the ->ctor() on every allocation instead of only when the slab page is allocated; then tag-based modes would be able to assign new tags on every reallocation.) Tested-by: syzbot+263726e59eab6b442723@syzkaller.appspotmail.com Reviewed-by: Andrey Konovalov Acked-by: Marco Elver Acked-by: Vlastimil Babka #slab Signed-off-by: Jann Horn Signed-off-by: Vlastimil Babka --- include/linux/kasan.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 1570c7191176..00a3bf7c0d8f 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -196,15 +196,18 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s, return false; } -bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init); +bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init, + bool still_accessible); /** * kasan_slab_free - Poison, initialize, and quarantine a slab object. * @object: Object to be freed. * @init: Whether to initialize the object. + * @still_accessible: Whether the object contents are still accessible. * * This function informs that a slab object has been freed and is not - * supposed to be accessed anymore, except for objects in - * SLAB_TYPESAFE_BY_RCU caches. + * supposed to be accessed anymore, except when @still_accessible is set + * (indicating that the object is in a SLAB_TYPESAFE_BY_RCU cache and an RCU + * grace period might not have passed yet). * * For KASAN modes that have integrated memory initialization * (kasan_has_integrated_init() == true), this function also initializes @@ -220,10 +223,11 @@ bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init); * @Return true if KASAN took ownership of the object; false otherwise. */ static __always_inline bool kasan_slab_free(struct kmem_cache *s, - void *object, bool init) + void *object, bool init, + bool still_accessible) { if (kasan_enabled()) - return __kasan_slab_free(s, object, init); + return __kasan_slab_free(s, object, init, still_accessible); return false; } @@ -419,7 +423,8 @@ static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object) return false; } -static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object, + bool init, bool still_accessible) { return false; } -- cgit v1.2.3 From a9b8f337ea4eb66e4980f90bb73e8786e56cacfd Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Mon, 12 Aug 2024 06:29:14 +0530 Subject: ACPI: scan: Add a weak arch_sort_irqchip_probe() to order the IRQCHIP probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unlike OF framework, the irqchip probe using IRQCHIP_ACPI_DECLARE has no order defined. Depending on the Makefile is not a good idea. So, usually it is worked around by mandating only root interrupt controller probed using IRQCHIP_ACPI_DECLARE and other interrupt controllers are probed via cascade mechanism. However, this is also not a clean solution because if there are multiple root controllers (ex: RINTC in RISC-V which is per CPU) which need to be probed first, then the cascade will happen for every root controller. So, introduce an architecture specific weak function arch_sort_irqchip_probe() to order the probing of the interrupt controllers which can be implemented by different architectures as per their interrupt controller hierarchy. Signed-off-by: Sunil V L Tested-by: Björn Töpel Link: https://patch.msgid.link/20240812005929.113499-3-sunilvl@ventanamicro.com Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 0687a442fec7..3fff86f95c2f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1343,6 +1343,8 @@ struct acpi_probe_entry { kernel_ulong_t driver_data; }; +void arch_sort_irqchip_probe(struct acpi_probe_entry *ap_head, int nr); + #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, \ valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ -- cgit v1.2.3 From f7d7ccf92f2b9398781f791b4af1a74a9f65b5c3 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Mon, 12 Aug 2024 06:29:15 +0530 Subject: ACPI: bus: Add acpi_riscv_init() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new function for RISC-V to do architecture specific initialization similar to acpi_arm_init(). Some of the ACPI tables are architecture specific and there is no reason trying to find them on other architectures. So, add acpi_riscv_init() similar to acpi_arm_init(). Signed-off-by: Sunil V L Tested-by: Björn Töpel Link: https://patch.msgid.link/20240812005929.113499-4-sunilvl@ventanamicro.com Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 3fff86f95c2f..892025d873f0 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1531,6 +1531,12 @@ void acpi_arm_init(void); static inline void acpi_arm_init(void) { } #endif +#ifdef CONFIG_RISCV +void acpi_riscv_init(void); +#else +static inline void acpi_riscv_init(void) { } +#endif + #ifdef CONFIG_ACPI_PCC void acpi_init_pcc(void); #else -- cgit v1.2.3 From 21734d29f84ad610dbafdca5f4c1bf7ecceeb2fb Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Mon, 12 Aug 2024 06:29:19 +0530 Subject: ACPI: bus: Add RINTC IRQ model for RISC-V MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the IRQ model for RISC-V INTC so that acpi_set_irq_model can use this for RISC-V. Signed-off-by: Sunil V L Tested-by: Björn Töpel Link: https://patch.msgid.link/20240812005929.113499-8-sunilvl@ventanamicro.com Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 892025d873f0..3a21f1cf126f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -107,6 +107,7 @@ enum acpi_irq_model_id { ACPI_IRQ_MODEL_PLATFORM, ACPI_IRQ_MODEL_GIC, ACPI_IRQ_MODEL_LPIC, + ACPI_IRQ_MODEL_RINTC, ACPI_IRQ_MODEL_COUNT }; -- cgit v1.2.3 From fbe826b1c10699a70b81a441fe47b817d1019f37 Mon Sep 17 00:00:00 2001 From: Sunil V L Date: Mon, 12 Aug 2024 06:29:27 +0530 Subject: irqchip/riscv-imsic: Add ACPI support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RISC-V IMSIC interrupt controller provides IPI and MSI support. Currently, DT based drivers setup the IPI feature early during boot but defer setting up the MSI functionality. However, in ACPI systems, PCI subsystem is probed early and assume MSI controller is already setup. Hence, both IPI and MSI features need to be initialized early itself. Signed-off-by: Sunil V L Reviewed-by: Anup Patel Tested-by: Björn Töpel Acked-by: Thomas Gleixner Link: https://patch.msgid.link/20240812005929.113499-16-sunilvl@ventanamicro.com Signed-off-by: Rafael J. Wysocki --- include/linux/irqchip/riscv-imsic.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h index faf0b800b1b0..7494952c5518 100644 --- a/include/linux/irqchip/riscv-imsic.h +++ b/include/linux/irqchip/riscv-imsic.h @@ -8,6 +8,8 @@ #include #include +#include +#include #include #define IMSIC_MMIO_PAGE_SHIFT 12 @@ -84,4 +86,11 @@ static inline const struct imsic_global_config *imsic_get_global_config(void) #endif +#ifdef CONFIG_ACPI +int imsic_platform_acpi_probe(struct fwnode_handle *fwnode); +struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev); +#else +static inline struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev) { return NULL; } +#endif + #endif -- cgit v1.2.3 From a707f85d47cad3978bef8fe90c7a79e3998e9d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 3 Aug 2024 14:34:17 +0200 Subject: HID: bpf: constify parameter rdesc of call_hid_bpf_rdesc_fixup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The parameter is never modified, so mark it as const. Also inline the return statement to avoid a type mismatch error. This is a prerequisite for constification changes in the HID core. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-1-f53d7a7b29d8@weissschuh.net Signed-off-by: Benjamin Tissoires --- include/linux/hid_bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h index d4d063cf63b5..6a47223e6460 100644 --- a/include/linux/hid_bpf.h +++ b/include/linux/hid_bpf.h @@ -212,7 +212,7 @@ int hid_bpf_connect_device(struct hid_device *hdev); void hid_bpf_disconnect_device(struct hid_device *hdev); void hid_bpf_destroy_device(struct hid_device *hid); int hid_bpf_device_init(struct hid_device *hid); -u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *size); +u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, const u8 *rdesc, unsigned int *size); #else /* CONFIG_HID_BPF */ static inline u8 *dispatch_hid_bpf_device_event(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 *size, int interrupt, -- cgit v1.2.3 From 6737769ca0b6bfc82df53db5fd69068902d608db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 3 Aug 2024 14:34:18 +0200 Subject: HID: constify parameter rdesc of hid_parse_report() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The parameter is never modified, so mark it as const. This is a prerequisite for constification changes in the HID core. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-2-f53d7a7b29d8@weissschuh.net Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 1533c9dcd3a6..e7a5d6f2f2eb 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -953,7 +953,7 @@ struct hid_device *hid_allocate_device(void); struct hid_report *hid_register_report(struct hid_device *device, enum hid_report_type type, unsigned int id, unsigned int application); -int hid_parse_report(struct hid_device *hid, __u8 *start, unsigned size); +int hid_parse_report(struct hid_device *hid, const __u8 *start, unsigned size); struct hid_report *hid_validate_values(struct hid_device *hid, enum hid_report_type type, unsigned int id, unsigned int field_index, -- cgit v1.2.3 From 24ddd0d7de7aaf50f537fd727f31616cb5a65a9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 3 Aug 2024 14:34:19 +0200 Subject: HID: constify hid_device::rdesc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Once a report descriptor has been created by the HID core it is not supposed to be modified anymore. Enforce this invariant through the type system. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-3-f53d7a7b29d8@weissschuh.net Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index e7a5d6f2f2eb..dda34d8cad19 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -602,7 +602,7 @@ struct hid_ll_driver; struct hid_device { /* device report descriptor */ __u8 *dev_rdesc; unsigned dev_rsize; - __u8 *rdesc; + const __u8 *rdesc; unsigned rsize; struct hid_collection *collection; /* List of HID collections */ unsigned collection_size; /* Number of allocated hid_collections */ -- cgit v1.2.3 From 80cfb508f3fe4c7c6a567fc4aa863c9a38709cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 3 Aug 2024 14:34:20 +0200 Subject: HID: constify params and return value of fetch_item() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fetch_item() does not modify the descriptor it operates on. As a prerequisite for the constification of hid_driver::dev_rdesc, mark the parameters and return value of fetch_item() as const. Also adapt the variable types in the callers to match this constification. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-4-f53d7a7b29d8@weissschuh.net Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index dda34d8cad19..502bbc6f078c 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -46,7 +46,7 @@ struct hid_item { __s16 s16; __u32 u32; __s32 s32; - __u8 *longdata; + const __u8 *longdata; } data; }; -- cgit v1.2.3 From 3593630c89d7d3963c42262694f8aa8b4727a0ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 3 Aug 2024 14:34:21 +0200 Subject: HID: constify hid_device::dev_rdesc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Once a report descriptor has been created by the HID core it is not supposed to be modified anymore. Enforce this invariant through the type system. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-5-f53d7a7b29d8@weissschuh.net Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index 502bbc6f078c..c5fb43db0f2e 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -600,7 +600,7 @@ struct hid_driver; struct hid_ll_driver; struct hid_device { /* device report descriptor */ - __u8 *dev_rdesc; + const __u8 *dev_rdesc; unsigned dev_rsize; const __u8 *rdesc; unsigned rsize; -- cgit v1.2.3 From fe73965d078670406acee0218f118c0870d6a58b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 3 Aug 2024 14:34:22 +0200 Subject: HID: change return type of report_fixup() to const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By allowing the drivers to return a "const *" they can constify their static report arrays. This makes it clear to driver authors that the HID core will not modify those reports and they can be reused for multiple devices. Furthermore security is slightly improved as those reports are protected against accidental or malicious modifications. [bentiss: fixup hid-cougar.c and hid-multitouch.c for latest version of the master branch] Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20240803-hid-const-fixup-v2-6-f53d7a7b29d8@weissschuh.net Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index c5fb43db0f2e..3a998fa3812d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -822,7 +822,7 @@ struct hid_driver { struct hid_usage *usage, __s32 value); void (*report)(struct hid_device *hdev, struct hid_report *report); - __u8 *(*report_fixup)(struct hid_device *hdev, __u8 *buf, + const __u8 *(*report_fixup)(struct hid_device *hdev, __u8 *buf, unsigned int *size); int (*input_mapping)(struct hid_device *hdev, -- cgit v1.2.3 From ba7e053ec89f61be9d27bfb244de52849b5138aa Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Fri, 16 Aug 2024 10:19:09 +0200 Subject: power: supply: max77693: Expose input current limit and CC current properties There are two charger current limit registers: - Fast charge current limit (which controls current going from the charger to the battery); - CHGIN input current limit (which controls current going into the charger through the cable). Add the necessary functions to retrieve the CHGIN input limit (from CHARGER regulator) and maximum fast charge current values, and expose them as power supply properties. Tested-by: Henrik Grimler Signed-off-by: Artur Weber Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240816-max77693-charger-extcon-v4-3-050a0a9bfea0@gmail.com Signed-off-by: Sebastian Reichel --- include/linux/mfd/max77693-private.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index 54444ff2a5de..20c5e02ed9da 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -217,6 +217,10 @@ enum max77693_charger_battery_state { #define CHG_CNFG_01_CHGRSTRT_MASK (0x3 << CHG_CNFG_01_CHGRSTRT_SHIFT) #define CHG_CNFG_01_PQEN_MAKS BIT(CHG_CNFG_01_PQEN_SHIFT) +/* MAX77693_CHG_REG_CHG_CNFG_02 register */ +#define CHG_CNFG_02_CC_SHIFT 0 +#define CHG_CNFG_02_CC_MASK 0x3F + /* MAX77693_CHG_REG_CHG_CNFG_03 register */ #define CHG_CNFG_03_TOITH_SHIFT 0 #define CHG_CNFG_03_TOTIME_SHIFT 3 @@ -244,6 +248,7 @@ enum max77693_charger_battery_state { #define CHG_CNFG_12_VCHGINREG_MASK (0x3 << CHG_CNFG_12_VCHGINREG_SHIFT) /* MAX77693 CHG_CNFG_09 Register */ +#define CHG_CNFG_09_CHGIN_ILIM_SHIFT 0 #define CHG_CNFG_09_CHGIN_ILIM_MASK 0x7F /* MAX77693 CHG_CTRL Register */ -- cgit v1.2.3 From a96c5515d0d15df103598b2bc57245d66143b5dd Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Mon, 26 Aug 2024 21:43:08 +0000 Subject: net: dsa: microchip: Add KSZ8895/KSZ8864 switch support KSZ8895/KSZ8864 is a switch family between KSZ8863/73 and KSZ8795, so it shares some registers and functions in those switches already implemented in the KSZ DSA driver. Signed-off-by: Tristram Ha Tested-by: Pieter Van Trappen Reviewed-by: Oleksij Rempel Signed-off-by: Jakub Kicinski --- include/linux/platform_data/microchip-ksz.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index 8c659db4da6b..d074019474f5 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -28,6 +28,8 @@ enum ksz_chip_id { KSZ8794_CHIP_ID = 0x8794, KSZ8765_CHIP_ID = 0x8765, KSZ8830_CHIP_ID = 0x8830, + KSZ8864_CHIP_ID = 0x8864, + KSZ8895_CHIP_ID = 0x8895, KSZ9477_CHIP_ID = 0x00947700, KSZ9896_CHIP_ID = 0x00989600, KSZ9897_CHIP_ID = 0x00989700, -- cgit v1.2.3 From 1934b212615dc617ac84fc306333ab2b9fc3b04f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Aug 2024 18:00:01 +0200 Subject: file: reclaim 24 bytes from f_owner We do embedd struct fown_struct into struct file letting it take up 32 bytes in total. We could tweak struct fown_struct to be more compact but really it shouldn't even be embedded in struct file in the first place. Instead, actual users of struct fown_struct should allocate the struct on demand. This frees up 24 bytes in struct file. That will have some potentially user-visible changes for the ownership fcntl()s. Some of them can now fail due to allocation failures. Practically, that probably will almost never happen as the allocations are small and they only happen once per file. The fown_struct is used during kill_fasync() which is used by e.g., pipes to generate a SIGIO signal. Sending of such signals is conditional on userspace having set an owner for the file using one of the F_OWNER fcntl()s. Such users will be unaffected if struct fown_struct is allocated during the fcntl() call. There are a few subsystems that call __f_setown() expecting file->f_owner to be allocated: (1) tun devices file->f_op->fasync::tun_chr_fasync() -> __f_setown() There are no callers of tun_chr_fasync(). (2) tty devices file->f_op->fasync::tty_fasync() -> __tty_fasync() -> __f_setown() tty_fasync() has no additional callers but __tty_fasync() has. Note that __tty_fasync() only calls __f_setown() if the @on argument is true. It's called from: file->f_op->release::tty_release() -> tty_release() -> __tty_fasync() -> __f_setown() tty_release() calls __tty_fasync() with @on false => __f_setown() is never called from tty_release(). => All callers of tty_release() are safe as well. file->f_op->release::tty_open() -> tty_release() -> __tty_fasync() -> __f_setown() __tty_hangup() calls __tty_fasync() with @on false => __f_setown() is never called from tty_release(). => All callers of __tty_hangup() are safe as well. From the callchains it's obvious that (1) and (2) end up getting called via file->f_op->fasync(). That can happen either through the F_SETFL fcntl() with the FASYNC flag raised or via the FIOASYNC ioctl(). If FASYNC is requested and the file isn't already FASYNC then file->f_op->fasync() is called with @on true which ends up causing both (1) and (2) to call __f_setown(). (1) and (2) are the only subsystems that call __f_setown() from the file->f_op->fasync() handler. So both (1) and (2) have been updated to allocate a struct fown_struct prior to calling fasync_helper() to register with the fasync infrastructure. That's safe as they both call fasync_helper() which also does allocations if @on is true. The other interesting case are file leases: (3) file leases lease_manager_ops->lm_setup::lease_setup() -> __f_setown() Which in turn is called from: generic_add_lease() -> lease_manager_ops->lm_setup::lease_setup() -> __f_setown() So here again we can simply make generic_add_lease() allocate struct fown_struct prior to the lease_manager_ops->lm_setup::lease_setup() which happens under a spinlock. With that the two remaining subsystems that call __f_setown() are: (4) dnotify (5) sockets Both have their own custom ioctls to set struct fown_struct and both have been converted to allocate a struct fown_struct on demand from their respective ioctls. Interactions with O_PATH are fine as well e.g., when opening a /dev/tty as O_PATH then no file->f_op->open() happens thus no file->f_owner is allocated. That's fine as no file operation will be set for those and the device has never been opened. fcntl()s called on such things will just allocate a ->f_owner on demand. Although I have zero idea why'd you care about f_owner on an O_PATH fd. Link: https://lore.kernel.org/r/20240813-work-f_owner-v2-1-4e9343a79f9f@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/fs.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index fb0426f349fc..7af239ca87e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -947,6 +947,7 @@ static inline unsigned imajor(const struct inode *inode) } struct fown_struct { + struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ @@ -1011,7 +1012,7 @@ struct file { struct mutex f_pos_lock; loff_t f_pos; unsigned int f_flags; - struct fown_struct f_owner; + struct fown_struct *f_owner; const struct cred *f_cred; struct file_ra_state f_ra; struct path f_path; @@ -1076,6 +1077,12 @@ struct file_lease; #define OFFT_OFFSET_MAX type_max(off_t) #endif +int file_f_owner_allocate(struct file *file); +static inline struct fown_struct *file_f_owner(const struct file *file) +{ + return READ_ONCE(file->f_owner); +} + extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) @@ -1124,7 +1131,7 @@ extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); -extern int send_sigurg(struct fown_struct *fown); +extern int send_sigurg(struct file *file); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where -- cgit v1.2.3 From a55d1cbd1720679cfe9837bce250e397ec513989 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Aug 2024 16:14:46 +0200 Subject: fs: switch f_iocb_flags and f_ra Now that we shrank struct file by 24 bytes we still have a 4 byte hole. If we move struct file_ra_state into the union and f_iocb_flags out of the union we close that whole and bring down struct file to 192 bytes. Which means struct file is 3 cachelines and we managed to shrink it by 40 bytes this cycle. I've tried to audit all codepaths that use f_ra and none of them seem to rely on it in file->f_op->release() and never have since commit 1da177e4c3f4 ("Linux-2.6.12-rc2"). Link: https://lore.kernel.org/r/20240823-luftdicht-berappen-d69a2166a0db@brauner Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7af239ca87e2..095a956aeb29 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -999,9 +999,9 @@ struct file { struct callback_head f_task_work; /* fput() must use workqueue (most kernel threads). */ struct llist_node f_llist; - unsigned int f_iocb_flags; + /* Invalid after last fput(). */ + struct file_ra_state f_ra; }; - /* * Protects f_ep, f_flags. * Must not be taken from IRQ context. @@ -1012,9 +1012,9 @@ struct file { struct mutex f_pos_lock; loff_t f_pos; unsigned int f_flags; + unsigned int f_iocb_flags; struct fown_struct *f_owner; const struct cred *f_cred; - struct file_ra_state f_ra; struct path f_path; struct inode *f_inode; /* cached value */ const struct file_operations *f_op; -- cgit v1.2.3 From 947697c6f0f75f9866f2f891a102dece7a09a064 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 22 Aug 2024 10:18:52 +0530 Subject: uapi: Define GENMASK_U128 This adds GENMASK_U128() and __GENMASK_U128() macros using __BITS_PER_U128 and __int128 data types. These macros will be used in providing support for generating 128 bit masks. The macros wouldn't work in all assembler flavors for reasons described in the comments on top of declarations. Enforce it for more by adding !__ASSEMBLY__ guard. Cc: Yury Norov Cc: Rasmus Villemoes Cc: Arnd Bergmann > Cc: linux-kernel@vger.kernel.org Cc: linux-arch@vger.kernel.org Reviewed-by: Arnd Bergmann Signed-off-by: Anshuman Khandual Signed-off-by: Yury Norov --- include/linux/bits.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bits.h b/include/linux/bits.h index 0eb24d21aac2..60044b608817 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -36,4 +36,19 @@ #define GENMASK_ULL(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK_ULL(h, l)) +#if !defined(__ASSEMBLY__) +/* + * Missing asm support + * + * __GENMASK_U128() depends on _BIT128() which would not work + * in the asm code, as it shifts an 'unsigned __init128' data + * type instead of direct representation of 128 bit constants + * such as long and unsigned long. The fundamental problem is + * that a 128 bit constant will get silently truncated by the + * gcc compiler. + */ +#define GENMASK_U128(h, l) \ + (GENMASK_INPUT_CHECK(h, l) + __GENMASK_U128(h, l)) +#endif + #endif /* __LINUX_BITS_H */ -- cgit v1.2.3 From 57413d8e172c10c90fbd91f98d0f7d8eb27e824c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Aug 2024 08:50:47 +0200 Subject: fs: sort out the fallocate mode vs flag mess The fallocate system call takes a mode argument, but that argument contains a wild mix of exclusive modes and an optional flags. Replace FALLOC_FL_SUPPORTED_MASK with FALLOC_FL_MODE_MASK, which excludes the optional flag bit, so that we can use switch statement on the value to easily enumerate the cases while getting the check for duplicate modes for free. To make this (and in the future the file system implementations) more readable also add a symbolic name for the 0 mode used to allocate blocks. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240827065123.1762168-4-hch@lst.de Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/falloc.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/falloc.h b/include/linux/falloc.h index f3f0b97b1675..3f49f3df6af5 100644 --- a/include/linux/falloc.h +++ b/include/linux/falloc.h @@ -25,12 +25,18 @@ struct space_resv { #define FS_IOC_UNRESVSP64 _IOW('X', 43, struct space_resv) #define FS_IOC_ZERO_RANGE _IOW('X', 57, struct space_resv) -#define FALLOC_FL_SUPPORTED_MASK (FALLOC_FL_KEEP_SIZE | \ - FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_COLLAPSE_RANGE | \ - FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE | \ - FALLOC_FL_UNSHARE_RANGE) +/* + * Mask of all supported fallocate modes. Only one can be set at a time. + * + * In addition to the mode bit, the mode argument can also encode flags. + * FALLOC_FL_KEEP_SIZE is the only supported flag so far. + */ +#define FALLOC_FL_MODE_MASK (FALLOC_FL_ALLOCATE_RANGE | \ + FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | \ + FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE | \ + FALLOC_FL_UNSHARE_RANGE) /* on ia32 l_start is on a 32-bit boundary */ #if defined(CONFIG_X86_64) -- cgit v1.2.3 From 654beb75ca95240c00c78be0bbc2ea66a125a997 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Wed, 21 Aug 2024 18:46:33 +0200 Subject: dma: ipu: Remove include/linux/dma/ipu-dma.h When this file was renamed in commit b8a6d9980f75 ("dma: ipu: rename mach/ipu.h to include/linux/dma/ipu-dma.h"), 4 .c files have been modified accordingly: drivers/dma/ipu/ipu_idmac.c drivers/dma/ipu/ipu_irq.c --> removed in commit f1de55ff7c70 ("dmaengine: ipu: Remove the driver") in 2023-08 drivers/media/platform/soc_camera/mx3_camera.c --> removed in commit c93cc61475eb ("[media] staging/media: remove deprecated mx3 driver") in 2016-06 drivers/video/mx3fb.c --> removed in commit bfac19e239a7 ("fbdev: mx3fb: Remove the driver") in 2023-08 Now include/linux/dma/ipu-dma.h is unused and can be removed as-well. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/532e7e2816ccf226f3ab1fa76ec7873bc09299d0.1724258714.git.christophe.jaillet@wanadoo.fr Signed-off-by: Vinod Koul --- include/linux/dma/ipu-dma.h | 174 -------------------------------------------- 1 file changed, 174 deletions(-) delete mode 100644 include/linux/dma/ipu-dma.h (limited to 'include/linux') diff --git a/include/linux/dma/ipu-dma.h b/include/linux/dma/ipu-dma.h deleted file mode 100644 index 6969391580d2..000000000000 --- a/include/linux/dma/ipu-dma.h +++ /dev/null @@ -1,174 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2008 - * Guennadi Liakhovetski, DENX Software Engineering, - * - * Copyright (C) 2005-2007 Freescale Semiconductor, Inc. - */ - -#ifndef __LINUX_DMA_IPU_DMA_H -#define __LINUX_DMA_IPU_DMA_H - -#include -#include - -/* IPU DMA Controller channel definitions. */ -enum ipu_channel { - IDMAC_IC_0 = 0, /* IC (encoding task) to memory */ - IDMAC_IC_1 = 1, /* IC (viewfinder task) to memory */ - IDMAC_ADC_0 = 1, - IDMAC_IC_2 = 2, - IDMAC_ADC_1 = 2, - IDMAC_IC_3 = 3, - IDMAC_IC_4 = 4, - IDMAC_IC_5 = 5, - IDMAC_IC_6 = 6, - IDMAC_IC_7 = 7, /* IC (sensor data) to memory */ - IDMAC_IC_8 = 8, - IDMAC_IC_9 = 9, - IDMAC_IC_10 = 10, - IDMAC_IC_11 = 11, - IDMAC_IC_12 = 12, - IDMAC_IC_13 = 13, - IDMAC_SDC_0 = 14, /* Background synchronous display data */ - IDMAC_SDC_1 = 15, /* Foreground data (overlay) */ - IDMAC_SDC_2 = 16, - IDMAC_SDC_3 = 17, - IDMAC_ADC_2 = 18, - IDMAC_ADC_3 = 19, - IDMAC_ADC_4 = 20, - IDMAC_ADC_5 = 21, - IDMAC_ADC_6 = 22, - IDMAC_ADC_7 = 23, - IDMAC_PF_0 = 24, - IDMAC_PF_1 = 25, - IDMAC_PF_2 = 26, - IDMAC_PF_3 = 27, - IDMAC_PF_4 = 28, - IDMAC_PF_5 = 29, - IDMAC_PF_6 = 30, - IDMAC_PF_7 = 31, -}; - -/* Order significant! */ -enum ipu_channel_status { - IPU_CHANNEL_FREE, - IPU_CHANNEL_INITIALIZED, - IPU_CHANNEL_READY, - IPU_CHANNEL_ENABLED, -}; - -#define IPU_CHANNELS_NUM 32 - -enum pixel_fmt { - /* 1 byte */ - IPU_PIX_FMT_GENERIC, - IPU_PIX_FMT_RGB332, - IPU_PIX_FMT_YUV420P, - IPU_PIX_FMT_YUV422P, - IPU_PIX_FMT_YUV420P2, - IPU_PIX_FMT_YVU422P, - /* 2 bytes */ - IPU_PIX_FMT_RGB565, - IPU_PIX_FMT_RGB666, - IPU_PIX_FMT_BGR666, - IPU_PIX_FMT_YUYV, - IPU_PIX_FMT_UYVY, - /* 3 bytes */ - IPU_PIX_FMT_RGB24, - IPU_PIX_FMT_BGR24, - /* 4 bytes */ - IPU_PIX_FMT_GENERIC_32, - IPU_PIX_FMT_RGB32, - IPU_PIX_FMT_BGR32, - IPU_PIX_FMT_ABGR32, - IPU_PIX_FMT_BGRA32, - IPU_PIX_FMT_RGBA32, -}; - -enum ipu_color_space { - IPU_COLORSPACE_RGB, - IPU_COLORSPACE_YCBCR, - IPU_COLORSPACE_YUV -}; - -/* - * Enumeration of IPU rotation modes - */ -enum ipu_rotate_mode { - /* Note the enum values correspond to BAM value */ - IPU_ROTATE_NONE = 0, - IPU_ROTATE_VERT_FLIP = 1, - IPU_ROTATE_HORIZ_FLIP = 2, - IPU_ROTATE_180 = 3, - IPU_ROTATE_90_RIGHT = 4, - IPU_ROTATE_90_RIGHT_VFLIP = 5, - IPU_ROTATE_90_RIGHT_HFLIP = 6, - IPU_ROTATE_90_LEFT = 7, -}; - -/* - * Enumeration of DI ports for ADC. - */ -enum display_port { - DISP0, - DISP1, - DISP2, - DISP3 -}; - -struct idmac_video_param { - unsigned short in_width; - unsigned short in_height; - uint32_t in_pixel_fmt; - unsigned short out_width; - unsigned short out_height; - uint32_t out_pixel_fmt; - unsigned short out_stride; - bool graphics_combine_en; - bool global_alpha_en; - bool key_color_en; - enum display_port disp; - unsigned short out_left; - unsigned short out_top; -}; - -/* - * Union of initialization parameters for a logical channel. So far only video - * parameters are used. - */ -union ipu_channel_param { - struct idmac_video_param video; -}; - -struct idmac_tx_desc { - struct dma_async_tx_descriptor txd; - struct scatterlist *sg; /* scatterlist for this */ - unsigned int sg_len; /* tx-descriptor. */ - struct list_head list; -}; - -struct idmac_channel { - struct dma_chan dma_chan; - dma_cookie_t completed; /* last completed cookie */ - union ipu_channel_param params; - enum ipu_channel link; /* input channel, linked to the output */ - enum ipu_channel_status status; - void *client; /* Only one client per channel */ - unsigned int n_tx_desc; - struct idmac_tx_desc *desc; /* allocated tx-descriptors */ - struct scatterlist *sg[2]; /* scatterlist elements in buffer-0 and -1 */ - struct list_head free_list; /* free tx-descriptors */ - struct list_head queue; /* queued tx-descriptors */ - spinlock_t lock; /* protects sg[0,1], queue */ - struct mutex chan_mutex; /* protects status, cookie, free_list */ - bool sec_chan_en; - int active_buffer; - unsigned int eof_irq; - char eof_name[16]; /* EOF IRQ name for request_irq() */ -}; - -#define to_tx_desc(tx) container_of(tx, struct idmac_tx_desc, txd) -#define to_idmac_chan(c) container_of(c, struct idmac_channel, dma_chan) - -#endif /* __LINUX_DMA_IPU_DMA_H */ -- cgit v1.2.3 From 73d5fc92a11cacb73a1aac0b5793c47e48c5b537 Mon Sep 17 00:00:00 2001 From: Nishad Saraf Date: Mon, 19 Aug 2024 14:19:48 -0700 Subject: dmaengine: amd: qdma: Add AMD QDMA driver Adds driver to enable PCIe board which uses AMD QDMA (the Queue-based Direct Memory Access) subsystem. For example, Xilinx Alveo V70 AI Accelerator devices. https://www.xilinx.com/applications/data-center/v70.html The QDMA subsystem is used in conjunction with the PCI Express IP block to provide high performance data transfer between host memory and the card's DMA subsystem. +-------+ +-------+ +-----------+ PCIe | | | | | | Tx/Rx | | | | AXI | | <=======> | PCIE | <===> | QDMA | <====>| User Logic| | | | | | | +-------+ +-------+ +-----------+ The primary mechanism to transfer data using the QDMA is for the QDMA engine to operate on instructions (descriptors) provided by the host operating system. Using the descriptors, the QDMA can move data in both the Host to Card (H2C) direction, or the Card to Host (C2H) direction. The QDMA provides a per-queue basis option whether DMA traffic goes to an AXI4 memory map (MM) interface or to an AXI4-Stream interface. The hardware detail is provided by https://docs.xilinx.com/r/en-US/pg302-qdma Implements dmaengine APIs to support MM DMA transfers. - probe the available DMA channels - use dma_slave_map for channel lookup - use virtual channel to manage dmaengine tx descriptors - implement device_prep_slave_sg callback to handle host scatter gather list Signed-off-by: Nishad Saraf Signed-off-by: Lizhi Hou Link: https://lore.kernel.org/r/20240819211948.688786-2-lizhi.hou@amd.com Signed-off-by: Vinod Koul --- include/linux/platform_data/amd_qdma.h | 36 ++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/linux/platform_data/amd_qdma.h (limited to 'include/linux') diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h new file mode 100644 index 000000000000..576d952f97ed --- /dev/null +++ b/include/linux/platform_data/amd_qdma.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + */ + +#ifndef _PLATDATA_AMD_QDMA_H +#define _PLATDATA_AMD_QDMA_H + +#include + +/** + * struct qdma_queue_info - DMA queue information. This information is used to + * match queue when DMA channel is requested + * @dir: Channel transfer direction + */ +struct qdma_queue_info { + enum dma_transfer_direction dir; +}; + +#define QDMA_FILTER_PARAM(qinfo) ((void *)(qinfo)) + +struct dma_slave_map; + +/** + * struct qdma_platdata - Platform specific data for QDMA engine + * @max_mm_channels: Maximum number of MM DMA channels in each direction + * @device_map: DMA slave map + * @irq_index: The index of first IRQ + */ +struct qdma_platdata { + u32 max_mm_channels; + u32 irq_index; + struct dma_slave_map *device_map; +}; + +#endif /* _PLATDATA_AMD_QDMA_H */ -- cgit v1.2.3 From 4bb59323450db610b06c549fa4d5936e94fb9a36 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Sat, 10 Aug 2024 17:45:40 +0800 Subject: dmaengine: ti: k3-udma: Remove unused declarations Commit d70241913413 ("dmaengine: ti: k3-udma: Add glue layer for non DMAengine users") declared but never implemented these. Signed-off-by: Yue Haibing Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20240810094540.2589310-1-yuehaibing@huawei.com Signed-off-by: Vinod Koul --- include/linux/dma/k3-udma-glue.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma/k3-udma-glue.h b/include/linux/dma/k3-udma-glue.h index 1e491c5dcac2..2dea217629d0 100644 --- a/include/linux/dma/k3-udma-glue.h +++ b/include/linux/dma/k3-udma-glue.h @@ -136,8 +136,6 @@ u32 k3_udma_glue_rx_flow_get_fdq_id(struct k3_udma_glue_rx_channel *rx_chn, u32 k3_udma_glue_rx_get_flow_id_base(struct k3_udma_glue_rx_channel *rx_chn); int k3_udma_glue_rx_get_irq(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_num); -void k3_udma_glue_rx_put_irq(struct k3_udma_glue_rx_channel *rx_chn, - u32 flow_num); void k3_udma_glue_reset_rx_chn(struct k3_udma_glue_rx_channel *rx_chn, u32 flow_num, void *data, void (*cleanup)(void *data, dma_addr_t desc_dma), -- cgit v1.2.3 From 41845541adebc503b8574943c92670016d5e566b Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 23 Aug 2024 17:05:18 +0800 Subject: firmware: arm_scmi: Add initial support for i.MX BBM protocol i.MX95 has a battery-backed module(BBM), which has persistent storage (GPR), an RTC, and the ON/OFF button. The System Manager(SM) firmware use SCMI vendor protocol(SCMI BBM) to let agent be able to use GPR, RTC and ON/OFF button. Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Message-Id: <20240823-imx95-bbm-misc-v2-v8-2-e600ed9e9271@nxp.com> Signed-off-by: Sudeep Holla --- include/linux/scmi_imx_protocol.h | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 include/linux/scmi_imx_protocol.h (limited to 'include/linux') diff --git a/include/linux/scmi_imx_protocol.h b/include/linux/scmi_imx_protocol.h new file mode 100644 index 000000000000..2df2ea0f1809 --- /dev/null +++ b/include/linux/scmi_imx_protocol.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * SCMI Message Protocol driver NXP extension header + * + * Copyright 2024 NXP. + */ + +#ifndef _LINUX_SCMI_NXP_PROTOCOL_H +#define _LINUX_SCMI_NXP_PROTOCOL_H + +#include +#include +#include +#include + +enum scmi_nxp_protocol { + SCMI_PROTOCOL_IMX_BBM = 0x81, +}; + +struct scmi_imx_bbm_proto_ops { + int (*rtc_time_set)(const struct scmi_protocol_handle *ph, u32 id, + uint64_t sec); + int (*rtc_time_get)(const struct scmi_protocol_handle *ph, u32 id, + u64 *val); + int (*rtc_alarm_set)(const struct scmi_protocol_handle *ph, u32 id, + bool enable, u64 sec); + int (*button_get)(const struct scmi_protocol_handle *ph, u32 *state); +}; + +enum scmi_nxp_notification_events { + SCMI_EVENT_IMX_BBM_RTC = 0x0, + SCMI_EVENT_IMX_BBM_BUTTON = 0x1, +}; + +struct scmi_imx_bbm_notif_report { + bool is_rtc; + bool is_button; + ktime_t timestamp; + unsigned int rtc_id; + unsigned int rtc_evt; +}; +#endif -- cgit v1.2.3 From 61c9f03e22fc57fe61726c513b1f92c0ed1ef00f Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 23 Aug 2024 17:05:19 +0800 Subject: firmware: arm_scmi: Add initial support for i.MX MISC protocol i.MX95 System Manager(SM) firmware includes a SCMI vendor protocol, SCMI MISC protocol which includes controls that are misc settings/actions that must be exposed from the SM to agents. They are device specific and are usually define to access bit fields in various mix block control modules, IOMUX_GPR, and other General Purpose registers, Control Status Registers owned by the SM. Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Message-Id: <20240823-imx95-bbm-misc-v2-v8-3-e600ed9e9271@nxp.com> Signed-off-by: Sudeep Holla --- include/linux/scmi_imx_protocol.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scmi_imx_protocol.h b/include/linux/scmi_imx_protocol.h index 2df2ea0f1809..066216f1357a 100644 --- a/include/linux/scmi_imx_protocol.h +++ b/include/linux/scmi_imx_protocol.h @@ -15,6 +15,7 @@ enum scmi_nxp_protocol { SCMI_PROTOCOL_IMX_BBM = 0x81, + SCMI_PROTOCOL_IMX_MISC = 0x84, }; struct scmi_imx_bbm_proto_ops { @@ -30,6 +31,7 @@ struct scmi_imx_bbm_proto_ops { enum scmi_nxp_notification_events { SCMI_EVENT_IMX_BBM_RTC = 0x0, SCMI_EVENT_IMX_BBM_BUTTON = 0x1, + SCMI_EVENT_IMX_MISC_CONTROL = 0x0, }; struct scmi_imx_bbm_notif_report { @@ -39,4 +41,19 @@ struct scmi_imx_bbm_notif_report { unsigned int rtc_id; unsigned int rtc_evt; }; + +struct scmi_imx_misc_ctrl_notify_report { + ktime_t timestamp; + unsigned int ctrl_id; + unsigned int flags; +}; + +struct scmi_imx_misc_proto_ops { + int (*misc_ctrl_set)(const struct scmi_protocol_handle *ph, u32 id, + u32 num, u32 *val); + int (*misc_ctrl_get)(const struct scmi_protocol_handle *ph, u32 id, + u32 *num, u32 *val); + int (*misc_ctrl_req_notify)(const struct scmi_protocol_handle *ph, + u32 ctrl_id, u32 evt_id, u32 flags); +}; #endif -- cgit v1.2.3 From 0b4f8a68b292e7ee82107b1ce15c3aad31c864b1 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 23 Aug 2024 17:05:21 +0800 Subject: firmware: imx: Add i.MX95 MISC driver The i.MX95 System manager exports SCMI MISC protocol for linux to do various settings, such as set board gpio expander as wakeup source. The driver is to add the support. Reviewed-by: Cristian Marussi Signed-off-by: Peng Fan Message-Id: <20240823-imx95-bbm-misc-v2-v8-5-e600ed9e9271@nxp.com> Signed-off-by: Sudeep Holla --- include/linux/firmware/imx/sm.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 include/linux/firmware/imx/sm.h (limited to 'include/linux') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h new file mode 100644 index 000000000000..62a2690e2abd --- /dev/null +++ b/include/linux/firmware/imx/sm.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Copyright 2024 NXP + */ + +#ifndef _SCMI_IMX_H +#define _SCMI_IMX_H + +#include +#include +#include + +#define SCMI_IMX_CTRL_PDM_CLK_SEL 0 /* AON PDM clock sel */ +#define SCMI_IMX_CTRL_MQS1_SETTINGS 1 /* AON MQS settings */ +#define SCMI_IMX_CTRL_SAI1_MCLK 2 /* AON SAI1 MCLK */ +#define SCMI_IMX_CTRL_SAI3_MCLK 3 /* WAKE SAI3 MCLK */ +#define SCMI_IMX_CTRL_SAI4_MCLK 4 /* WAKE SAI4 MCLK */ +#define SCMI_IMX_CTRL_SAI5_MCLK 5 /* WAKE SAI5 MCLK */ + +#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_EXT) +int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); +int scmi_imx_misc_ctrl_set(u32 id, u32 val); +#else +static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val) +{ + return -EOPNOTSUPP; +} + +static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val); +{ + return -EOPNOTSUPP; +} +#endif +#endif -- cgit v1.2.3 From c42a01264ba1497eb3193c08ff3c2656d98250a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Jul 2024 06:10:24 +0200 Subject: dma-mapping: don't return errors from dma_set_min_align_mask A NULL dev->dma_parms indicates either a bus that is not DMA capable or grave bug in the implementation of the bus code. There isn't much the driver can do in terms of error handling for either case, so just warn and continue as DMA operations will fail anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Reviewed-by: Martin K. Petersen --- include/linux/dma-mapping.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index f693aafe221f..cfd6bafec3f9 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -575,13 +575,12 @@ static inline unsigned int dma_get_min_align_mask(struct device *dev) return 0; } -static inline int dma_set_min_align_mask(struct device *dev, +static inline void dma_set_min_align_mask(struct device *dev, unsigned int min_align_mask) { if (WARN_ON_ONCE(!dev->dma_parms)) - return -EIO; + return; dev->dma_parms->min_align_mask = min_align_mask; - return 0; } #ifndef dma_get_cache_alignment -- cgit v1.2.3 From 560a861ab4174b42240157ab5cebe36b8c7bc418 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Jul 2024 06:06:28 +0200 Subject: dma-mapping: don't return errors from dma_set_seg_boundary A NULL dev->dma_parms indicates either a bus that is not DMA capable or grave bug in the implementation of the bus code. There isn't much the driver can do in terms of error handling for either case, so just warn and continue as DMA operations will fail anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Reviewed-by: Martin K. Petersen --- include/linux/dma-mapping.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index cfd6bafec3f9..6bd1333dbacb 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -559,13 +559,11 @@ static inline unsigned long dma_get_seg_boundary_nr_pages(struct device *dev, return (dma_get_seg_boundary(dev) >> page_shift) + 1; } -static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) +static inline void dma_set_seg_boundary(struct device *dev, unsigned long mask) { - if (dev->dma_parms) { - dev->dma_parms->segment_boundary_mask = mask; - return 0; - } - return -EIO; + if (WARN_ON_ONCE(!dev->dma_parms)) + return; + dev->dma_parms->segment_boundary_mask = mask; } static inline unsigned int dma_get_min_align_mask(struct device *dev) -- cgit v1.2.3 From 334304ac2baca7f3e821c47cf5129d90e7a6b1e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Jul 2024 06:07:38 +0200 Subject: dma-mapping: don't return errors from dma_set_max_seg_size A NULL dev->dma_parms indicates either a bus that is not DMA capable or grave bug in the implementation of the bus code. There isn't much the driver can do in terms of error handling for either case, so just warn and continue as DMA operations will fail anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Reviewed-by: Martin K. Petersen Acked-by: Ulf Hansson # For MMC --- include/linux/dma-mapping.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 6bd1333dbacb..1524da363734 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -524,13 +524,11 @@ static inline unsigned int dma_get_max_seg_size(struct device *dev) return SZ_64K; } -static inline int dma_set_max_seg_size(struct device *dev, unsigned int size) +static inline void dma_set_max_seg_size(struct device *dev, unsigned int size) { - if (dev->dma_parms) { - dev->dma_parms->max_segment_size = size; - return 0; - } - return -EIO; + if (WARN_ON_ONCE(!dev->dma_parms)) + return; + dev->dma_parms->max_segment_size = size; } static inline unsigned long dma_get_seg_boundary(struct device *dev) -- cgit v1.2.3 From b31c9d9dc343146b9f4ce67b4eee748c49296e99 Mon Sep 17 00:00:00 2001 From: Peter Hutterer Date: Tue, 27 Aug 2024 17:19:29 +0900 Subject: HID: hidraw: add HIDIOCREVOKE ioctl There is a need for userspace applications to open HID devices directly. Use-cases include configuration of gaming mice or direct access to joystick devices. The latter is currently handled by the uaccess tag in systemd, other devices include more custom/local configurations or just sudo. A better approach is what we already have for evdev devices: give the application a file descriptor and revoke it when it may no longer access that device. This patch is the hidraw equivalent to the EVIOCREVOKE ioctl, see commit c7dc65737c9a ("Input: evdev - add EVIOCREVOKE ioctl") for full details. An MR for systemd-logind has been filed here: https://github.com/systemd/systemd/pull/33970 Signed-off-by: Peter Hutterer Link: https://patch.msgid.link/20240827-hidraw-revoke-v5-1-d004a7451aea@kernel.org Signed-off-by: Benjamin Tissoires --- include/linux/hidraw.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/hidraw.h b/include/linux/hidraw.h index cd67f4ca5599..18fd30a288de 100644 --- a/include/linux/hidraw.h +++ b/include/linux/hidraw.h @@ -32,6 +32,7 @@ struct hidraw_list { struct hidraw *hidraw; struct list_head node; struct mutex read_mutex; + bool revoked; }; #ifdef CONFIG_HIDRAW -- cgit v1.2.3 From b35243a447b9fe6457fa8e1352152b818436ba5a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Aug 2024 19:37:54 +0200 Subject: block: rework bio splitting The current setup with bio_may_exceed_limit and __bio_split_to_limits is a bit of a mess. Change it so that __bio_split_to_limits does all the work and is just a variant of bio_split_to_limits that returns nr_segs. This is done by inlining it and instead have the various bio_split_* helpers directly submit the potentially split bios. To support btrfs, the rw version has a lower level helper split out that just returns the offset to split. This turns out to nicely clean up the btrfs flow as well. Signed-off-by: Christoph Hellwig Acked-by: David Sterba Reviewed-by: Damien Le Moal Tested-by: Hans Holmberg Reviewed-by: Hans Holmberg Link: https://lore.kernel.org/r/20240826173820.1690925-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index a46e2047bea4..faceadb040f9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,8 +324,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); -struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, - unsigned *segs, struct bio_set *bs, unsigned max_bytes); +int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, unsigned max_bytes); /** * bio_next_split - get next @sectors from a bio, splitting if necessary -- cgit v1.2.3 From 379b122a3ec8033aa43cb70e8ecb6fb7f98aa68f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 26 Aug 2024 19:37:55 +0200 Subject: block: constify the lim argument to queue_limits_max_zone_append_sectors queue_limits_max_zone_append_sectors doesn't change the lim argument, so mark it as const. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Tested-by: Hans Holmberg Reviewed-by: Hans Holmberg Link: https://lore.kernel.org/r/20240826173820.1690925-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e85ec73a07d5..ec3ea5d1f99d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1187,7 +1187,8 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) +static inline unsigned int +queue_limits_max_zone_append_sectors(const struct queue_limits *l) { unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); -- cgit v1.2.3 From 2e1a57d5b0adbb5bd1d85245ec29b6d3cc7edc69 Mon Sep 17 00:00:00 2001 From: Chris Morgan Date: Wed, 21 Aug 2024 16:54:52 -0500 Subject: mfd: axp20x: Add ADC, BAT, and USB cells for AXP717 Add support for the AXP717 PMIC to utilize the ADC (for reading voltage, current, and temperature information from the PMIC) as well as the USB charger and battery. Signed-off-by: Chris Morgan Link: https://lore.kernel.org/r/20240821215456.962564-12-macroalpha82@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/axp20x.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index 8c0a33a2e9ce..9b7ef473adcb 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -115,6 +115,16 @@ enum axp20x_variants { #define AXP313A_IRQ_STATE 0x21 #define AXP717_ON_INDICATE 0x00 +#define AXP717_PMU_STATUS_2 0x01 +#define AXP717_BC_DETECT 0x05 +#define AXP717_PMU_FAULT 0x08 +#define AXP717_MODULE_EN_CONTROL_1 0x0b +#define AXP717_MIN_SYS_V_CONTROL 0x15 +#define AXP717_INPUT_VOL_LIMIT_CTRL 0x16 +#define AXP717_INPUT_CUR_LIMIT_CTRL 0x17 +#define AXP717_MODULE_EN_CONTROL_2 0x19 +#define AXP717_BOOST_CONTROL 0x1e +#define AXP717_VSYS_V_POWEROFF 0x24 #define AXP717_IRQ0_EN 0x40 #define AXP717_IRQ1_EN 0x41 #define AXP717_IRQ2_EN 0x42 @@ -125,6 +135,9 @@ enum axp20x_variants { #define AXP717_IRQ2_STATE 0x4a #define AXP717_IRQ3_STATE 0x4b #define AXP717_IRQ4_STATE 0x4c +#define AXP717_ICC_CHG_SET 0x62 +#define AXP717_ITERM_CHG_SET 0x63 +#define AXP717_CV_CHG_SET 0x64 #define AXP717_DCDC_OUTPUT_CONTROL 0x80 #define AXP717_DCDC1_CONTROL 0x83 #define AXP717_DCDC2_CONTROL 0x84 @@ -145,6 +158,19 @@ enum axp20x_variants { #define AXP717_CLDO3_CONTROL 0x9d #define AXP717_CLDO4_CONTROL 0x9e #define AXP717_CPUSLDO_CONTROL 0x9f +#define AXP717_BATT_PERCENT_DATA 0xa4 +#define AXP717_ADC_CH_EN_CONTROL 0xc0 +#define AXP717_BATT_V_H 0xc4 +#define AXP717_BATT_V_L 0xc5 +#define AXP717_VBUS_V_H 0xc6 +#define AXP717_VBUS_V_L 0xc7 +#define AXP717_VSYS_V_H 0xc8 +#define AXP717_VSYS_V_L 0xc9 +#define AXP717_BATT_CHRG_I_H 0xca +#define AXP717_BATT_CHRG_I_L 0xcb +#define AXP717_ADC_DATA_SEL 0xcd +#define AXP717_ADC_DATA_H 0xce +#define AXP717_ADC_DATA_L 0xcf #define AXP806_STARTUP_SRC 0x00 #define AXP806_CHIP_ID 0x03 -- cgit v1.2.3 From c0390d541128e8820af8177a572d9d87ff68a3bb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 21:06:58 +0200 Subject: fs: pack struct file Now that we shrunk struct file to 192 bytes aka 3 cachelines reorder struct file to not leave any holes or have members cross cachelines. Add a short comment to each of the fields and mark the cachelines. It's possible that we may have to tweak this based on profiling in the future. So far I had Jens test this comparing io_uring with non-fixed and fixed files and it improved performance. The layout is a combination of Jens' and my changes. Link: https: //lore.kernel.org/r/20240824-peinigen-hocken-7384b977c643@brauner Signed-off-by: Christian Brauner --- include/linux/fs.h | 91 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 095a956aeb29..af8bbd4eeb3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -987,52 +987,63 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) index < ra->start + ra->size); } -/* - * f_{lock,count,pos_lock} members can be highly contended and share - * the same cacheline. f_{lock,mode} are very frequently used together - * and so share the same cacheline as well. The read-mostly - * f_{path,inode,op} are kept on a separate cacheline. +/** + * struct file - Represents a file + * @f_count: reference count + * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. + * @f_mode: FMODE_* flags often used in hotpaths + * @f_op: file operations + * @f_mapping: Contents of a cacheable, mappable object. + * @private_data: filesystem or driver specific data + * @f_inode: cached inode + * @f_flags: file flags + * @f_iocb_flags: iocb flags + * @f_cred: stashed credentials of creator/opener + * @f_path: path of the file + * @f_pos_lock: lock protecting file position + * @f_pos: file position + * @f_version: file version + * @f_security: LSM security context of this file + * @f_owner: file owner + * @f_wb_err: writeback error + * @f_sb_err: per sb writeback errors + * @f_ep: link of all epoll hooks for this file + * @f_task_work: task work entry point + * @f_llist: work queue entrypoint + * @f_ra: file's readahead state */ struct file { - union { - /* fput() uses task work when closing and freeing file (default). */ - struct callback_head f_task_work; - /* fput() must use workqueue (most kernel threads). */ - struct llist_node f_llist; - /* Invalid after last fput(). */ - struct file_ra_state f_ra; - }; - /* - * Protects f_ep, f_flags. - * Must not be taken from IRQ context. - */ - spinlock_t f_lock; - fmode_t f_mode; - atomic_long_t f_count; - struct mutex f_pos_lock; - loff_t f_pos; - unsigned int f_flags; - unsigned int f_iocb_flags; - struct fown_struct *f_owner; - const struct cred *f_cred; - struct path f_path; - struct inode *f_inode; /* cached value */ + atomic_long_t f_count; + spinlock_t f_lock; + fmode_t f_mode; const struct file_operations *f_op; - - u64 f_version; + struct address_space *f_mapping; + void *private_data; + struct inode *f_inode; + unsigned int f_flags; + unsigned int f_iocb_flags; + const struct cred *f_cred; + /* --- cacheline 1 boundary (64 bytes) --- */ + struct path f_path; + struct mutex f_pos_lock; + loff_t f_pos; + u64 f_version; + /* --- cacheline 2 boundary (128 bytes) --- */ #ifdef CONFIG_SECURITY - void *f_security; + void *f_security; #endif - /* needed for tty driver, and maybe others */ - void *private_data; - + struct fown_struct *f_owner; + errseq_t f_wb_err; + errseq_t f_sb_err; #ifdef CONFIG_EPOLL - /* Used by fs/eventpoll.c to link all the hooks to this file */ - struct hlist_head *f_ep; -#endif /* #ifdef CONFIG_EPOLL */ - struct address_space *f_mapping; - errseq_t f_wb_err; - errseq_t f_sb_err; /* for syncfs */ + struct hlist_head *f_ep; +#endif + union { + struct callback_head f_task_work; + struct llist_node f_llist; + struct file_ra_state f_ra; + }; + /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ -- cgit v1.2.3 From d345bd2e9834e2da505977e154a1c179c793b7b2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:24 +0200 Subject: mm: add kmem_cache_create_rcu() When a kmem cache is created with SLAB_TYPESAFE_BY_RCU the free pointer must be located outside of the object because we don't know what part of the memory can safely be overwritten as it may be needed to prevent object recycling. That has the consequence that SLAB_TYPESAFE_BY_RCU may end up adding a new cacheline. This is the case for e.g., struct file. After having it shrunk down by 40 bytes and having it fit in three cachelines we still have SLAB_TYPESAFE_BY_RCU adding a fourth cacheline because it needs to accommodate the free pointer. Add a new kmem_cache_create_rcu() function that allows the caller to specify an offset where the free pointer is supposed to be placed. Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-2-5460bc1f09f6@kernel.org Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- include/linux/slab.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index eb2bf4629157..5b2da2cf31a8 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -212,6 +212,12 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * @@ -242,6 +248,9 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, + unsigned int freeptr_offset, + slab_flags_t flags); void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); -- cgit v1.2.3 From ea566e18b4deea6e998088de4f1a76d1f39c8d3f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:25 +0200 Subject: fs: use kmem_cache_create_rcu() Switch to the new kmem_cache_create_rcu() helper which allows us to use a custom free pointer offset avoiding the need to have an external free pointer which would grow struct file behind our backs. Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-3-5460bc1f09f6@kernel.org Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index af8bbd4eeb3a..58c91a52cad1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1011,6 +1011,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state + * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) */ struct file { atomic_long_t f_count; @@ -1042,6 +1043,7 @@ struct file { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; + freeptr_t f_freeptr; }; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout -- cgit v1.2.3 From f91f2a9879cc77db1f45f690f38f42698580416e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 28 Aug 2024 16:34:00 -0700 Subject: dmaengine: idxd: Add a new DSA device ID for Granite Rapids-D platform A new DSA device ID, 0x11fb, is introduced for the Granite Rapids-D platform. Add the device ID to the IDXD driver. Since a potential security issue has been fixed on the new device, it's secure to assign the device to virtual machines, and therefore, the new device ID will not be added to the VFIO denylist. Additionally, the new device ID may be useful in identifying and addressing any other potential issues with this specific device in the future. The same is also applied to any other new DSA/IAA devices with new device IDs. Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20240828233401.186007-2-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index e388c8b1cbc2..ff99047dac44 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2706,6 +2706,7 @@ #define PCI_DEVICE_ID_INTEL_82815_MC 0x1130 #define PCI_DEVICE_ID_INTEL_82815_CGC 0x1132 #define PCI_DEVICE_ID_INTEL_SST_TNG 0x119a +#define PCI_DEVICE_ID_INTEL_DSA_GNRD 0x11fb #define PCI_DEVICE_ID_INTEL_82092AA_0 0x1221 #define PCI_DEVICE_ID_INTEL_82437 0x122d #define PCI_DEVICE_ID_INTEL_82371FB_0 0x122e -- cgit v1.2.3 From 4fecf944c051ea852e98c062636bf5b4c7f5f8a7 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 28 Aug 2024 16:34:01 -0700 Subject: dmaengine: idxd: Add new DSA and IAA device IDs for Diamond Rapids platform A new DSA device ID, 0x1212, and a new IAA device ID, 0x1216, are introduced for Diamond Rapids platform. Add the device IDs to the IDXD driver. The name "IAA" is used in new code instead of the old name "IAX". However, the "IAX" naming (e.g., IDXD_TYPE_IAX) is retained for legacy code compatibility. Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20240828233401.186007-3-fenghua.yu@intel.com Signed-off-by: Vinod Koul --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index ff99047dac44..8139231d0e86 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2707,6 +2707,8 @@ #define PCI_DEVICE_ID_INTEL_82815_CGC 0x1132 #define PCI_DEVICE_ID_INTEL_SST_TNG 0x119a #define PCI_DEVICE_ID_INTEL_DSA_GNRD 0x11fb +#define PCI_DEVICE_ID_INTEL_DSA_DMR 0x1212 +#define PCI_DEVICE_ID_INTEL_IAA_DMR 0x1216 #define PCI_DEVICE_ID_INTEL_82092AA_0 0x1221 #define PCI_DEVICE_ID_INTEL_82437 0x122d #define PCI_DEVICE_ID_INTEL_82371FB_0 0x122e -- cgit v1.2.3 From 6f606ffd6dd7583d8194ee3d858ba4da2eff26a3 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:23 -0700 Subject: bpf: Move insn_buf[16] to bpf_verifier_env This patch moves the 'struct bpf_insn insn_buf[16]' stack usage to the bpf_verifier_env. A '#define INSN_BUF_SIZE 16' is also added to replace the ARRAY_SIZE(insn_buf) usages. Both convert_ctx_accesses() and do_misc_fixup() are changed to use the env->insn_buf. It is a refactoring work for adding the epilogue_buf[16] in a later patch. With this patch, the stack size usage decreased. Before: ./kernel/bpf/verifier.c:22133:5: warning: stack frame size (2584) After: ./kernel/bpf/verifier.c:22184:5: warning: stack frame size (2264) Reviewed-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 279b4a640644..0ad2d189c546 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -23,6 +23,8 @@ * (in the "-8,-16,...,-512" form) */ #define TMP_STR_BUF_LEN 320 +/* Patch buffer size */ +#define INSN_BUF_SIZE 16 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that @@ -780,6 +782,7 @@ struct bpf_verifier_env { * e.g., in reg_type_str() to generate reg_type string */ char tmp_str_buf[TMP_STR_BUF_LEN]; + struct bpf_insn insn_buf[INSN_BUF_SIZE]; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) -- cgit v1.2.3 From 169c31761c8d7f606f3ee628829c27998626c4f0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Aug 2024 14:08:25 -0700 Subject: bpf: Add gen_epilogue to bpf_verifier_ops This patch adds a .gen_epilogue to the bpf_verifier_ops. It is similar to the existing .gen_prologue. Instead of allowing a subsystem to run code at the beginning of a bpf prog, it allows the subsystem to run code just before the bpf prog exit. One of the use case is to allow the upcoming bpf qdisc to ensure that the skb->dev is the same as the qdisc->dev_queue->dev. The bpf qdisc struct_ops implementation could either fix it up or drop the skb. Another use case could be in bpf_tcp_ca.c to enforce snd_cwnd has sane value (e.g. non zero). The epilogue can do the useful thing (like checking skb->dev) if it can access the bpf prog's ctx. Unlike prologue, r1 may not hold the ctx pointer. This patch saves the r1 in the stack if the .gen_epilogue has returned some instructions in the "epilogue_buf". The existing .gen_prologue is done in convert_ctx_accesses(). The new .gen_epilogue is done in the convert_ctx_accesses() also. When it sees the (BPF_JMP | BPF_EXIT) instruction, it will be patched with the earlier generated "epilogue_buf". The epilogue patching is only done for the main prog. Only one epilogue will be patched to the main program. When the bpf prog has multiple BPF_EXIT instructions, a BPF_JA is used to goto the earlier patched epilogue. Majority of the archs support (BPF_JMP32 | BPF_JA): x86, arm, s390, risv64, loongarch, powerpc and arc. This patch keeps it simple and always use (BPF_JMP32 | BPF_JA). A new macro BPF_JMP32_A is added to generate the (BPF_JMP32 | BPF_JA) insn. Acked-by: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240829210833.388152-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_verifier.h | 1 + include/linux/filter.h | 10 ++++++++++ 3 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc63083f76b7..6f87fb014fba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -974,6 +974,8 @@ struct bpf_verifier_ops { struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); + int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog, + s16 ctx_stack_off); int (*gen_ld_abs)(const struct bpf_insn *orig, struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0ad2d189c546..2e20207315a9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -783,6 +783,7 @@ struct bpf_verifier_env { */ char tmp_str_buf[TMP_STR_BUF_LEN]; struct bpf_insn insn_buf[INSN_BUF_SIZE]; + struct bpf_insn epilogue_buf[INSN_BUF_SIZE]; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) diff --git a/include/linux/filter.h b/include/linux/filter.h index b6672ff61407..99b6fc83825b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -437,6 +437,16 @@ static inline bool insn_is_cast_user(const struct bpf_insn *insn) .off = OFF, \ .imm = 0 }) +/* Unconditional jumps, gotol pc + imm32 */ + +#define BPF_JMP32_A(IMM) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_JA, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + /* Relative call */ #define BPF_CALL_REL(TGT) \ -- cgit v1.2.3 From 087adb4f0f91ee330446a70af899e6a996e5cc13 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 6 Aug 2024 19:28:46 +0200 Subject: vfs: dodge smp_mb in break_lease and break_deleg in the common case These inlines show up in the fast path (e.g., in do_dentry_open()) and induce said full barrier regarding i_flctx access when in most cases the pointer is NULL. The pointer can be safely checked before issuing the barrier, dodging it in most cases as a result. It is plausible the consume fence would be sufficient, but I don't want to go audit all callers regarding what they before calling here. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240806172846.886570-1-mjguzik@gmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/filelock.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index daee999d05f3..bb44224c6676 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -420,28 +420,38 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) #ifdef CONFIG_FILE_LOCKING static inline int break_lease(struct inode *inode, unsigned int mode) { + struct file_lock_context *flctx; + /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ + flctx = READ_ONCE(inode->i_flctx); + if (!flctx) + return 0; smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } static inline int break_deleg(struct inode *inode, unsigned int mode) { + struct file_lock_context *flctx; + /* * Since this check is lockless, we must ensure that any refcounts * taken are done before checking i_flctx->flc_lease. Otherwise, we * could end up racing with tasks trying to set a new lease on this * file. */ + flctx = READ_ONCE(inode->i_flctx); + if (!flctx) + return 0; smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + if (!list_empty_careful(&flctx->flc_lease)) return __break_lease(inode, mode, FL_DELEG); return 0; } -- cgit v1.2.3 From 8447d848e1dc6d42d1dcd00f133d7715fc732c47 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 10 Aug 2024 08:47:53 +0200 Subject: vfs: only read fops once in fops_get/put In do_dentry_open() the usage is: f->f_op = fops_get(inode->i_fop); In generated asm the compiler emits 2 reads from inode->i_fop instead of just one. This popped up due to false-sharing where loads from that offset end up bouncing a cacheline during parallel open. While this is going to be fixed, the spurious load does not need to be there. This makes do_dentry_open() go down from 1177 to 1154 bytes. fops_put() is patched to maintain some consistency. No functional changes. Reviewed-by: Josef Bacik Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20240810064753.1211441-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c0286d479c9d..696c620f90e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2553,10 +2553,17 @@ struct super_block *sget(struct file_system_type *type, struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ -#define fops_get(fops) \ - (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) -#define fops_put(fops) \ - do { if (fops) module_put((fops)->owner); } while(0) +#define fops_get(fops) ({ \ + const struct file_operations *_fops = (fops); \ + (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \ +}) + +#define fops_put(fops) ({ \ + const struct file_operations *_fops = (fops); \ + if (_fops) \ + module_put((_fops)->owner); \ +}) + /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies -- cgit v1.2.3 From 641bb4394f405cba498b100b44541ffc0aed5be1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Aug 2024 12:38:56 +0200 Subject: fs: move FMODE_UNSIGNED_OFFSET to fop_flags This is another flag that is statically set and doesn't need to use up an FMODE_* bit. Move it to ->fop_flags and free up another FMODE_* bit. (1) mem_open() used from proc_mem_operations (2) adi_open() used from adi_fops (3) drm_open_helper(): (3.1) accel_open() used from DRM_ACCEL_FOPS (3.2) drm_open() used from (3.2.1) amdgpu_driver_kms_fops (3.2.2) psb_gem_fops (3.2.3) i915_driver_fops (3.2.4) nouveau_driver_fops (3.2.5) panthor_drm_driver_fops (3.2.6) radeon_driver_kms_fops (3.2.7) tegra_drm_fops (3.2.8) vmwgfx_driver_fops (3.2.9) xe_driver_fops (3.2.10) DRM_GEM_FOPS (3.2.11) DEFINE_DRM_GEM_DMA_FOPS (4) struct memdev sets fmode flags based on type of device opened. For devices using struct mem_fops unsigned offset is used. Mark all these file operations as FOP_UNSIGNED_OFFSET and add asserts into the open helper to ensure that the flag is always set. Link: https://lore.kernel.org/r/20240809-work-fop_unsigned-v1-1-658e054d893e@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 696c620f90e0..0145bda465ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -146,8 +146,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) -/* File is huge (eg. /dev/mem): treat loff_t as unsigned */ -#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)(1 << 13)) +/* FMODE_* bit 13 */ /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) @@ -2073,6 +2072,8 @@ struct file_operations { #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) +/* Treat loff_t as unsigned (e.g., /dev/mem) */ +#define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, -- cgit v1.2.3 From 1c48d441468c425e878d81c5c3e7ff8a9a594cc0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 16 Aug 2024 16:35:52 +0200 Subject: inode: remove __I_DIO_WAKEUP Afaict, we can just rely on inode->i_dio_count for waiting instead of this awkward indirection through __I_DIO_WAKEUP. This survives LTP dio and xfstests dio tests. Link: https://lore.kernel.org/r/20240816-vfs-misc-dio-v1-1-80fe21a2c710@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0145bda465ff..8f8d274929d7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2373,8 +2373,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * * I_REFERENCED Marks the inode as recently references on the LRU list. * - * I_DIO_WAKEUP Never set. Only used as a key for wait_on_bit(). - * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See @@ -2409,8 +2407,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define __I_SYNC 7 #define I_SYNC (1 << __I_SYNC) #define I_REFERENCED (1 << 8) -#define __I_DIO_WAKEUP 9 -#define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) #define I_WB_SWITCH (1 << 13) @@ -3227,7 +3223,9 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, } #endif +bool inode_dio_finished(const struct inode *inode); void inode_dio_wait(struct inode *inode); +void inode_dio_wait_interruptible(struct inode *inode); /** * inode_dio_begin - signal start of a direct I/O requests @@ -3251,7 +3249,7 @@ static inline void inode_dio_begin(struct inode *inode) static inline void inode_dio_end(struct inode *inode) { if (atomic_dec_and_test(&inode->i_dio_count)) - wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); + wake_up_var(&inode->i_dio_count); } extern void inode_set_flags(struct inode *inode, unsigned int flags, -- cgit v1.2.3 From 029c3f27fe84c5fd29d49a99cc5176433bf12209 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Aug 2024 11:28:38 +0200 Subject: fs: remove unused path_put_init() This helper has been unused for a while now. Link: https://lore.kernel.org/r/20240822-bewuchs-werktag-46672b3c0606@brauner Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/path.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/path.h b/include/linux/path.h index ca073e70decd..7ea389dc764b 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -18,12 +18,6 @@ static inline int path_equal(const struct path *path1, const struct path *path2) return path1->mnt == path2->mnt && path1->dentry == path2->dentry; } -static inline void path_put_init(struct path *path) -{ - path_put(path); - *path = (struct path) { }; -} - /* * Cleanup macro for use with __free(path_put). Avoids dereference and * copying @path unlike DEFINE_FREE(). path_put() will handle the empty -- cgit v1.2.3 From 71ff58ce3428b2471efae5b00594e892b285c97c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Aug 2024 11:30:58 +0200 Subject: fs: s/__u32/u32/ for s_fsnotify_mask The underscore variants are for uapi whereas the non-underscore variants are for in-kernel consumers. Link: https://lore.kernel.org/r/20240822-anwerben-nutzung-1cd6c82a565f@brauner Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8f8d274929d7..cda1f2545ea0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1266,7 +1266,7 @@ struct super_block { time64_t s_time_min; time64_t s_time_max; #ifdef CONFIG_FSNOTIFY - __u32 s_fsnotify_mask; + u32 s_fsnotify_mask; struct fsnotify_sb_info *s_fsnotify_info; #endif -- cgit v1.2.3 From da18ecbf0fb6fb73e6f9273e7aa15610f148ce17 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:35 +0200 Subject: fs: add i_state helpers The i_state member is an unsigned long so that it can be used with the wait bit infrastructure which expects unsigned long. This wastes 4 bytes which we're unlikely to ever use. Switch to using the var event wait mechanism using the address of the bit. Thanks to Linus for the address idea. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-1-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index cda1f2545ea0..d35de348d2ec 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -744,6 +744,21 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; +/* + * Get bit address from inode->i_state to use with wait_var_event() + * infrastructre. + */ +#define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit)) + +struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, + struct inode *inode, u32 bit); + +static inline void inode_wake_up_bit(struct inode *inode, u32 bit) +{ + /* Caller is responsible for correct memory barriers. */ + wake_up_var(inode_state_wait_address(inode, bit)); +} + struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode *node) -- cgit v1.2.3 From 2ed634c96ed1d6e4ee0497be176f9980866e81cb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:36 +0200 Subject: fs: reorder i_state bits so that we can use the first bits to derive unique addresses from i_state. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-2-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jeff Layton Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d35de348d2ec..a868c9823c10 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2410,28 +2410,32 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * i_count. * * Q: What is the difference between I_WILL_FREE and I_FREEING? + * + * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait + * upon. There's one free address left. */ -#define I_DIRTY_SYNC (1 << 0) -#define I_DIRTY_DATASYNC (1 << 1) -#define I_DIRTY_PAGES (1 << 2) -#define __I_NEW 3 +#define __I_NEW 0 #define I_NEW (1 << __I_NEW) -#define I_WILL_FREE (1 << 4) -#define I_FREEING (1 << 5) -#define I_CLEAR (1 << 6) -#define __I_SYNC 7 +#define __I_SYNC 1 #define I_SYNC (1 << __I_SYNC) -#define I_REFERENCED (1 << 8) +#define __I_LRU_ISOLATING 2 +#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) + +#define I_DIRTY_SYNC (1 << 3) +#define I_DIRTY_DATASYNC (1 << 4) +#define I_DIRTY_PAGES (1 << 5) +#define I_WILL_FREE (1 << 6) +#define I_FREEING (1 << 7) +#define I_CLEAR (1 << 8) +#define I_REFERENCED (1 << 9) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) -#define I_WB_SWITCH (1 << 13) -#define I_OVL_INUSE (1 << 14) -#define I_CREATING (1 << 15) -#define I_DONTCACHE (1 << 16) -#define I_SYNC_QUEUED (1 << 17) -#define I_PINNING_NETFS_WB (1 << 18) -#define __I_LRU_ISOLATING 19 -#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) +#define I_WB_SWITCH (1 << 12) +#define I_OVL_INUSE (1 << 13) +#define I_CREATING (1 << 14) +#define I_DONTCACHE (1 << 15) +#define I_SYNC_QUEUED (1 << 16) +#define I_PINNING_NETFS_WB (1 << 17) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -- cgit v1.2.3 From 0fe340a98b584a31b07c664dc5ff0c923730581e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:38 +0200 Subject: inode: port __I_NEW to var event Port the __I_NEW mechanism to use the new var event mechanism. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-4-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/writeback.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 56b85841ae4c..8f651bb0a1a5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -200,7 +200,8 @@ void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) { - wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); + wait_var_event(inode_state_wait_address(inode, __I_NEW), + !(READ_ONCE(inode->i_state) & I_NEW)); } #ifdef CONFIG_CGROUP_WRITEBACK -- cgit v1.2.3 From 2b111edbe0a9c441605be5cfb73001dc98ec686f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 14:47:40 +0200 Subject: inode: make i_state a u32 Now that we use the wait var event mechanism make i_state a u32 and free up 4 bytes. This means we currently have two 4 byte holes in struct inode which we can pack. Link: https://lore.kernel.org/r/20240823-work-i_state-v3-6-5cd5fd207a57@kernel.org Reviewed-by: Josef Bacik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index a868c9823c10..a1ef8c65d828 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -681,7 +681,8 @@ struct inode { #endif /* Misc */ - unsigned long i_state; + u32 i_state; + /* 32-bit hole */ struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ -- cgit v1.2.3 From 2a9f8995f7139a759fe8cb31d9e8a433757228ec Mon Sep 17 00:00:00 2001 From: Javier Carrasco Date: Thu, 4 Jul 2024 19:23:18 +0200 Subject: mfd: 88pm80x: Constify read-only regmap structs `pm800_irq`, `pm805_irq` and `pm805_irq_chip` are not modified and can be declared as const to move their data to a read-only section. In order to keep the const modifier for the regmap_irq_chip structures, the pointer used to reference them must be converted to const as well. Signed-off-by: Javier Carrasco Link: https://lore.kernel.org/r/20240704-mfd-const-regmap_config-v2-8-0c8785b1331d@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/88pm80x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mfd/88pm80x.h b/include/linux/mfd/88pm80x.h index def5df6e74bf..551ef1c367d6 100644 --- a/include/linux/mfd/88pm80x.h +++ b/include/linux/mfd/88pm80x.h @@ -294,7 +294,7 @@ struct pm80x_chip { struct i2c_client *client; struct i2c_client *companion; struct regmap *regmap; - struct regmap_irq_chip *regmap_irq_chip; + const struct regmap_irq_chip *regmap_irq_chip; struct regmap_irq_chip_data *irq_data; int type; int irq; -- cgit v1.2.3 From 87b9c9ea01f4fd9193f30cc19bd03b05541ff4cb Mon Sep 17 00:00:00 2001 From: Wilken Gottwalt Date: Mon, 15 Jul 2024 08:17:28 +0000 Subject: mfd: ds1wm: Remove remaining header file The driver was removed after kernel 6.2 rendering the header file unused. Signed-off-by: Wilken Gottwalt Link: https://lore.kernel.org/r/ZpTbGHb6EX2Oe7ok@monster.localdomain Signed-off-by: Lee Jones --- include/linux/mfd/ds1wm.h | 29 ----------------------------- 1 file changed, 29 deletions(-) delete mode 100644 include/linux/mfd/ds1wm.h (limited to 'include/linux') diff --git a/include/linux/mfd/ds1wm.h b/include/linux/mfd/ds1wm.h deleted file mode 100644 index 43dfca1c9702..000000000000 --- a/include/linux/mfd/ds1wm.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* MFD cell driver data for the DS1WM driver - * - * to be defined in the MFD device that is - * using this driver for one of his sub devices - */ - -struct ds1wm_driver_data { - int active_high; - int clock_rate; - /* in milliseconds, the amount of time to - * sleep following a reset pulse. Zero - * should work if your bus devices recover - * time respects the 1-wire spec since the - * ds1wm implements the precise timings of - * a reset pulse/presence detect sequence. - */ - unsigned int reset_recover_delay; - - /* Say 1 here for big endian Hardware - * (only relevant with bus-shift > 0 - */ - bool is_hw_big_endian; - - /* left shift of register number to get register address offsett. - * Only 0,1,2 allowed for 8,16 or 32 bit bus width respectively - */ - unsigned int bus_shift; -}; -- cgit v1.2.3 From 5e9629d0ae977d6f6916d7e519724804e95f0b07 Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 27 Aug 2024 15:51:12 +0100 Subject: drivers/perf: arm_spe: Use perf_allow_kernel() for permissions Use perf_allow_kernel() for 'pa_enable' (physical addresses), 'pct_enable' (physical timestamps) and context IDs. This means that perf_event_paranoid is now taken into account and LSM hooks can be used, which is more consistent with other perf_event_open calls. For example PERF_SAMPLE_PHYS_ADDR uses perf_allow_kernel() rather than just perfmon_capable(). This also indirectly fixes the following error message which is misleading because perf_event_paranoid is not taken into account by perfmon_capable(): $ perf record -e arm_spe/pa_enable/ Error: Access to performance monitoring and observability operations is limited. Consider adjusting /proc/sys/kernel/perf_event_paranoid setting ... Suggested-by: Al Grant Signed-off-by: James Clark Link: https://lore.kernel.org/r/20240827145113.1224604-1-james.clark@linaro.org Link: https://lore.kernel.org/all/20240807120039.GD37996@noisy.programming.kicks-ass.net/ Signed-off-by: Will Deacon --- include/linux/perf_event.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1a8942277dda..e336306b8c08 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1602,13 +1602,7 @@ static inline int perf_is_paranoid(void) return sysctl_perf_event_paranoid > -1; } -static inline int perf_allow_kernel(struct perf_event_attr *attr) -{ - if (sysctl_perf_event_paranoid > 1 && !perfmon_capable()) - return -EACCES; - - return security_perf_event_open(attr, PERF_SECURITY_KERNEL); -} +int perf_allow_kernel(struct perf_event_attr *attr); static inline int perf_allow_cpu(struct perf_event_attr *attr) { -- cgit v1.2.3 From 6c17c7d5936e6af6a5bda9f9de98a5e2ee6e8a6f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 7 Aug 2024 15:19:20 -0300 Subject: iommu: Allow ATS to work on VFs when the PF uses IDENTITY PCI ATS has a global Smallest Translation Unit field that is located in the PF but shared by all of the VFs. The expectation is that the STU will be set to the root port's global STU capability which is driven by the IO page table configuration of the iommu HW. Today it becomes set when the iommu driver first enables ATS. Thus, to enable ATS on the VF, the PF must have already had the correct STU programmed, even if ATS is off on the PF. Unfortunately the PF only programs the STU when the PF enables ATS. The iommu drivers tend to leave ATS disabled when IDENTITY translation is being used. Thus we can get into a state where the PF is setup to use IDENTITY with the DMA API while the VF would like to use VFIO with a PAGING domain and have ATS turned on. This fails because the PF never loaded a PAGING domain and so it never setup the STU, and the VF can't do it. The simplest solution is to have the iommu driver set the ATS STU when it probes the device. This way the ATS STU is loaded immediately at boot time to all PFs and there is no issue when a VF comes to use it. Add a new call pci_prepare_ats() which should be called by iommu drivers in their probe_device() op for every PCI device if the iommu driver supports ATS. This will setup the STU based on whatever page size capability the iommu HW has. Signed-off-by: Jason Gunthorpe Acked-by: Bjorn Helgaas Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/0-v1-0fb4d2ab6770+7e706-ats_vf_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/pci-ats.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index df54cd5b15db..0e8b74e63767 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -8,6 +8,7 @@ /* Address Translation Service */ bool pci_ats_supported(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); +int pci_prepare_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); int pci_ats_page_aligned(struct pci_dev *dev); @@ -16,6 +17,8 @@ static inline bool pci_ats_supported(struct pci_dev *d) { return false; } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } +static inline int pci_prepare_ats(struct pci_dev *dev, int ps) +{ return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; } -- cgit v1.2.3 From 5c40e050e6ac0218af7c520095729d440cc87e6b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 29 Aug 2024 15:06:40 +0200 Subject: fs: drop GFP_NOFAIL mode from alloc_page_buffers There is only one called of alloc_page_buffers and it doesn't require __GFP_NOFAIL so drop this allocation mode. Signed-off-by: Michal Hocko Link: https://lore.kernel.org/r/20240829130640.1397970-1-mhocko@kernel.org Acked-by: Song Liu Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/buffer_head.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 14acf1bbe0ce..7e903457967a 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -199,8 +199,7 @@ void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, gfp_t gfp); -struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, - bool retry); +struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size); struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From a06c3fad49a50d5d5eb078f93e70f4d3eca5d5a5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:45 +0100 Subject: drivers/virt: pkvm: Add initial support for running as a protected guest Implement a pKVM protected guest driver to probe the presence of pKVM and determine the memory protection granule using the HYP_MEMINFO hypercall. Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-3-will@kernel.org Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 083f85653716..16b6dcc54e02 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -115,6 +115,7 @@ /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 #define ARM_SMCCC_KVM_FUNC_PTP 1 +#define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -137,6 +138,12 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_PTP) +#define ARM_SMCCC_VENDOR_HYP_KVM_HYP_MEMINFO_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_HYP_MEMINFO) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 -- cgit v1.2.3 From ebc59b120c588156feb7ce194a9636584ced18ba Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:47 +0100 Subject: drivers/virt: pkvm: Hook up mem_encrypt API using pKVM hypercalls If we detect the presence of pKVM's SHARE and UNSHARE hypercalls, then register a backend implementation of the mem_encrypt API so that things like DMA buffers can be shared appropriately with the host. Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-5-will@kernel.org Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 16b6dcc54e02..9cb7c95920b0 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -116,6 +116,8 @@ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 #define ARM_SMCCC_KVM_FUNC_PTP 1 #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 +#define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 +#define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -144,6 +146,18 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_HYP_MEMINFO) +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_SHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_SHARE) + +#define ARM_SMCCC_VENDOR_HYP_KVM_MEM_UNSHARE_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MEM_UNSHARE) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 -- cgit v1.2.3 From 0f12694958001c96bda811473fdb23f333c6d3ca Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:49 +0100 Subject: drivers/virt: pkvm: Intercept ioremap using pKVM MMIO_GUARD hypercall Hook up pKVM's MMIO_GUARD hypercall so that ioremap() and friends will register the target physical address as MMIO with the hypervisor, allowing guest exits to that page to be emulated by the host with full syndrome information. Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-7-will@kernel.org Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 9cb7c95920b0..e93c1f7cea70 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -118,6 +118,7 @@ #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 #define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 #define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 +#define ARM_SMCCC_KVM_FUNC_MMIO_GUARD 7 #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 @@ -158,6 +159,12 @@ ARM_SMCCC_OWNER_VENDOR_HYP, \ ARM_SMCCC_KVM_FUNC_MEM_UNSHARE) +#define ARM_SMCCC_VENDOR_HYP_KVM_MMIO_GUARD_FUNC_ID \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_VENDOR_HYP, \ + ARM_SMCCC_KVM_FUNC_MMIO_GUARD) + /* ptp_kvm counter type ID */ #define KVM_PTP_VIRT_COUNTER 0 #define KVM_PTP_PHYS_COUNTER 1 -- cgit v1.2.3 From 21be9f7110d4c044c2b49bafbd7246335f236221 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Aug 2024 14:01:50 +0100 Subject: arm64: smccc: Reserve block of KVM "vendor" services for pKVM hypercalls pKVM relies on hypercalls to expose services such as memory sharing to protected guests. Tentatively allocate a block of 58 hypercalls (i.e. fill the remaining space in the first 64 function IDs) for pKVM usage, as future extensions such as pvIOMMU support, range-based memory sharing and validation of assigned devices will require additional services. Suggested-by: Marc Zyngier Link: https://lore.kernel.org/r/86a5h5yg5y.wl-maz@kernel.org Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20240830130150.8568-8-will@kernel.org Signed-off-by: Will Deacon --- include/linux/arm-smccc.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'include/linux') diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index e93c1f7cea70..f59099a213d0 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -115,10 +115,70 @@ /* KVM "vendor specific" services */ #define ARM_SMCCC_KVM_FUNC_FEATURES 0 #define ARM_SMCCC_KVM_FUNC_PTP 1 +/* Start of pKVM hypercall range */ #define ARM_SMCCC_KVM_FUNC_HYP_MEMINFO 2 #define ARM_SMCCC_KVM_FUNC_MEM_SHARE 3 #define ARM_SMCCC_KVM_FUNC_MEM_UNSHARE 4 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_5 5 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_6 6 #define ARM_SMCCC_KVM_FUNC_MMIO_GUARD 7 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_8 8 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_9 9 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_10 10 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_11 11 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_12 12 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_13 13 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_14 14 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_15 15 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_16 16 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_17 17 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_18 18 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_19 19 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_20 20 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_21 21 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_22 22 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_23 23 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_24 24 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_25 25 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_26 26 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_27 27 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_28 28 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_29 29 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_30 30 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_31 31 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_32 32 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_33 33 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_34 34 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_35 35 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_36 36 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_37 37 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_38 38 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_39 39 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_40 40 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_41 41 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_42 42 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_43 43 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_44 44 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_45 45 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_46 46 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_47 47 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_48 48 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_49 49 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_50 50 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_51 51 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_52 52 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_53 53 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_54 54 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_55 55 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_56 56 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_57 57 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_58 58 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_59 59 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_60 60 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_61 61 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_62 62 +#define ARM_SMCCC_KVM_FUNC_PKVM_RESV_63 63 +/* End of pKVM hypercall range */ #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 #define ARM_SMCCC_KVM_NUM_FUNCS 128 -- cgit v1.2.3 From 929c81ade6355b87097a2a4886c10750e68626cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 30 Aug 2024 11:45:57 +0200 Subject: fbdev: Introduce devm_register_framebuffer() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a device-managed variant of register_framebuffer() which automatically unregisters the framebuffer on device destruction. This can simplify the error handling and resource management in drivers. Signed-off-by: Thomas Weißschuh Signed-off-by: Helge Deller --- include/linux/fb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fb.h b/include/linux/fb.h index db7d97b10964..abf6643ebcaf 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -601,6 +601,7 @@ extern ssize_t fb_sys_write(struct fb_info *info, const char __user *buf, /* fbmem.c */ extern int register_framebuffer(struct fb_info *fb_info); extern void unregister_framebuffer(struct fb_info *fb_info); +extern int devm_register_framebuffer(struct device *dev, struct fb_info *fb_info); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); extern void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, u32 height, u32 shift_high, u32 shift_low, u32 mod); -- cgit v1.2.3 From 9aee8262445d185960431e972e2d997e6aba3de0 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 26 Aug 2024 11:58:23 +0800 Subject: ARM: OMAP2+: Remove obsoleted declaration for gpmc_onenand_init The gpmc_onenand_init() have been removed since commit 2514830b8b8c ("ARM: OMAP2+: Remove gpmc-onenand"), and now it is useless, so remove it. Signed-off-by: Gaosheng Cui Link: https://lore.kernel.org/r/20240826035823.4043171-1-cuigaosheng1@huawei.com Signed-off-by: Kevin Hilman --- include/linux/omap-gpmc.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/omap-gpmc.h b/include/linux/omap-gpmc.h index 082841908fe7..c9e3843d2dd5 100644 --- a/include/linux/omap-gpmc.h +++ b/include/linux/omap-gpmc.h @@ -84,13 +84,3 @@ extern void gpmc_read_settings_dt(struct device_node *np, struct gpmc_timings; struct omap_nand_platform_data; struct omap_onenand_platform_data; - -#if IS_ENABLED(CONFIG_MTD_ONENAND_OMAP2) -extern int gpmc_onenand_init(struct omap_onenand_platform_data *d); -#else -#define board_onenand_data NULL -static inline int gpmc_onenand_init(struct omap_onenand_platform_data *d) -{ - return 0; -} -#endif -- cgit v1.2.3 From ddc94d0b17e8ea8179ecbbefacac3fba0fb77265 Mon Sep 17 00:00:00 2001 From: Matthew Brost Date: Mon, 26 Aug 2024 10:01:43 -0700 Subject: dma-buf: Split out dma fence array create into alloc and arm functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Useful to preallocate dma fence array and then arm in path of reclaim or a dma fence. v2: - s/arm/init (Christian) - Drop !array warn (Christian) v3: - Fix kernel doc typos (dim) Cc: Sumit Semwal Cc: Christian König Signed-off-by: Matthew Brost Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20240826170144.2492062-2-matthew.brost@intel.com --- include/linux/dma-fence-array.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h index 29c5650c1038..079b3dec0a16 100644 --- a/include/linux/dma-fence-array.h +++ b/include/linux/dma-fence-array.h @@ -79,6 +79,12 @@ to_dma_fence_array(struct dma_fence *fence) for (index = 0, fence = dma_fence_array_first(head); fence; \ ++(index), fence = dma_fence_array_next(head, index)) +struct dma_fence_array *dma_fence_array_alloc(int num_fences); +void dma_fence_array_init(struct dma_fence_array *array, + int num_fences, struct dma_fence **fences, + u64 context, unsigned seqno, + bool signal_on_any); + struct dma_fence_array *dma_fence_array_create(int num_fences, struct dma_fence **fences, u64 context, unsigned seqno, -- cgit v1.2.3 From 1abab1ba0775036bb67c6c57945c637be644c04f Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 30 Aug 2024 10:02:28 +0000 Subject: cgroup/cpuset: guard cpuset-v1 code under CONFIG_CPUSETS_V1 This patch introduces CONFIG_CPUSETS_V1 and guard cpuset-v1 code under CONFIG_CPUSETS_V1. The default value of CONFIG_CPUSETS_V1 is N, so that user who adopted v2 don't have 'pay' for cpuset v1. Signed-off-by: Chen Ridong Acked-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2a6981eeebf8..835e7b793f6a 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -99,6 +99,7 @@ static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2); +#ifdef CONFIG_CPUSETS_V1 #define cpuset_memory_pressure_bump() \ do { \ if (cpuset_memory_pressure_enabled) \ @@ -106,6 +107,9 @@ extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, } while (0) extern int cpuset_memory_pressure_enabled; extern void __cpuset_memory_pressure_bump(void); +#else +static inline void cpuset_memory_pressure_bump(void) { } +#endif extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); -- cgit v1.2.3 From 0328947c50324cf4b2d8b181bf948edb8101f59f Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 28 Aug 2024 21:16:15 +0530 Subject: PCI: endpoint: Assign PCI domain number for endpoint controllers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Right now, PCI endpoint subsystem doesn't assign PCI domain number for the PCI endpoint controllers. But this domain number could be useful to the EPC drivers to uniquely identify each controller based on the hardware instance when there are multiple ones present in an SoC (even multiple RC/EP). So let's make use of the existing pci_bus_find_domain_nr() API to allocate domain numbers based on either devicetree (linux,pci-domain) property or dynamic domain number allocation scheme. It should be noted that the domain number allocated by this API will be based on both RC and EP controllers in a SoC. If the 'linux,pci-domain' DT property is present, then the domain number represents the actual hardware instance of the PCI endpoint controller. If not, then the domain number will be allocated based on the PCI EP/RC controller probe order. If the architecture doesn't support CONFIG_PCI_DOMAINS_GENERIC (rare), then currently a warning is thrown to indicate that the architecture specific implementation is needed. Link: https://lore.kernel.org/linux-pci/20240828-pci-qcom-hotplug-v4-5-263a385fbbcb@linaro.org Signed-off-by: Manivannan Sadhasivam Signed-off-by: Krzysztof Wilczyński Reviewed-by: Frank Li --- include/linux/pci-epc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 85bdf2adb760..8e3dcac55dcd 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -128,6 +128,7 @@ struct pci_epc_mem { * @group: configfs group representing the PCI EPC device * @lock: mutex to protect pci_epc ops * @function_num_map: bitmap to manage physical function number + * @domain_nr: PCI domain number of the endpoint controller * @init_complete: flag to indicate whether the EPC initialization is complete * or not */ @@ -145,6 +146,7 @@ struct pci_epc { /* mutex to protect against concurrent access of EP controller */ struct mutex lock; unsigned long function_num_map; + int domain_nr; bool init_complete; }; -- cgit v1.2.3 From 4ed9ef32606386efc562bada891d7baa16fc46b4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Jul 2024 17:14:14 +1000 Subject: lockd: discard nlmsvc_timeout nlmsvc_timeout always has the same value as (nlm_timeout * HZ), so use that in the one place that nlmsvc_timeout is used. In truth it *might* not always be the same as nlmsvc_timeout is only set when lockd is started while nlm_timeout can be set at anytime via sysctl. I think this difference it not helpful so removing it is good. Also remove the test for nlm_timout being 0. This is not possible - unless a module parameter is used to set the minimum timeout to 0, and if that happens then it probably should be honoured. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 1b95fe31051f..61c4b9c41904 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -200,7 +200,7 @@ extern const struct svc_procedure nlmsvc_procedures[24]; extern const struct svc_procedure nlmsvc_procedures4[24]; #endif extern int nlmsvc_grace_period; -extern unsigned long nlmsvc_timeout; +extern unsigned long nlm_timeout; extern bool nsm_use_hostnames; extern u32 nsm_local_state; -- cgit v1.2.3 From f2b27e1d72527f94f030b6356b3187576e60885b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Jul 2024 17:14:15 +1000 Subject: SUNRPC: make various functions static, or not exported. Various functions are only used within the sunrpc module, and several are only use in the one file. So clean up: These are marked static, and any EXPORT is removed. svc_rcpb_setup() svc_rqst_alloc() svc_rqst_free() - also moved before first use svc_rpcbind_set_version() svc_drop() - also moved to svc.c These are now not EXPORTed, but are not static. svc_authenticate() svc_sock_update_bufs() Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 9 --------- include/linux/sunrpc/svcauth.h | 1 - include/linux/sunrpc/svcsock.h | 2 -- 3 files changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index a7d0406b9ef5..e4fa25fafa97 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -401,17 +401,13 @@ struct svc_procedure { */ int sunrpc_set_pool_mode(const char *val); int sunrpc_get_pool_mode(char *val, size_t size); -int svc_rpcb_setup(struct svc_serv *serv, struct net *net); void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net); int svc_bind(struct svc_serv *serv, struct net *net); struct svc_serv *svc_create(struct svc_program *, unsigned int, int (*threadfn)(void *data)); -struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv, - struct svc_pool *pool, int node); bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page); void svc_rqst_release_pages(struct svc_rqst *rqstp); -void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *prog, struct svc_stat *stats, @@ -446,11 +442,6 @@ int svc_generic_rpcbind_set(struct net *net, u32 version, int family, unsigned short proto, unsigned short port); -int svc_rpcbind_set_version(struct net *net, - const struct svc_program *progp, - u32 version, int family, - unsigned short proto, - unsigned short port); #define RPC_MAX_ADDRBUFLEN (63U) diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index 61c455f1e1f5..63cf6fb26dcc 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -151,7 +151,6 @@ struct auth_ops { struct svc_xprt; -extern enum svc_auth_status svc_authenticate(struct svc_rqst *rqstp); extern rpc_authflavor_t svc_auth_flavor(struct svc_rqst *rqstp); extern int svc_authorise(struct svc_rqst *rqstp); extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 7c78ec6356b9..bf45d9e8492a 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -58,8 +58,6 @@ static inline u32 svc_sock_final_rec(struct svc_sock *svsk) */ void svc_recv(struct svc_rqst *rqstp); void svc_send(struct svc_rqst *rqstp); -void svc_drop(struct svc_rqst *); -void svc_sock_update_bufs(struct svc_serv *serv); int svc_addsock(struct svc_serv *serv, struct net *net, const int fd, char *name_return, const size_t len, const struct cred *cred); -- cgit v1.2.3 From 5fe690a5946442f7f2d08448341dd621367f80bc Mon Sep 17 00:00:00 2001 From: Matthew Cassell Date: Mon, 22 Jul 2024 17:13:16 +0000 Subject: mm: add node_reclaim successes to VM event counters /proc/vmstat currently shows the number of node_reclaim() failures when vm.zone_reclaim_mode is set appropriately. It would be convenient to have the number of successes right next to zone_reclaim_failed (similar to compaction and migration). While just a trivially addition to the vmstat file. It was helpful during benchmarking to not have to probe node_reclaim() to observe the success/failure ratio. Link: https://lkml.kernel.org/r/20240722171316.7517-1-mcassell411@gmail.com Signed-off-by: Matthew Cassell Cc: Domenico Cerasuolo Cc: "Huang, Ying" Cc: Li Zhijian Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 747943bc8cc2..6fff8ed3b1fd 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -50,6 +50,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSTEAL_ANON, PGSTEAL_FILE, #ifdef CONFIG_NUMA + PGSCAN_ZONE_RECLAIM_SUCCESS, PGSCAN_ZONE_RECLAIM_FAILED, #endif PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, -- cgit v1.2.3 From 3ddc2fefe6f34ec870fcb22f4cd656e53db079f2 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 22 Jul 2024 18:29:23 +0200 Subject: mm: vmalloc: implement vrealloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Align kvrealloc() with krealloc()", v2. Besides the obvious (and desired) difference between krealloc() and kvrealloc(), there is some inconsistency in their function signatures and behavior: - krealloc() frees the memory when the requested size is zero, whereas kvrealloc() simply returns a pointer to the existing allocation. - krealloc() behaves like kmalloc() if a NULL pointer is passed, whereas kvrealloc() does not accept a NULL pointer at all and, if passed, would fault instead. - krealloc() is self-contained, whereas kvrealloc() relies on the caller to provide the size of the previous allocation. Inconsistent behavior throughout allocation APIs is error prone, hence make kvrealloc() behave like krealloc(), which seems superior in all mentioned aspects. In order to be able to get rid of kvrealloc()'s oldsize parameter, introduce vrealloc() and make use of it in kvrealloc(). Making use of vrealloc() in kvrealloc() also provides oppertunities to grow (and shrink) allocations more efficiently. For instance, vrealloc() can be optimized to allocate and map additional pages to grow the allocation or unmap and free unused pages to shrink the allocation. Besides the above, those functions are required by Rust's allocator abstractons [1] (rework based on this series in [2]). With `Vec` or `KVec` respectively, potentially growing (and shrinking) data structures are rather common. [1] https://lore.kernel.org/lkml/20240704170738.3621-1-dakr@redhat.com/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/dakr/linux.git/log/?h=rust/mm This patch (of 2): Implement vrealloc() analogous to krealloc(). Currently, krealloc() requires the caller to pass the size of the previous memory allocation, which, instead, should be self-contained. We attempt to fix this in a subsequent patch which, in order to do so, requires vrealloc(). Besides that, we need realloc() functions for kernel allocators in Rust too. With `Vec` or `KVec` respectively, potentially growing (and shrinking) data structures are rather common. [dakr@kernel.org: fix missing nommu implementation] Link: https://lkml.kernel.org/r/20240725141227.13954-1-dakr@kernel.org [dakr@kernel.org: document concurrency restrictions] Link: https://lkml.kernel.org/r/20240725125442.4957-1-dakr@kernel.org [dakr@kernel.org: consider spare memory for __GFP_ZERO] Link: https://lkml.kernel.org/r/20240730185049.6244-3-dakr@kernel.org [dakr@kernel.org: properly document __GFP_ZERO behavior] Link: https://lkml.kernel.org/r/20240730185049.6244-4-dakr@kernel.org Link: https://lkml.kernel.org/r/20240722163111.4766-1-dakr@kernel.org Link: https://lkml.kernel.org/r/20240722163111.4766-2-dakr@kernel.org Signed-off-by: Danilo Krummrich Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Chandan Babu R Cc: Christian König Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Marc Zyngier Cc: Michael Ellerman Cc: Miguel Ojeda Cc: Oliver Upton Cc: Pekka Enberg Cc: Roman Gushchin Cc: Uladzislau Rezki Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index e4a631ec430b..ad2ce7a6ab7a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -189,6 +189,10 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1 extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) +void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags) + __realloc_size(2); +#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) + extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); -- cgit v1.2.3 From 590b9d576caec6b4c46bba49ed36223a399c3fc5 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 22 Jul 2024 18:29:24 +0200 Subject: mm: kvmalloc: align kvrealloc() with krealloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Besides the obvious (and desired) difference between krealloc() and kvrealloc(), there is some inconsistency in their function signatures and behavior: - krealloc() frees the memory when the requested size is zero, whereas kvrealloc() simply returns a pointer to the existing allocation. - krealloc() behaves like kmalloc() if a NULL pointer is passed, whereas kvrealloc() does not accept a NULL pointer at all and, if passed, would fault instead. - krealloc() is self-contained, whereas kvrealloc() relies on the caller to provide the size of the previous allocation. Inconsistent behavior throughout allocation APIs is error prone, hence make kvrealloc() behave like krealloc(), which seems superior in all mentioned aspects. Besides that, implementing kvrealloc() by making use of krealloc() and vrealloc() provides oppertunities to grow (and shrink) allocations more efficiently. For instance, vrealloc() can be optimized to allocate and map additional pages to grow the allocation or unmap and free unused pages to shrink the allocation. [dakr@kernel.org: document concurrency restrictions] Link: https://lkml.kernel.org/r/20240725125442.4957-1-dakr@kernel.org [dakr@kernel.org: disable KASAN when switching to vmalloc] Link: https://lkml.kernel.org/r/20240730185049.6244-2-dakr@kernel.org [dakr@kernel.org: properly document __GFP_ZERO behavior] Link: https://lkml.kernel.org/r/20240730185049.6244-5-dakr@kernel.org Link: https://lkml.kernel.org/r/20240722163111.4766-3-dakr@kernel.org Signed-off-by: Danilo Krummrich Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Chandan Babu R Cc: Christian König Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Marc Zyngier Cc: Michael Ellerman Cc: Miguel Ojeda Cc: Oliver Upton Cc: Pekka Enberg Cc: Roman Gushchin Cc: Uladzislau Rezki Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- include/linux/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index eb2bf4629157..c9cb42203183 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -841,8 +841,8 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) #define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) -extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __realloc_size(3); +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) + __realloc_size(2); #define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) extern void kvfree(const void *addr); -- cgit v1.2.3 From d58a2a581f132529eefac5377676011562b631b8 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 22 Jul 2024 13:43:18 +0800 Subject: mm: shmem: rename shmem_is_huge() to shmem_huge_global_enabled() shmem_is_huge() is now used to check if the top-level huge page is enabled, thus rename it to reflect its usage. Link: https://lkml.kernel.org/r/da53296e0ab6359aa083561d9dc01e4223d60fbe.1721626645.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Barry Song <21cnbao@gmail.com> Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 1d06b1e5408a..405ee8d3589a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -111,14 +111,15 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags); +extern bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, bool shmem_huge_force, + struct mm_struct *mm, unsigned long vm_flags); unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, bool global_huge); #else -static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags) +static __always_inline bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + bool shmem_huge_force, struct mm_struct *mm, + unsigned long vm_flags) { return false; } -- cgit v1.2.3 From 6beeab870e70b2d4f49baf6c6be9da1b61c169f8 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 22 Jul 2024 13:43:19 +0800 Subject: mm: shmem: move shmem_huge_global_enabled() into shmem_allowable_huge_orders() Move shmem_huge_global_enabled() into shmem_allowable_huge_orders(), so that shmem_allowable_huge_orders() can also help to find the allowable huge orders for tmpfs. Moreover the shmem_huge_global_enabled() can become static. While we are at it, passing the vma instead of mm for shmem_huge_global_enabled() makes code cleaner. No functional changes. Link: https://lkml.kernel.org/r/8e825146bb29ee1a1c7bd64d2968ff3e19be7815.1721626645.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Barry Song <21cnbao@gmail.com> Cc: Hugh Dickins Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 405ee8d3589a..1564d7d3ca61 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -111,21 +111,13 @@ extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, bool shmem_huge_force, - struct mm_struct *mm, unsigned long vm_flags); unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge); + bool shmem_huge_force); #else -static __always_inline bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - bool shmem_huge_force, struct mm_struct *mm, - unsigned long vm_flags) -{ - return false; -} static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool global_huge) + bool shmem_huge_force) { return 0; } -- cgit v1.2.3 From 4fd568faf6e7e21f206cd66dd4fca9a72e7d922c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 18 Jul 2024 17:18:21 +0800 Subject: mm: kmem: remove mem_cgroup_from_obj() There is no user of mem_cgroup_from_obj(), remove it. Link: https://lkml.kernel.org/r/20240718091821.44740-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0e5bf25d324f..af7da7bd00af 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1717,7 +1717,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -struct mem_cgroup *mem_cgroup_from_obj(void *p); struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_event(struct obj_cgroup *objcg, @@ -1780,11 +1779,6 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) return -1; } -static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) -{ - return NULL; -} - static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) { return NULL; -- cgit v1.2.3 From 2a28713a67fd28eedc13b32300df6b9c5a2381f5 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 24 Jul 2024 09:01:14 -0400 Subject: memory tiering: introduce folio_use_access_time() check If memory tiering mode is on and a folio is not in the top tier memory, folio's cpupid field is repurposed to store page access time. Instead of an open coded check, use a function to encapsulate the check. Link: https://lkml.kernel.org/r/20240724130115.793641-3-ziy@nvidia.com Signed-off-by: Zi Yan Reviewed-by: "Huang, Ying" Acked-by: David Hildenbrand Reviewed-by: Kefeng Wang Cc: Baolin Wang Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6549d0979b28..6c2078ca3391 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1745,6 +1745,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } + +bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { @@ -1798,6 +1800,10 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } +static inline bool folio_use_access_time(struct folio *folio) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) -- cgit v1.2.3 From c4a6fce85640beb4f79e8217272d1ef79839cc17 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Wed, 24 Jul 2024 20:33:21 +0000 Subject: vmstat: kernel stack usage histogram As part of the dynamic kernel stack project, we need to know the amount of data that can be saved by reducing the default kernel stack size [1]. Provide a kernel stack usage histogram to aid in optimizing kernel stack sizes and minimizing memory waste in large-scale environments. The histogram divides stack usage into power-of-two buckets and reports the results in /proc/vmstat. This information is especially valuable in environments with millions of machines, where even small optimizations can have a significant impact. The histogram data is presented in /proc/vmstat with entries like "kstack_1k", "kstack_2k", and so on, indicating the number of threads that exited with stack usage falling within each respective bucket. Example outputs: Intel: $ grep kstack /proc/vmstat kstack_1k 3 kstack_2k 188 kstack_4k 11391 kstack_8k 243 kstack_16k 0 ARM with 64K page_size: $ grep kstack /proc/vmstat kstack_1k 1 kstack_2k 340 kstack_4k 25212 kstack_8k 1659 kstack_16k 0 kstack_32k 0 kstack_64k 0 Note: once the dynamic kernel stack is implemented it will depend on the implementation the usability of this feature: On hardware that supports faults on kernel stacks, we will have other metrics that show the total number of pages allocated for stacks. On hardware where faults are not supported, we will most likely have some optimization where only some threads are extended, and for those, these metrics will still be very useful. [1] https://lwn.net/Articles/974367 Link: https://lkml.kernel.org/r/20240730150158.832783-3-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240724203322.2765486-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Kent Overstreet Acked-by: Shakeel Butt Cc: Domenico Cerasuolo Cc: Li Zhijian Cc: Matthew Wilcox Cc: Nhat Pham Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 6fff8ed3b1fd..aae5c7c5cfb4 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -155,6 +155,30 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, VMA_LOCK_RETRY, VMA_LOCK_MISS, #endif +#ifdef CONFIG_DEBUG_STACK_USAGE + KSTACK_1K, +#if THREAD_SIZE > 1024 + KSTACK_2K, +#endif +#if THREAD_SIZE > 2048 + KSTACK_4K, +#endif +#if THREAD_SIZE > 4096 + KSTACK_8K, +#endif +#if THREAD_SIZE > 8192 + KSTACK_16K, +#endif +#if THREAD_SIZE > 16384 + KSTACK_32K, +#endif +#if THREAD_SIZE > 32768 + KSTACK_64K, +#endif +#if THREAD_SIZE > 65536 + KSTACK_REST, +#endif +#endif /* CONFIG_DEBUG_STACK_USAGE */ NR_VM_EVENT_ITEMS }; -- cgit v1.2.3 From fbe76a6557a83af5ef3819fd7b7ffd0a5d3b4e51 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Wed, 24 Jul 2024 20:33:22 +0000 Subject: task_stack: uninline stack_not_used Given that stack_not_used() is not performance critical function uninline it. Link: https://lkml.kernel.org/r/20240730150158.832783-4-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20240724203322.2765486-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: Shakeel Butt Cc: Domenico Cerasuolo Cc: Kent Overstreet Cc: Li Zhijian Cc: Matthew Wilcox Cc: Nhat Pham Cc: Peter Zijlstra Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/sched/task_stack.h | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h index ccd72b978e1f..bf10bdb487dd 100644 --- a/include/linux/sched/task_stack.h +++ b/include/linux/sched/task_stack.h @@ -95,23 +95,11 @@ static inline int object_is_on_stack(const void *obj) extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE +unsigned long stack_not_used(struct task_struct *p); +#else static inline unsigned long stack_not_used(struct task_struct *p) { - unsigned long *n = end_of_stack(p); - - do { /* Skip over canary */ -# ifdef CONFIG_STACK_GROWSUP - n--; -# else - n++; -# endif - } while (!*n); - -# ifdef CONFIG_STACK_GROWSUP - return (unsigned long)end_of_stack(p) - (unsigned long)n; -# else - return (unsigned long)n - (unsigned long)end_of_stack(p); -# endif + return 0; } #endif extern void set_task_stack_end_magic(struct task_struct *tsk); -- cgit v1.2.3 From f77bd4b14ccfd38dfcfe67eecad517b8ec1b7f37 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Jul 2024 20:31:08 +0000 Subject: mm: memcg: don't call propagate_protected_usage() needlessly Patch series "mm: memcg: page counters optimizations", v3. This patchset contains 3 independent small optimizations of page counters. This patch (of 3): Memory protection (min/low) requires a constant tracking of protected memory usage. propagate_protected_usage() is called on each page counters update and does a number of operations even in cases when the actual memory protection functionality is not supported (e.g. hugetlb cgroups or memcg swap counters). It's obviously inefficient and leads to a waste of CPU cycles. It can be addressed by calling propagate_protected_usage() only for the counters which do support memory guarantees. As of now it's only memcg->memory - the unified memory memcg counter. Link: https://lkml.kernel.org/r/20240726203110.1577216-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 904c52f97284..0b8e993f9163 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -31,6 +31,7 @@ struct page_counter { /* Keep all the read most fields in a separete cacheline. */ CACHELINE_PADDING(_pad2_); + bool protection_support; unsigned long min; unsigned long low; unsigned long high; @@ -44,12 +45,17 @@ struct page_counter { #define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE) #endif +/* + * Protection is supported only for the first counter (with id 0). + */ static inline void page_counter_init(struct page_counter *counter, - struct page_counter *parent) + struct page_counter *parent, + bool protection_support) { atomic_long_set(&counter->usage, 0); counter->max = PAGE_COUNTER_MAX; counter->parent = parent; + counter->protection_support = protection_support; } static inline unsigned long page_counter_read(struct page_counter *counter) -- cgit v1.2.3 From 941ce635234162c420c9185816c59f7d5dd1ad07 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Jul 2024 20:31:09 +0000 Subject: mm: page_counters: put page_counter_calculate_protection() under CONFIG_MEMCG Put page_counter_calculate_protection() under CONFIG_MEMCG. The protection functionality (min/low limits) is not supported by any other cgroup subsystem, so page_counter_calculate_protection() and related static effective_protection() can be compiled out if CONFIG_MEMCG is not enabled. Link: https://lkml.kernel.org/r/20240726203110.1577216-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 0b8e993f9163..1b3e92c09b80 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -87,8 +87,14 @@ static inline void page_counter_reset_watermark(struct page_counter *counter) counter->watermark = page_counter_read(counter); } +#ifdef CONFIG_MEMCG void page_counter_calculate_protection(struct page_counter *root, struct page_counter *counter, bool recursive_protection); +#else +static inline void page_counter_calculate_protection(struct page_counter *root, + struct page_counter *counter, + bool recursive_protection) {} +#endif #endif /* _LINUX_PAGE_COUNTER_H */ -- cgit v1.2.3 From 57979fabff554bf4d1edeeee69251b22ca9bf55e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 26 Jul 2024 20:31:10 +0000 Subject: mm: page_counters: initialize usage using ATOMIC_LONG_INIT() macro When a page_counter structure is initialized, there is no need to use an atomic set operation to initialize the usage counter because at this point the structure is not visible to anybody else. ATOMIC_LONG_INIT() is what should be used in such cases. Link: https://lkml.kernel.org/r/20240726203110.1577216-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/page_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 1b3e92c09b80..66ebf9a73158 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -52,7 +52,7 @@ static inline void page_counter_init(struct page_counter *counter, struct page_counter *parent, bool protection_support) { - atomic_long_set(&counter->usage, 0); + counter->usage = (atomic_long_t)ATOMIC_LONG_INIT(0); counter->max = PAGE_COUNTER_MAX; counter->parent = parent; counter->protection_support = protection_support; -- cgit v1.2.3 From 394290cba9664ed3ab80d0e247402102a9b7287a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 26 Jul 2024 17:07:26 +0200 Subject: mm: turn USE_SPLIT_PTE_PTLOCKS / USE_SPLIT_PTE_PTLOCKS into Kconfig options Patch series "mm: split PTE/PMD PT table Kconfig cleanups+clarifications". This series is a follow up to the fixes: "[PATCH v1 0/2] mm/hugetlb: fix hugetlb vs. core-mm PT locking" When working on the fixes, I wondered why 8xx is fine (-> never uses split PT locks) and how PT locking even works properly with PMD page table sharing (-> always requires split PMD PT locks). Let's improve the split PT lock detection, make hugetlb properly depend on it and make 8xx bail out if it would ever get enabled by accident. As an alternative to patch #3 we could extend the Kconfig SPLIT_PTE_PTLOCKS option from patch #2 -- but enforcing it closer to the code that actually implements it feels a bit nicer for documentation purposes, and there is no need to actually disable it because it should always be disabled (!SMP). Did a bunch of cross-compilations to make sure that split PTE/PMD PT locks are still getting used where we would expect them. [1] https://lkml.kernel.org/r/20240725183955.2268884-1-david@redhat.com This patch (of 3): Let's clean that up a bit and prepare for depending on CONFIG_SPLIT_PMD_PTLOCKS in other Kconfig options. More cleanups would be reasonable (like the arch-specific "depends on" for CONFIG_SPLIT_PTE_PTLOCKS), but we'll leave that for another day. Link: https://lkml.kernel.org/r/20240726150728.3159964-1-david@redhat.com Link: https://lkml.kernel.org/r/20240726150728.3159964-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Russell King (Oracle) Reviewed-by: Qi Zheng Cc: Alexander Viro Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Brauner Cc: Christophe Leroy Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Juergen Gross Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Peter Xu Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- include/linux/mm_types.h | 2 +- include/linux/mm_types_task.h | 3 --- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6c2078ca3391..b7000fa300f3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2891,7 +2891,7 @@ static inline void pagetable_free(struct ptdesc *pt) __free_pages(page, compound_order(page)); } -#if USE_SPLIT_PTE_PTLOCKS +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); @@ -2949,7 +2949,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) return true; } -#else /* !USE_SPLIT_PTE_PTLOCKS */ +#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ @@ -2964,7 +2964,7 @@ static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} -#endif /* USE_SPLIT_PTE_PTLOCKS */ +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { @@ -3024,7 +3024,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) -#if USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_SPLIT_PMD_PTLOCKS) static inline struct page *pmd_pgtable_page(pmd_t *pmd) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 485424979254..165c58b12ccc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -947,7 +947,7 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_subscriptions *notifier_subscriptions; #endif -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index a2f6179b672b..bff5706b76e1 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -16,9 +16,6 @@ #include #endif -#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) -#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ - IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) /* -- cgit v1.2.3 From 188cac58a8bcdf82c7f63275b68f7a46871e45d6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 26 Jul 2024 17:07:27 +0200 Subject: mm/hugetlb: enforce that PMD PT sharing has split PMD PT locks Sharing page tables between processes but falling back to per-MM page table locks cannot possibly work. So, let's make sure that we do have split PMD locks by adding a new Kconfig option and letting that depend on CONFIG_SPLIT_PMD_PTLOCKS. Link: https://lkml.kernel.org/r/20240726150728.3159964-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Viro Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Christian Brauner Cc: Christophe Leroy Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Juergen Gross Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Peter Xu Cc: Russell King Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 45bf05ad5c53..9b7bcfce6920 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1251,7 +1251,7 @@ static inline __init void hugetlb_cma_reserve(int order) } #endif -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline bool hugetlb_pmd_shared(pte_t *pte) { return page_count(virt_to_page(pte)) > 1; @@ -1287,8 +1287,7 @@ bool __vma_private_lock(struct vm_area_struct *vma); static inline pte_t * hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { -#if defined(CONFIG_HUGETLB_PAGE) && \ - defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) +#if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; /* -- cgit v1.2.3 From e5a41fc77771c7c7f6b9bef85a02b38b802b7371 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jul 2024 20:38:42 +0200 Subject: mm: simplify arch_make_folio_accessible() Patch series "mm: remove arch_make_page_accessible()". Now that s390x implements arch_make_folio_accessible(), let's convert remaining users to use arch_make_folio_accessible() instead so we can remove arch_make_page_accessible(). This patch (of 3): Now that s390x implements HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE, let's turn generic arch_make_folio_accessible() into a NOP: there are no other targets that implement HAVE_ARCH_MAKE_PAGE_ACCESSIBLE but not HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE. Link: https://lkml.kernel.org/r/20240729183844.388481-1-david@redhat.com Link: https://lkml.kernel.org/r/20240729183844.388481-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Claudio Imbrenda Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: David Hildenbrand Cc: Heiko Carstens Cc: Janosch Frank Cc: Sven Schnelle Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b7000fa300f3..fc7bd3864bcd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2215,16 +2215,7 @@ static inline int arch_make_page_accessible(struct page *page) #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { - int ret; - long i, nr = folio_nr_pages(folio); - - for (i = 0; i < nr; i++) { - ret = arch_make_page_accessible(folio_page(folio, i)); - if (ret) - break; - } - - return ret; + return 0; } #endif -- cgit v1.2.3 From 3290ef3c7f2a8171fc534e02fb95a512eeae689a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jul 2024 20:38:44 +0200 Subject: s390/uv: drop arch_make_page_accessible() All code was converted to using arch_make_folio_accessible(), let's drop arch_make_page_accessible(). Link: https://lkml.kernel.org/r/20240729183844.388481-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Claudio Imbrenda Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Janosch Frank Cc: Sven Schnelle Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fc7bd3864bcd..153a4662a0cd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2205,13 +2205,6 @@ static inline bool folio_likely_mapped_shared(struct folio *folio) return atomic_read(&folio->_mapcount) > 0; } -#ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE -static inline int arch_make_page_accessible(struct page *page) -{ - return 0; -} -#endif - #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { -- cgit v1.2.3 From c6f53ed8f213a66ae8bc40aa9112c32412c35a21 Mon Sep 17 00:00:00 2001 From: David Finkel Date: Mon, 29 Jul 2024 10:37:42 -0400 Subject: mm, memcg: cg2 memory{.swap,}.peak write handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm, memcg: cg2 memory{.swap,}.peak write handlers", v7. This patch (of 2): Other mechanisms for querying the peak memory usage of either a process or v1 memory cgroup allow for resetting the high watermark. Restore parity with those mechanisms, but with a less racy API. For example: - Any write to memory.max_usage_in_bytes in a cgroup v1 mount resets the high watermark. - writing "5" to the clear_refs pseudo-file in a processes's proc directory resets the peak RSS. This change is an evolution of a previous patch, which mostly copied the cgroup v1 behavior, however, there were concerns about races/ownership issues with a global reset, so instead this change makes the reset filedescriptor-local. Writing any non-empty string to the memory.peak and memory.swap.peak pseudo-files reset the high watermark to the current usage for subsequent reads through that same FD. Notably, following Johannes's suggestion, this implementation moves the O(FDs that have written) behavior onto the FD write(2) path. Instead, on the page-allocation path, we simply add one additional watermark to conditionally bump per-hierarchy level in the page-counter. Additionally, this takes Longman's suggestion of nesting the page-charging-path checks for the two watermarks to reduce the number of common-case comparisons. This behavior is particularly useful for work scheduling systems that need to track memory usage of worker processes/cgroups per-work-item. Since memory can't be squeezed like CPU can (the OOM-killer has opinions), these systems need to track the peak memory usage to compute system/container fullness when binpacking workitems. Most notably, Vimeo's use-case involves a system that's doing global binpacking across many Kubernetes pods/containers, and while we can use PSI for some local decisions about overload, we strive to avoid packing workloads too tightly in the first place. To facilitate this, we track the peak memory usage. However, since we run with long-lived workers (to amortize startup costs) we need a way to track the high watermark while a work-item is executing. Polling runs the risk of missing short spikes that last for timescales below the polling interval, and peak memory tracking at the cgroup level is otherwise perfect for this use-case. As this data is used to ensure that binpacked work ends up with sufficient headroom, this use-case mostly avoids the inaccuracies surrounding reclaimable memory. Link: https://lkml.kernel.org/r/20240730231304.761942-1-davidf@vimeo.com Link: https://lkml.kernel.org/r/20240729143743.34236-1-davidf@vimeo.com Link: https://lkml.kernel.org/r/20240729143743.34236-2-davidf@vimeo.com Signed-off-by: David Finkel Suggested-by: Johannes Weiner Suggested-by: Waiman Long Acked-by: Johannes Weiner Reviewed-by: Michal Koutný Acked-by: Tejun Heo Reviewed-by: Roman Gushchin Cc: Jonathan Corbet Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Cc: Shuah Khan Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/cgroup-defs.h | 5 +++++ include/linux/cgroup.h | 3 +++ include/linux/memcontrol.h | 5 +++++ include/linux/page_counter.h | 11 ++++++++++- 4 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index ae04035b6cbe..7fc2d0195f56 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -775,6 +775,11 @@ struct cgroup_subsys { extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +struct cgroup_of_peak { + unsigned long value; + struct list_head list; +}; + /** * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups * @tsk: target task diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c60ba0ab1462..3e0563753cc3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -854,4 +855,6 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); +struct cgroup_of_peak *of_peak(struct kernfs_open_file *of); + #endif /* _LINUX_CGROUP_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index af7da7bd00af..1b79760af685 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -193,6 +193,11 @@ struct mem_cgroup { struct page_counter memsw; /* v1 only */ }; + /* registered local peak watchers */ + struct list_head memory_peaks; + struct list_head swap_peaks; + spinlock_t peaks_lock; + /* Range enforcement for interrupt charges */ struct work_struct high_work; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 66ebf9a73158..79dbd8bc35a7 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -26,6 +26,8 @@ struct page_counter { atomic_long_t children_low_usage; unsigned long watermark; + /* Latest cg2 reset watermark */ + unsigned long local_watermark; unsigned long failcnt; /* Keep all the read most fields in a separete cacheline. */ @@ -84,7 +86,14 @@ int page_counter_memparse(const char *buf, const char *max, static inline void page_counter_reset_watermark(struct page_counter *counter) { - counter->watermark = page_counter_read(counter); + unsigned long usage = page_counter_read(counter); + + /* + * Update local_watermark first, so it's always <= watermark + * (modulo CPU/compiler re-ordering) + */ + counter->local_watermark = usage; + counter->watermark = usage; } #ifdef CONFIG_MEMCG -- cgit v1.2.3 From a17c7d8fd2b097a422fc892b660e879e19c20214 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 29 Jul 2024 12:50:35 +0100 Subject: userfaultfd: move core VMA manipulation logic to mm/userfaultfd.c Patch series "Make core VMA operations internal and testable", v4. There are a number of "core" VMA manipulation functions implemented in mm/mmap.c, notably those concerning VMA merging, splitting, modifying, expanding and shrinking, which logically don't belong there. More importantly this functionality represents an internal implementation detail of memory management and should not be exposed outside of mm/ itself. This patch series isolates core VMA manipulation functionality into its own file, mm/vma.c, and provides an API to the rest of the mm code in mm/vma.h. Importantly, it also carefully implements mm/vma_internal.h, which specifies which headers need to be imported by vma.c, leading to the very useful property that vma.c depends only on mm/vma.h and mm/vma_internal.h. This means we can then re-implement vma_internal.h in userland, adding shims for kernel mechanisms as required, allowing us to unit test internal VMA functionality. This testing is useful as opposed to an e.g. kunit implementation as this way we can avoid all external kernel side-effects while testing, run tests VERY quickly, and iterate on and debug problems quickly. Excitingly this opens the door to, in the future, recreating precise problems observed in production in userland and very quickly debugging problems that might otherwise be very difficult to reproduce. This patch series takes advantage of existing shim logic and full userland maple tree support contained in tools/testing/radix-tree/ and tools/include/linux/, separating out shared components of the radix tree implementation to provide this testing. Kernel functionality is stubbed and shimmed as needed in tools/testing/vma/ which contains a fully functional userland vma_internal.h file and which imports mm/vma.c and mm/vma.h to be directly tested from userland. A simple, skeleton testing implementation is provided in tools/testing/vma/vma.c as a proof-of-concept, asserting that simple VMA merge, modify (testing split), expand and shrink functionality work correctly. This patch (of 4): This patch forms part of a patch series intending to separate out VMA logic and render it testable from userspace, which requires that core manipulation functions be exposed in an mm/-internal header file. In order to do this, we must abstract APIs we wish to test, in this instance functions which ultimately invoke vma_modify(). This patch therefore moves all logic which ultimately invokes vma_modify() to mm/userfaultfd.c, trying to transfer code at a functional granularity where possible. [lorenzo.stoakes@oracle.com: fix user-after-free in userfaultfd_clear_vma()] Link: https://lkml.kernel.org/r/3c947ddc-b804-49b7-8fe9-3ea3ca13def5@lucifer.local Link: https://lkml.kernel.org/r/cover.1722251717.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/50c3ed995fd81c45876c86304c8a00bf3e396cfd.1722251717.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Brendan Higgins Cc: Christian Brauner Cc: David Gow Cc: Eric W. Biederman Cc: Jan Kara Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Rae Moar Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Pengfei Xu Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a12bcf042551..9fc6ce15c499 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -267,6 +267,25 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm, extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); extern bool userfaultfd_wp_async(struct vm_area_struct *vma); +void userfaultfd_reset_ctx(struct vm_area_struct *vma); + +struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end); + +int userfaultfd_register_range(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, + unsigned long vm_flags, + unsigned long start, unsigned long end, + bool wp_async); + +void userfaultfd_release_new(struct userfaultfd_ctx *ctx); + +void userfaultfd_release_all(struct mm_struct *mm, + struct userfaultfd_ctx *ctx); + #else /* CONFIG_USERFAULTFD */ /* mm helpers */ -- cgit v1.2.3 From fa04c08f3ce649b47781c7663e92398197f5b551 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 29 Jul 2024 12:50:36 +0100 Subject: mm: move vma_modify() and helpers to internal header These are core VMA manipulation functions which invoke VMA splitting and merging and should not be directly accessed from outside of mm/. Link: https://lkml.kernel.org/r/5efde0c6342a8860d5ffc90b415f3989fd8ed0b2.1722251717.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Brendan Higgins Cc: Christian Brauner Cc: David Gow Cc: Eric W. Biederman Cc: Jan Kara Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Rae Moar Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Pengfei Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 60 ------------------------------------------------------ 1 file changed, 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 153a4662a0cd..c4054e68dc09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,66 +3279,6 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); -struct vm_area_struct *vma_modify(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long vm_flags, - struct mempolicy *policy, - struct vm_userfaultfd_ctx uffd_ctx, - struct anon_vma_name *anon_name); - -/* We are about to modify the VMA's flags. */ -static inline struct vm_area_struct -*vma_modify_flags(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, - anon_vma_name(vma)); -} - -/* We are about to modify the VMA's flags and/or anon_name. */ -static inline struct vm_area_struct -*vma_modify_flags_name(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long new_flags, - struct anon_vma_name *new_name) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); -} - -/* We are about to modify the VMA's memory policy. */ -static inline struct vm_area_struct -*vma_modify_policy(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct mempolicy *new_pol) -{ - return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, - new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); -} - -/* We are about to modify the VMA's flags and/or uffd context. */ -static inline struct vm_area_struct -*vma_modify_flags_uffd(struct vma_iterator *vmi, - struct vm_area_struct *prev, - struct vm_area_struct *vma, - unsigned long start, unsigned long end, - unsigned long new_flags, - struct vm_userfaultfd_ctx new_ctx) -{ - return vma_modify(vmi, prev, vma, start, end, new_flags, - vma_policy(vma), new_ctx, anon_vma_name(vma)); -} static inline int check_data_rlimit(unsigned long rlim, unsigned long new, -- cgit v1.2.3 From d61f0d59683d9c899211c300254d4140c482a6c0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 29 Jul 2024 12:50:37 +0100 Subject: mm: move vma_shrink(), vma_expand() to internal header The vma_shrink() and vma_expand() functions are internal VMA manipulation functions which we ought to abstract for use outside of memory management code. To achieve this, we replace shift_arg_pages() in fs/exec.c with an invocation of a new relocate_vma_down() function implemented in mm/mmap.c, which enables us to also move move_page_tables() and vma_iter_prev_range() to internal.h. The purpose of doing this is to isolate key VMA manipulation functions in order that we can both abstract them and later render them easily testable. Link: https://lkml.kernel.org/r/3cfcd9ec433e032a85f636fdc0d7d98fafbd19c5.1722251717.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Brendan Higgins Cc: Christian Brauner Cc: David Gow Cc: Eric W. Biederman Cc: Jan Kara Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Rae Moar Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Pengfei Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c4054e68dc09..1c0d88446eb8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1005,12 +1005,6 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) return mas_prev(&vmi->mas, 0); } -static inline -struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) -{ - return mas_prev_range(&vmi->mas, 0); -} - static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; @@ -2520,11 +2514,6 @@ int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); -extern unsigned long move_page_tables(struct vm_area_struct *vma, - unsigned long old_addr, struct vm_area_struct *new_vma, - unsigned long new_addr, unsigned long len, - bool need_rmap_locks, bool for_stack); - /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However @@ -3267,11 +3256,6 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff, - struct vm_area_struct *next); -extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); @@ -3279,6 +3263,7 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, -- cgit v1.2.3 From 49b1b8d6f6831026cb105b0eafa18f13db612d86 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 29 Jul 2024 12:50:38 +0100 Subject: mm: move internal core VMA manipulation functions to own file This patch introduces vma.c and moves internal core VMA manipulation functions to this file from mmap.c. This allows us to isolate VMA functionality in a single place such that we can create userspace testing code that invokes this functionality in an environment where we can implement simple unit tests of core functionality. This patch ensures that core VMA functionality is explicitly marked as such by its presence in mm/vma.h. It also places the header includes required by vma.c in vma_internal.h, which is simply imported by vma.c. This makes the VMA functionality testable, as userland testing code can simply stub out functionality as required. Link: https://lkml.kernel.org/r/c77a6aafb4c42aaadb8e7271a853658cbdca2e22.1722251717.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Alexander Viro Cc: Brendan Higgins Cc: Christian Brauner Cc: David Gow Cc: Eric W. Biederman Cc: Jan Kara Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Rae Moar Cc: SeongJae Park Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Pengfei Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1c0d88446eb8..bd219ac9c026 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1005,21 +1005,6 @@ static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) return mas_prev(&vmi->mas, 0); } -static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) -{ - return vmi->mas.index; -} - -static inline unsigned long vma_iter_end(struct vma_iterator *vmi) -{ - return vmi->mas.last + 1; -} -static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, - unsigned long count) -{ - return mas_expected_entries(&vmi->mas, count); -} - static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { @@ -2534,21 +2519,6 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen); #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) -bool vma_needs_dirty_tracking(struct vm_area_struct *vma); -bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); -static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) -{ - /* - * We want to check manually if we can change individual PTEs writable - * if we can't do that automatically for all PTEs in a mapping. For - * private mappings, that's always the case when we have write - * permissions as we properly have to handle COW. - */ - if (vma->vm_flags & VM_SHARED) - return vma_wants_writenotify(vma, vma->vm_page_prot); - return !!(vma->vm_flags & VM_WRITE); - -} bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, @@ -3256,12 +3226,7 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void unlink_file_vma(struct vm_area_struct *); -extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff, - bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); -- cgit v1.2.3 From 29943248af0a8ac3edb808564baa417b40e35521 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 29 Jul 2024 14:47:17 +0530 Subject: mm: improve code consistency with zonelist_* helper functions Replace direct access to zoneref->zone, zoneref->zone_idx, or zone_to_nid(zoneref->zone) with the corresponding zonelist_* helper functions for consistency. No functional change. Link: https://lkml.kernel.org/r/20240729091717.464-1-shivankg@amd.com Co-developed-by: Shivank Garg Signed-off-by: Shivank Garg Signed-off-by: Wei Yang Acked-by: David Hildenbrand Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1dc6248feb83..dbb69f96a152 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1688,7 +1688,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ - for (zone = z->zone; \ + for (zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) @@ -1724,7 +1724,7 @@ static inline bool movable_only_nodes(nodemask_t *nodes) nid = first_node(*nodes); zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, ZONE_NORMAL, nodes); - return (!z->zone) ? true : false; + return (!zonelist_zone(z)) ? true : false; } -- cgit v1.2.3 From 9f101bef408a3f70c44b6e4de44d3d4e2655ed10 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 30 Jul 2024 19:13:39 +1200 Subject: mm: swap: add nr argument in swapcache_prepare and swapcache_clear to support large folios Right now, swapcache_prepare() and swapcache_clear() supports one entry only, to support large folios, we need to handle multiple swap entries. To optimize stack usage, we iterate twice in __swap_duplicate(): the first time to verify that all entries are valid, and the second time to apply the modifications to the entries. Currently, we're using nr=1 for the existing users. [v-songbaohua@oppo.com: clarify swap_count_continued and improve readability for __swap_duplicate] Link: https://lkml.kernel.org/r/20240802071817.47081-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240730071339.107447-2-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Tested-by: Baolin Wang Cc: Chris Li Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Nhat Pham Cc: Ryan Roberts Cc: Sergey Senozhatsky Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index ba7ea95d1c57..5b920fa2315b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -480,7 +480,7 @@ extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); -extern int swapcache_prepare(swp_entry_t); +extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); @@ -554,7 +554,7 @@ static inline int swap_duplicate(swp_entry_t swp) return 0; } -static inline int swapcache_prepare(swp_entry_t swp) +static inline int swapcache_prepare(swp_entry_t swp, int nr) { return 0; } -- cgit v1.2.3 From 94ccd21e9a5f41585bd16cc84dab2afa6cc21149 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 31 Jul 2024 16:20:00 +0200 Subject: mm/hugetlb: remove hugetlb_follow_page_mask() leftover We removed hugetlb_follow_page_mask() in commit 9cb28da54643 ("mm/gup: handle hugetlb in the generic follow_page_mask code") but forgot to cleanup some leftovers. While at it, simplify the hugetlb comment, it's overly detailed and rather confusing. Stating that we may end up in there during coredumping is sufficient to explain the PF_DUMPCORE usage. Link: https://lkml.kernel.org/r/20240731142000.625044-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Peter Xu Cc: Muchun Song Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9b7bcfce6920..3100a52ceb73 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -127,9 +127,6 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); -struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); -- cgit v1.2.3 From 17d5f38b33b66210c5a46af639c47fddfe1c5b4d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 31 Jul 2024 18:07:58 +0200 Subject: mm: clarify folio_likely_mapped_shared() documentation for KSM folios For KSM folios, the function actually does what it is supposed to do: even having multiple mappings inside the same MM is considered "sharing", as there is no real relationship between these KSM page mappings -- in contrast to mapping the same file range twice and having the same pagecache page mapped twice. Link: https://lkml.kernel.org/r/20240731160758.808925-1-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index bd219ac9c026..2f6c08b53e4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2132,14 +2132,19 @@ static inline size_t folio_size(const struct folio *folio) * MM ("mapped shared"), or if the folio is only mapped into a single MM * ("mapped exclusively"). * + * For KSM folios, this function also returns "mapped shared" when a folio is + * mapped multiple times into the same MM, because the individual page mappings + * are independent. + * * As precise information is not easily available for all folios, this function * estimates the number of MMs ("sharers") that are currently mapping a folio * using the number of times the first page of the folio is currently mapped * into page tables. * - * For small anonymous folios (except KSM folios) and anonymous hugetlb folios, - * the return value will be exactly correct, because they can only be mapped - * at most once into an MM, and they cannot be partially mapped. + * For small anonymous folios and anonymous hugetlb folios, the return + * value will be exactly correct: non-KSM folios can only be mapped at most once + * into an MM, and they cannot be partially mapped. KSM folios are + * considered shared even if mapped multiple times into the same MM. * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly @@ -2148,9 +2153,6 @@ static inline size_t folio_size(const struct folio *folio) * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. - * #. For (small) KSM folios, the return value can wrongly indicate "mapped - * shared" (false positive), when the folio is mapped multiple times into - * the same MM. * * Further, this function only considers current page table mappings that * are tracked using the folio mapcount(s). -- cgit v1.2.3 From 03790c51a475c46647079e67e2460a149938bfd6 Mon Sep 17 00:00:00 2001 From: Kaiyang Zhao Date: Thu, 1 Aug 2024 23:25:47 +0000 Subject: mm: create promo_wmark_pages and clean up open-coded sites Patch series "mm: print the promo watermark in zoneinfo", v2. This patch (of 2): Define promo_wmark_pages and convert current call sites of wmark_pages with fixed WMARK_PROMO to using it instead. Link: https://lkml.kernel.org/r/20240801232548.36604-1-kaiyang2@cs.cmu.edu Link: https://lkml.kernel.org/r/20240801232548.36604-2-kaiyang2@cs.cmu.edu Signed-off-by: Kaiyang Zhao Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dbb69f96a152..9f3589f7f05c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -669,6 +669,7 @@ enum zone_watermarks { #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) +#define promo_wmark_pages(z) (z->_watermark[WMARK_PROMO] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) /* -- cgit v1.2.3 From 620943d7ee69070df8844235d58843af48ac70e2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 1 Aug 2024 16:50:05 -0700 Subject: include/linux/mmzone.h: clean up watermark accessors - we have a helper wmark_pages(). Teach min_wmark_pages(), low_wmark_pages(), high_wmark_pages() and promo_wmark_pages() to use it instead of open-coding its implementation. - there's no reason to implement all these things as macros. Redo them in C. Acked-by: Johannes Weiner Cc: Kaiyang Zhao Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9f3589f7f05c..17506e4a2835 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -666,12 +666,6 @@ enum zone_watermarks { #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) -#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) -#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) -#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) -#define promo_wmark_pages(z) (z->_watermark[WMARK_PROMO] + z->watermark_boost) -#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) - /* * Flags used in pcp->flags field. * @@ -1017,6 +1011,32 @@ enum zone_flags { ZONE_BELOW_HIGH, /* zone is below high watermark. */ }; +static inline unsigned long wmark_pages(const struct zone *z, + enum zone_watermarks w) +{ + return z->_watermark[w] + z->watermark_boost; +} + +static inline unsigned long min_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_MIN); +} + +static inline unsigned long low_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_LOW); +} + +static inline unsigned long high_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_HIGH); +} + +static inline unsigned long promo_wmark_pages(const struct zone *z) +{ + return wmark_pages(z, WMARK_PROMO); +} + static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); -- cgit v1.2.3 From aa39ca6940f1a0540f2984051b3089972f42959b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 2 Aug 2024 17:55:15 +0200 Subject: mm/pagewalk: introduce folio_walk_start() + folio_walk_end() We want to get rid of follow_page(), and have a more reasonable way to just lookup a folio mapped at a certain address, perform some checks while still under PTL, and then only conditionally grab a folio reference if really required. Further, we might want to get rid of some walk_page_range*() users that really only want to temporarily lookup a single folio at a single address. So let's add a new page table walker that does exactly that, similarly to GUP also being able to walk hugetlb VMAs. Add folio_walk_end() as a macro for now: the compiler is not easy to please with the pte_unmap()->kunmap_local(). Note that one difference between follow_page() and get_user_pages(1) is that follow_page() will not trigger faults to get something mapped. So folio_walk is at least currently not a replacement for get_user_pages(1), but could likely be extended/reused to achieve something similar in the future. Link: https://lkml.kernel.org/r/20240802155524.517137-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Claudio Imbrenda Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Janosch Frank Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 27cd1e59ccf7..f5eb5a32aeed 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -130,4 +130,62 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, const struct mm_walk_ops *ops, void *private); +typedef int __bitwise folio_walk_flags_t; + +/* + * Walk migration entries as well. Careful: a large folio might get split + * concurrently. + */ +#define FW_MIGRATION ((__force folio_walk_flags_t)BIT(0)) + +/* Walk shared zeropages (small + huge) as well. */ +#define FW_ZEROPAGE ((__force folio_walk_flags_t)BIT(1)) + +enum folio_walk_level { + FW_LEVEL_PTE, + FW_LEVEL_PMD, + FW_LEVEL_PUD, +}; + +/** + * struct folio_walk - folio_walk_start() / folio_walk_end() data + * @page: exact folio page referenced (if applicable) + * @level: page table level identifying the entry type + * @pte: pointer to the page table entry (FW_LEVEL_PTE). + * @pmd: pointer to the page table entry (FW_LEVEL_PMD). + * @pud: pointer to the page table entry (FW_LEVEL_PUD). + * @ptl: pointer to the page table lock. + * + * (see folio_walk_start() documentation for more details) + */ +struct folio_walk { + /* public */ + struct page *page; + enum folio_walk_level level; + union { + pte_t *ptep; + pud_t *pudp; + pmd_t *pmdp; + }; + union { + pte_t pte; + pud_t pud; + pmd_t pmd; + }; + /* private */ + struct vm_area_struct *vma; + spinlock_t *ptl; +}; + +struct folio *folio_walk_start(struct folio_walk *fw, + struct vm_area_struct *vma, unsigned long addr, + folio_walk_flags_t flags); + +#define folio_walk_end(__fw, __vma) do { \ + spin_unlock((__fw)->ptl); \ + if (likely((__fw)->level == FW_LEVEL_PTE)) \ + pte_unmap((__fw)->ptep); \ + vma_pgtable_walk_end(__vma); \ +} while (0) + #endif /* _LINUX_PAGEWALK_H */ -- cgit v1.2.3 From 8710f6ed34e7bcf98971d65acfb5eaed5fc469b0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 2 Aug 2024 17:55:20 +0200 Subject: mm/huge_memory: convert split_huge_pages_pid() from follow_page() to folio_walk Let's remove yet another follow_page() user. Note that we have to do the split without holding the PTL, after folio_walk_end(). We don't care about losing the secretmem check in follow_page(). [david@redhat.com: teach can_split_folio() that we are not holding an additional reference] Link: https://lkml.kernel.org/r/c75d1c6c-8ea6-424f-853c-1ccda6c77ba2@redhat.com Link: https://lkml.kernel.org/r/20240802155524.517137-8-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Claudio Imbrenda Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Janosch Frank Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e25d9ebfdf89..ce44caa40eed 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -314,7 +314,7 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); -bool can_split_folio(struct folio *folio, int *pextra_pins); +bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); static inline int split_huge_page(struct page *page) @@ -470,7 +470,7 @@ thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, } static inline bool -can_split_folio(struct folio *folio, int *pextra_pins) +can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins) { return false; } -- cgit v1.2.3 From 7290840de65e04c1fc59dab692468fc9600c6038 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 2 Aug 2024 17:55:23 +0200 Subject: mm: remove follow_page() All users are gone, let's remove it and any leftovers in comments. We'll leave any FOLL/follow_page_() naming cleanups as future work. Link: https://lkml.kernel.org/r/20240802155524.517137-11-david@redhat.com Signed-off-by: David Hildenbrand Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Claudio Imbrenda Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Janosch Frank Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2f6c08b53e4f..ee8cea73d415 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3527,9 +3527,6 @@ static inline vm_fault_t vmf_fs_error(int err) return VM_FAULT_SIGBUS; } -struct page *follow_page(struct vm_area_struct *vma, unsigned long address, - unsigned int foll_flags); - static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) -- cgit v1.2.3 From e31c38e037621c445bb4393fd77e0a76e6e0899a Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 5 Aug 2024 16:22:42 -0700 Subject: zswap: implement a second chance algorithm for dynamic zswap shrinker Patch series "improving dynamic zswap shrinker protection scheme", v3. When experimenting with the memory-pressure based (i.e "dynamic") zswap shrinker in production, we observed a sharp increase in the number of swapins, which led to performance regression. We were able to trace this regression to the following problems with the shrinker's warm pages protection scheme: 1. The protection decays way too rapidly, and the decaying is coupled with zswap stores, leading to anomalous patterns, in which a small batch of zswap stores effectively erase all the protection in place for the warmer pages in the zswap LRU. This observation has also been corroborated upstream by Takero Funaki (in [1]). 2. We inaccurately track the number of swapped in pages, missing the non-pivot pages that are part of the readahead window, while counting the pages that are found in the zswap pool. To alleviate these two issues, this patch series improve the dynamic zswap shrinker in the following manner: 1. Replace the protection size tracking scheme with a second chance algorithm. This new scheme removes the need for haphazard stats decaying, and automatically adjusts the pace of pages aging with memory pressure, and writeback rate with pool activities: slowing down when the pool is dominated with zswpouts, and speeding up when the pool is dominated with stale entries. 2. Fix the tracking of the number of swapins to take into account non-pivot pages in the readahead window. With these two changes in place, in a kernel-building benchmark without any cold data added, the number of swapins is reduced by 64.12%. This translate to a 10.32% reduction in build time. We also observe a 3% reduction in kernel CPU time. In another benchmark, with cold data added (to gauge the new algorithm's ability to offload cold data), the new second chance scheme outperforms the old protection scheme by around 0.7%, and actually written back around 21% more pages to backing swap device. So the new scheme is just as good, if not even better than the old scheme on this front as well. [1]: https://lore.kernel.org/linux-mm/CAPpodddcGsK=0Xczfuk8usgZ47xeyf4ZjiofdT+ujiyz6V2pFQ@mail.gmail.com/ This patch (of 2): Current zswap shrinker's heuristics to prevent overshrinking is brittle and inaccurate, specifically in the way we decay the protection size (i.e making pages in the zswap LRU eligible for reclaim). We currently decay protection aggressively in zswap_lru_add() calls. This leads to the following unfortunate effect: when a new batch of pages enter zswap, the protection size rapidly decays to below 25% of the zswap LRU size, which is way too low. We have observed this effect in production, when experimenting with the zswap shrinker: the rate of shrinking shoots up massively right after a new batch of zswap stores. This is somewhat the opposite of what we want originally - when new pages enter zswap, we want to protect both these new pages AND the pages that are already protected in the zswap LRU. Replace existing heuristics with a second chance algorithm 1. When a new zswap entry is stored in the zswap pool, its referenced bit is set. 2. When the zswap shrinker encounters a zswap entry with the referenced bit set, give it a second chance - only flips the referenced bit and rotate it in the LRU. 3. If the shrinker encounters the entry again, this time with its referenced bit unset, then it can reclaim the entry. In this manner, the aging of the pages in the zswap LRUs are decoupled from zswap stores, and picks up the pace with increasing memory pressure (which is what we want). The second chance scheme allows us to modulate the writeback rate based on recent pool activities. Entries that recently entered the pool will be protected, so if the pool is dominated by such entries the writeback rate will reduce proportionally, protecting the workload's workingset.On the other hand, stale entries will be written back quickly, which increases the effective writeback rate. The referenced bit is added at the hole after the `length` field of struct zswap_entry, so there is no extra space overhead for this algorithm. We will still maintain the count of swapins, which is consumed and subtracted from the lru size in zswap_shrinker_count(), to further penalize past overshrinking that led to disk swapins. The idea is that had we considered this many more pages in the LRU active/protected, they would not have been written back and we would not have had to swapped them in. To test this new heuristics, I built the kernel under a cgroup with memory.max set to 2G, on a host with 36 cores: With the old shrinker: real: 263.89s user: 4318.11s sys: 673.29s swapins: 227300.5 With the second chance algorithm: real: 244.85s user: 4327.22s sys: 664.39s swapins: 94663 (average over 5 runs) We observe an 1.3% reduction in kernel CPU usage, and around 7.2% reduction in real time. Note that the number of swapped in pages dropped by 58%. [nphamcs@gmail.com: fix a small mistake in the referenced bit documentation] Link: https://lkml.kernel.org/r/20240806003403.3142387-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240805232243.2896283-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20240805232243.2896283-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: Shakeel Butt Cc: Takero Funaki Signed-off-by: Andrew Morton --- include/linux/zswap.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 6cecb4a4f68b..9cd1beef0654 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -13,17 +13,15 @@ extern atomic_t zswap_stored_pages; struct zswap_lruvec_state { /* - * Number of pages in zswap that should be protected from the shrinker. - * This number is an estimate of the following counts: + * Number of swapped in pages from disk, i.e not found in the zswap pool. * - * a) Recent page faults. - * b) Recent insertion to the zswap LRU. This includes new zswap stores, - * as well as recent zswap LRU rotations. - * - * These pages are likely to be warm, and might incur IO if the are written - * to swap. + * This is consumed and subtracted from the lru size in + * zswap_shrinker_count() to penalize past overshrinking that led to disk + * swapins. The idea is that had we considered this many more pages in the + * LRU active/protected and not written them back, we would not have had to + * swapped them in. */ - atomic_long_t nr_zswap_protected; + atomic_long_t nr_disk_swapins; }; unsigned long zswap_total_pages(void); -- cgit v1.2.3 From 17fe833b0de08a78bfb51388e0615969d73ea8ad Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 5 Aug 2024 14:52:03 +0200 Subject: mm: fix (harmless) type confusion in lock_vma_under_rcu() There is a (harmless) type confusion in lock_vma_under_rcu(): After vma_start_read(), we have taken the VMA lock but don't know yet whether the VMA has already been detached and scheduled for RCU freeing. At this point, ->vm_start and ->vm_end are accessed. vm_area_struct contains a union such that ->vm_rcu uses the same memory as ->vm_start and ->vm_end; so accessing ->vm_start and ->vm_end of a detached VMA is illegal and leads to type confusion between union members. Fix it by reordering the vma->detached check above the address checks, and document the rules for RCU readers accessing VMAs. This will probably change the number of observed VMA_LOCK_MISS events (since previously, trying to access a detached VMA whose ->vm_rcu has been scheduled would bail out when checking the fault address against the rcu_head members reinterpreted as VMA bounds). Link: https://lkml.kernel.org/r/20240805-fix-vma-lock-type-confusion-v1-1-9f25443a9a71@google.com Fixes: 50ee32537206 ("mm: introduce lock_vma_under_rcu to be used from arch-specific code") Signed-off-by: Jann Horn Acked-by: Suren Baghdasaryan Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 165c58b12ccc..003619fab20e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -660,6 +660,9 @@ struct vma_numab_state { * per VM-area/task. A VM area is any part of the process virtual memory * space that has a special rule for the page-fault handlers (ie a shared * library, the executable area etc). + * + * Only explicitly marked struct members may be accessed by RCU readers before + * getting a stable reference. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -675,7 +678,11 @@ struct vm_area_struct { #endif }; - struct mm_struct *vm_mm; /* The address space we belong to. */ + /* + * The address space we belong to. + * Unstable RCU readers are allowed to read this. + */ + struct mm_struct *vm_mm; pgprot_t vm_page_prot; /* Access permissions of this VMA. */ /* @@ -688,7 +695,10 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* Flag to indicate areas detached from the mm->mm_mt tree */ + /* + * Flag to indicate areas detached from the mm->mm_mt tree. + * Unstable RCU readers are allowed to read this. + */ bool detached; /* @@ -706,6 +716,7 @@ struct vm_area_struct { * slowpath. */ int vm_lock_seq; + /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif -- cgit v1.2.3 From cc0a0f98553528791ae33ba5ee8c118b52ae2028 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 5 Aug 2024 14:39:39 +0200 Subject: kfence: introduce burst mode Introduce burst mode, which can be configured with kfence.burst=$count, where the burst count denotes the additional successive slab allocations to be allocated through KFENCE for each sample interval. The idea is that this can give developers an additional knob to make KFENCE more aggressive when debugging specific issues of systems where either rebooting or recompiling the kernel with KASAN is not possible. Experiment: To assess the effectiveness of the new option, we randomly picked a recent out-of-bounds [1] and use-after-free bug [2], each with a reproducer provided by syzbot, that initially detected these bugs with KASAN. We then tried to reproduce the bugs with KFENCE below. [1] Fixed by: 7c55b78818cf ("jfs: xattr: fix buffer overflow for invalid xattr") https://syzkaller.appspot.com/bug?id=9d1b59d4718239da6f6069d3891863c25f9f24a2 [2] Fixed by: f8ad00f3fb2a ("l2tp: fix possible UAF when cleaning up tunnels") https://syzkaller.appspot.com/bug?id=4f34adc84f4a3b080187c390eeef60611fd450e1 The following KFENCE configs were compared. A pool size of 1023 objects was used for all configurations. Baseline kfence.sample_interval=100 kfence.skip_covered_thresh=75 kfence.burst=0 Aggressive kfence.sample_interval=1 kfence.skip_covered_thresh=10 kfence.burst=0 AggressiveBurst kfence.sample_interval=1 kfence.skip_covered_thresh=10 kfence.burst=1000 Each reproducer was run 10 times (after a fresh reboot), with the following detection counts for each KFENCE config: | Detection Count out of 10 | | OOB [1] | UAF [2] | ------------------+-------------+-------------+ Default | 0/10 | 0/10 | Aggressive | 0/10 | 0/10 | AggressiveBurst | 8/10 | 8/10 | With the Default and even the Aggressive configs the results are unsurprising, given KFENCE has not been designed for deterministic bug detection of small test cases. However, when enabling burst mode with relatively large burst count, KFENCE can start to detect heap memory-safety bugs even in simpler test cases with high probability (in the above cases with ~80% probability). Link: https://lkml.kernel.org/r/20240805124203.2692278-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Jann Horn Signed-off-by: Andrew Morton --- include/linux/kfence.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 88100cc9caba..0ad1ddbb8b99 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -124,7 +124,7 @@ static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp if (!static_branch_likely(&kfence_allocation_key)) return NULL; #endif - if (likely(atomic_read(&kfence_allocation_gate))) + if (likely(atomic_read(&kfence_allocation_gate) > 0)) return NULL; return __kfence_alloc(s, size, flags); } -- cgit v1.2.3 From 47baed6a132f611228113851a70c73f6e6478d87 Mon Sep 17 00:00:00 2001 From: Jianhui Zhou <912460177@qq.com> Date: Wed, 7 Aug 2024 15:44:48 +0800 Subject: percpu: remove pcpu_alloc_size() pcpu_alloc_size() was added in 7ac5c53e0073 "mm/percpu.c: introduce pcpu_alloc_size()", which is used to get the allocated memory size in bpf. However, pcpu_alloc_size() is no longer used in "bpf: Use c->unit_size to select target cache during free" because its actuall allocated memory size may change at runtime due to its slab merging mechanism. Therefore, pcpu_alloc_size() can be removed. Link: https://lkml.kernel.org/r/tencent_AD5C50E8D78C07A3CE539BD5F6BF39706507@qq.com Signed-off-by: Jianhui Zhou <912460177@qq.com> Cc: Christoph Lameter Cc: Dennis Zhou Cc: JonasZhou Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 4b2047b78b67..b6321fc49159 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -135,7 +135,6 @@ extern void __init setup_per_cpu_areas(void); extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp_t gfp) __alloc_size(1); -extern size_t pcpu_alloc_size(void __percpu *__pdata); #define __alloc_percpu_gfp(_size, _align, _gfp) \ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp)) -- cgit v1.2.3 From 09022bc196d23484a7a5d48cf373f8583e3fcf23 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 7 Aug 2024 20:35:26 +0100 Subject: mm: remove PG_error The PG_error bit is now unused; delete it and free up a bit in page->flags. Link: https://lkml.kernel.org/r/20240807193528.1865100-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5769fe6e4950..a0a29bd092f8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -66,8 +66,6 @@ * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * - * PG_error is set to indicate that an I/O error occurred on this page. - * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. @@ -103,7 +101,6 @@ enum pageflags { PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, - PG_error, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_arch_1, PG_reserved, @@ -183,7 +180,7 @@ enum pageflags { */ /* At least one page in this folio has the hwpoison flag set */ - PG_has_hwpoisoned = PG_error, + PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ }; @@ -506,7 +503,6 @@ static inline int TestClearPage##uname(struct page *page) { return 0; } __PAGEFLAG(Locked, locked, PF_NO_TAIL) FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) -PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE) -- cgit v1.2.3 From 310183de7bb2ed114a80d057690ddb23d0cfe814 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 9 Aug 2024 14:48:50 +0300 Subject: mm: introduce PageUnaccepted() page type The new page type allows physical memory scanners to detect unaccepted memory and handle it accordingly. The page type is serialized with zone lock. Link: https://lkml.kernel.org/r/20240809114854.3745464-5-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: David Hildenbrand Cc: Borislav Petkov Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport (Microsoft) Cc: Tom Lendacky Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a0a29bd092f8..17175a328e6b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -939,6 +939,7 @@ enum pagetype { PG_hugetlb = 0x04000000, PG_slab = 0x02000000, PG_zsmalloc = 0x01000000, + PG_unaccepted = 0x00800000, PAGE_TYPE_BASE = 0x80000000, @@ -1072,6 +1073,13 @@ FOLIO_TEST_FLAG_FALSE(hugetlb) PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) +/* + * Mark pages that has to be accepted before touched for the first time. + * + * Serialized with zone lock. + */ +PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) + /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. -- cgit v1.2.3 From 5adfeaecc487e7023f1c7bbdc081707d7a93110f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 9 Aug 2024 14:48:51 +0300 Subject: mm: rework accept memory helpers Make accept_memory() and range_contains_unaccepted_memory() take 'start' and 'size' arguments instead of 'start' and 'end'. Remove accept_page(), replacing it with direct calls to accept_memory(). The accept_page() name is going to be used for a different function. Link: https://lkml.kernel.org/r/20240809114854.3745464-6-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Suggested-by: David Hildenbrand Cc: Borislav Petkov Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport (Microsoft) Cc: Tom Lendacky Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ee8cea73d415..b1eed30fdc06 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4062,18 +4062,18 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start, #ifdef CONFIG_UNACCEPTED_MEMORY -bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end); -void accept_memory(phys_addr_t start, phys_addr_t end); +bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); +void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, - phys_addr_t end) + unsigned long size) { return false; } -static inline void accept_memory(phys_addr_t start, phys_addr_t end) +static inline void accept_memory(phys_addr_t start, unsigned long size) { } @@ -4081,9 +4081,7 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end) static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { - phys_addr_t paddr = pfn << PAGE_SHIFT; - - return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE); + return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); -- cgit v1.2.3 From 1c399e74a97ca733d0780cc51819aa81fb292c22 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 12 Aug 2024 14:12:23 -0400 Subject: mm/x86: implement arch_check_zapped_pud() Introduce arch_check_zapped_pud() to sanity check shadow stack on PUD zaps. It has the same logic as the PMD helper. One thing to mention is, it might be a good idea to use page_table_check in the future for trapping wrong setups of shadow stack pgtable entries [1]. That is left for the future as a separate effort. [1] https://lore.kernel.org/all/59d518698f664e07c036a5098833d7b56b953305.camel@intel.com Link: https://lkml.kernel.org/r/20240812181225.1360970-6-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: "Edgecombe, Rick P" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Aneesh Kumar K.V Cc: Christophe Leroy Cc: Dan Williams Cc: Dave Jiang Cc: David Rientjes Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paolo Bonzini Cc: Rik van Riel Cc: Sean Christopherson Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2a6a3cccfc36..780f3b439d98 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -447,6 +447,12 @@ static inline void arch_check_zapped_pmd(struct vm_area_struct *vma, } #endif +#ifndef arch_check_zapped_pud +static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) +{ +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, -- cgit v1.2.3 From cb0f01beb16669e91510fcdb2cea213931aee017 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 12 Aug 2024 14:12:25 -0400 Subject: mm/mprotect: fix dax pud handlings This is only relevant to the two archs that support PUD dax, aka, x86_64 and ppc64. PUD THPs do not yet exist elsewhere, and hugetlb PUDs do not count in this case. DAX have had PUD mappings for years, but change protection path never worked. When the path is triggered in any form (a simple test program would be: call mprotect() on a 1G dev_dax mapping), the kernel will report "bad pud". This patch should fix that. The new change_huge_pud() tries to keep everything simple. For example, it doesn't optimize write bit as that will need even more PUD helpers. It's not too bad anyway to have one more write fault in the worst case once for 1G range; may be a bigger thing for each PAGE_SIZE, though. Neither does it support userfault-wp bits, as there isn't such PUD mappings that is supported; file mappings always need a split there. The same to TLB shootdown: the pmd path (which was for x86 only) has the trick of using _ad() version of pmdp_invalidate*() which can avoid one redundant TLB, but let's also leave that for later. Again, the larger the mapping, the smaller of such effect. There's some difference on handling "retry" for change_huge_pud() (where it can return 0): it isn't like change_huge_pmd(), as the pmd version is safe with all conditions handled in change_pte_range() later, thanks to Hugh's new pte_offset_map_lock(). In short, change_pte_range() is simply smarter. For that, change_pud_range() will need proper retry if it races with something else when a huge PUD changed from under us. The last thing to mention is currently the PUD path ignores the huge pte numa counter (NUMA_HUGE_PTE_UPDATES), not only because DAX is not applicable to NUMA, but also that it's ambiguous on its own to decide how to account pud in this case. In one earlier version of this patchset I proposed to remove the counter as it doesn't even look right to do the accounting as of now [1], but then a further discussion suggests we can leave that for later, as that doesn't block this series if we choose to ignore that counter. That's what this patch does, by ignoring it. When at it, touch up the comment in pgtable_split_needed() to make it generic to either pmd or pud file THPs. [1] https://lore.kernel.org/all/20240715192142.3241557-3-peterx@redhat.com/ [2] https://lore.kernel.org/r/added2d0-b8be-4108-82ca-1367a388d0b1@redhat.com Link: https://lkml.kernel.org/r/20240812181225.1360970-8-peterx@redhat.com Fixes: a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages") Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud hugepage") Signed-off-by: Peter Xu Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Jiang Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Michael Ellerman Cc: Aneesh Kumar K.V Cc: Oscar Salvador Cc: Christophe Leroy Cc: David Hildenbrand Cc: David Rientjes Cc: "Edgecombe, Rick P" Cc: Nicholas Piggin Cc: Paolo Bonzini Cc: Rik van Riel Cc: Sean Christopherson Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ce44caa40eed..6370026689e0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -342,6 +342,17 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pudp, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags); +#else +static inline int +change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, + pud_t *pudp, unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) { return 0; } +#endif + #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ @@ -585,6 +596,19 @@ static inline int next_order(unsigned long *orders, int prev) { return 0; } + +static inline void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, + unsigned long address) +{ +} + +static inline int change_huge_pud(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pudp, + unsigned long addr, pgprot_t newprot, + unsigned long cp_flags) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, -- cgit v1.2.3 From 223febc6e5573cda5e94e6b38fcdaded068db777 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 12 Aug 2024 18:26:02 +1000 Subject: mm: add optional close() to struct vm_special_mapping Add an optional close() callback to struct vm_special_mapping. It will be used, by powerpc at least, to handle unmapping of the VDSO. Although support for unmapping the VDSO was initially added for CRIU[1], it is not desirable to guard that support behind CONFIG_CHECKPOINT_RESTORE. There are other known users of unmapping the VDSO which are not related to CRIU, eg. Valgrind [2] and void-ship [3]. The powerpc arch_unmap() hook has been in place for ~9 years, with no ifdef, so there may be other unknown users that have come to rely on unmapping the VDSO. Even if the code was behind an ifdef, major distros enable CHECKPOINT_RESTORE so users may not realise unmapping the VDSO depends on that configuration option. It's also undesirable to have such core mm behaviour behind a relatively obscure CONFIG option. Longer term the unmap behaviour should be standardised across architectures, however that is complicated by the fact the VDSO pointer is stored differently across architectures. There was a previous attempt to unify that handling [4], which could be revived. See [5] for further discussion. [1]: commit 83d3f0e90c6c ("powerpc/mm: tracking vDSO remap") [2]: https://sourceware.org/git/?p=valgrind.git;a=commit;h=3a004915a2cbdcdebafc1612427576bf3321eef5 [3]: https://github.com/insanitybit/void-ship [4]: https://lore.kernel.org/lkml/20210611180242.711399-17-dima@arista.com/ [5]: https://lore.kernel.org/linuxppc-dev/shiq5v3jrmyi6ncwke7wgl76ojysgbhrchsk32q4lbx2hadqqc@kzyy2igem256 Link: https://lkml.kernel.org/r/20240812082605.743814-1-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Suggested-by: Linus Torvalds Reviewed-by: David Hildenbrand Reviewed-by: Liam R. Howlett Cc: Christophe Leroy Cc: Jeff Xu Cc: Nicholas Piggin Cc: Pedro Falcato Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 003619fab20e..2419e60c9a7f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1324,6 +1324,9 @@ struct vm_special_mapping { int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); + + void (*close)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma); }; enum tlb_flush_reason { -- cgit v1.2.3 From 497258dfafcc6b88e7c4c46a38a479721d44cf41 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 20 Aug 2024 15:14:47 -0700 Subject: mm: remove legacy install_special_mapping() code All relevant architectures had already been converted to the new interface (which just has an underscore in front of the name - not very imaginative naming), this just force-converts the stragglers. The modern interface is almost identical to the old one, except instead of the page pointer it takes a "struct vm_special_mapping" that describes the mapping (and contains the page pointer as one member), and it returns the resulting 'vma' instead of just the error code. Getting rid of the old interface also gets rid of some special casing, which had caused problems with the mremap extensions to "struct vm_special_mapping". [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/CAHk-=whvR+z=0=0gzgdfUiK70JTa-=+9vxD-4T=3BagXR6dciA@mail.gmail.comTested-by: Rob Landley # arch/sh/ Link: https://lore.kernel.org/all/20240819195120.GA1113263@thelio-3990X/ Signed-off-by: Linus Torvalds Cc: Nathan Chancellor Cc: Michael Ellerman Cc: Anton Ivanov Cc: Brian Cain Cc: Christophe Leroy Cc: Dinh Nguyen Cc: Guo Ren Cc: Jeff Xu Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Liam R. Howlett Cc: Nicholas Piggin Cc: Pedro Falcato Cc: Richard Weinberger Cc: Rich Felker Cc: Rob Landley Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b1eed30fdc06..0fde51c0532f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3263,10 +3263,6 @@ extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); -/* This is an obsolete alternative to _install_special_mapping. */ -extern int install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long flags, struct page **pages); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); -- cgit v1.2.3 From 02f4bbefcada38cab86b365e5c795e0f858356bc Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Aug 2024 17:34:15 +0800 Subject: mm: kmem: add lockdep assertion to obj_cgroup_memcg obj_cgroup_memcg() is supposed to safe to prevent the returned memory cgroup from being freed only when the caller is holding the rcu read lock or objcg_lock or cgroup_mutex. It is very easy to ignore thoes conditions when users call some upper APIs which call obj_cgroup_memcg() internally like mem_cgroup_from_slab_obj() (See the link below). So it is better to add lockdep assertion to obj_cgroup_memcg() to find those issues ASAP. Because there is no user of obj_cgroup_memcg() holding objcg_lock to make the returned memory cgroup safe, do not add objcg_lock assertion (We should export objcg_lock if we really want to do). Additionally, this is some internal implementation detail of memcg and should not be accessible outside memcg code. Some users like __mem_cgroup_uncharge() do not care the lifetime of the returned memory cgroup, which just want to know if the folio is charged to a memory cgroup, therefore, they do not need to hold the needed locks. In which case, introduce a new helper folio_memcg_charged() to do this. Compare it to folio_memcg(), it could eliminate a memory access of objcg->memcg for kmem, actually, a really small gain. [songmuchun@bytedance.com: fix split_page_memcg()] Link: https://lkml.kernel.org/r/20240819080415.44964-1-songmuchun@bytedance.com Link: https://lore.kernel.org/all/20240718083607.42068-1-songmuchun@bytedance.com/ Link: https://lkml.kernel.org/r/20240814093415.17634-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1b79760af685..07eadf7ecbba 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -366,11 +366,11 @@ static inline bool folio_memcg_kmem(struct folio *folio); * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. * - * The caller must ensure that the returned memcg won't be released: - * e.g. acquire the rcu_read_lock or css_set_lock. + * The caller must ensure that the returned memcg won't be released. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { + lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex)); return READ_ONCE(objcg->memcg); } @@ -444,6 +444,19 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return __folio_memcg(folio); } +/* + * folio_memcg_charged - If a folio is charged to a memory cgroup. + * @folio: Pointer to the folio. + * + * Returns true if folio is charged to a memory cgroup, otherwise returns false. + */ +static inline bool folio_memcg_charged(struct folio *folio) +{ + if (folio_memcg_kmem(folio)) + return __folio_objcg(folio) != NULL; + return __folio_memcg(folio) != NULL; +} + /** * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. * @folio: Pointer to the folio. @@ -460,7 +473,6 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) unsigned long memcg_data = READ_ONCE(folio->memcg_data); VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - WARN_ON_ONCE(!rcu_read_lock_held()); if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; @@ -469,6 +481,8 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) return obj_cgroup_memcg(objcg); } + WARN_ON_ONCE(!rcu_read_lock_held()); + return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } -- cgit v1.2.3 From bd164d81a767f33c30d5c5b50b775631699ee976 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:28 -0400 Subject: maple_tree: introduce store_type enum Patch series "Introduce a store type enum for the Maple tree", v4. ================================ OVERVIEW ================================ This series implements two work items[3]: "aligning mas_store_gfp() with mas_preallocate()" and "enum for store type". mas_store_gfp() is modified to preallocate nodes. This simplies many of the write helper functions by allowing them to use mas_store_gfp() rather than open coding node allocation and error handling. The enum defines the following store types: enum store_type { wr_invalid, wr_new_root, wr_store_root, wr_exact_fit, wr_spanning_store, wr_split_store, wr_rebalance, wr_append, wr_node_store, wr_slot_store, }; In the current maple tree code, a walk down the tree is done in mas_preallocate() to determine the number of nodes needed for this write. After node allocation, mas_wr_store_entry() will perform another walk to determine which write helper function to use to complete the write. Rather than performing the second walk, we can store the type of write in the maple write state during node allocation and read this field to complete the write. Patches 1-16 implement this store type feature. Patch 17 is a cleanup patch to change functions that have unused return types to be void. ================================ RESULTS ================================= Phoronix t-test-1 (Seconds < Lower Is Better) v6.10-rc6 Threads: 1 33.15 Threads: 2 10.81 v6.10-rc6 + this series Threads: 1 32.69 Threads: 2 10.45 Stress-ng mmap 6.10_base store_type_v4 Duration User 2744.65 2769.40 Duration System 10862.69 10817.59 Duration Elapsed 1477.58 1478.35 ================================ TESTING ================================= Testing was done with the maple tree test suite. A new test case is also added to validate the order in which we test for and assign the store type. [1]: https://lore.kernel.org/linux-mm/80926b22-a8d2-9992-eb5e-27e2c99cf460@google.com/T/#m81044feb66765265f8ca7f21e4b4b3725b18780a [2]: https://lore.kernel.org/linux-mm/80926b22-a8d2-9992-eb5e-27e2c99cf460@google.com/T/#mb36c6526486638e82518c0f37a428fb279c84d8a [3]: https://lists.infradead.org/pipermail/maple-tree/2023-December/003098.html This patch (of 17): Add a store_type enum that is stored in ma_state. This will be used to keep track of partial walks of the tree so that subsequent walks can pick up where a previous walk left off. Link: https://lkml.kernel.org/r/20240814161944.55347-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20240814161944.55347-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index a53ad4dabd7e..8e1504a81cd2 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -148,6 +148,18 @@ enum maple_type { maple_arange_64, }; +enum store_type { + wr_invalid, + wr_new_root, + wr_store_root, + wr_exact_fit, + wr_spanning_store, + wr_split_store, + wr_rebalance, + wr_append, + wr_node_store, + wr_slot_store, +}; /** * DOC: Maple tree flags @@ -436,6 +448,7 @@ struct ma_state { unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ + enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { @@ -477,6 +490,7 @@ struct ma_wr_state { .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ + .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ -- cgit v1.2.3 From 5d383b69a04e2eb2e0ae06e91821fd82cb9acf73 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 Aug 2024 22:04:47 -0700 Subject: memcg: move v1 only percpu stats in separate struct Patch series "memcg: further decouple v1 code from v2". Some of the v1 code is still in v2 code base due to v1 fields in the struct memcg_vmstats_percpu. This field decouples those fileds from v2 struct and move all the related code into v1 only code base. This patch (of 7): At the moment struct memcg_vmstats_percpu contains two v1 only fields which consumes memory even when CONFIG_MEMCG_V1 is not enabled. In addition there are v1 only functions accessing them and are in the main memcontrol source file and can not be moved to v1 only source file due to these fields. Let's move these fields into their own struct. Later patches will move the functions accessing them to v1 source file and only allocate these fields when CONFIG_MEMCG_V1 is enabled. Link: https://lkml.kernel.org/r/20240815050453.1298138-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20240815050453.1298138-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: T.J. Mercier Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 07eadf7ecbba..08b7121cf43a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -70,6 +70,7 @@ struct mem_cgroup_id { }; struct memcg_vmstats_percpu; +struct memcg1_events_percpu; struct memcg_vmstats; struct lruvec_stats_percpu; struct lruvec_stats; @@ -254,6 +255,7 @@ struct mem_cgroup { struct list_head objcg_list; struct memcg_vmstats_percpu __percpu *vmstats_percpu; + struct memcg1_events_percpu __percpu *events_percpu; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; -- cgit v1.2.3 From 0ccaf421d6592bb99bb9b424e4ccca3c6367d799 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 Aug 2024 22:04:52 -0700 Subject: memcg: allocate v1 event percpu only on v1 deployment Currently memcg->events_percpu gets allocated on v2 deployments. Let's move the allocation to v1 only codebase. This is not needed in v2. Link: https://lkml.kernel.org/r/20240815050453.1298138-7-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: T.J. Mercier Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 08b7121cf43a..ed170399179a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -255,7 +255,6 @@ struct mem_cgroup { struct list_head objcg_list; struct memcg_vmstats_percpu __percpu *vmstats_percpu; - struct memcg1_events_percpu __percpu *events_percpu; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; @@ -277,6 +276,8 @@ struct mem_cgroup { struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ + struct memcg1_events_percpu __percpu *events_percpu; + unsigned long soft_limit; /* protected by memcg_oom_lock */ -- cgit v1.2.3 From 836d13a6ef8a2eb0eab2bd2de06f2deabc62b060 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:18 +0300 Subject: xz: switch from public domain to BSD Zero Clause License (0BSD) Remove the public domain notices and add SPDX license identifiers. Change MODULE_LICENSE from "GPL" to "Dual BSD/GPL" because 0BSD should count as a BSD license variant here. The switch to 0BSD was done in the upstream XZ Embedded project because public domain has (real or perceived) legal issues in some jurisdictions. Link: https://lkml.kernel.org/r/20240721133633.47721-4-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Reviewed-by: Sam James Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Herbert Xu Cc: Joel Stanley Cc: Jonathan Corbet Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/decompress/unxz.h | 5 ++--- include/linux/xz.h | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/decompress/unxz.h b/include/linux/decompress/unxz.h index f764e2a7201e..3dd2658a9dab 100644 --- a/include/linux/decompress/unxz.h +++ b/include/linux/decompress/unxz.h @@ -1,10 +1,9 @@ +/* SPDX-License-Identifier: 0BSD */ + /* * Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd * * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #ifndef DECOMPRESS_UNXZ_H diff --git a/include/linux/xz.h b/include/linux/xz.h index 7285ca5d56e9..5728d57aecc0 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -1,11 +1,10 @@ +/* SPDX-License-Identifier: 0BSD */ + /* * XZ decompressor * * Authors: Lasse Collin * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #ifndef XZ_H -- cgit v1.2.3 From ad8c67b870d108aa1286f4fc76d0c29a736fd75e Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:20 +0300 Subject: xz: fix kernel-doc formatting errors in xz.h The opaque structs xz_dec and xz_dec_microlzma are declared in xz.h but their definitions are in xz_dec_lzma2.c without kernel-doc comments. Use regular comments for these structs in xz.h to avoid errors when building the docs. Add a few missing colons. Link: https://lkml.kernel.org/r/20240721133633.47721-6-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Reviewed-by: Sam James Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joel Stanley Cc: Jonathan Corbet Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/xz.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xz.h b/include/linux/xz.h index 5728d57aecc0..af1e075d9add 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -142,7 +142,7 @@ struct xz_buf { size_t out_size; }; -/** +/* * struct xz_dec - Opaque type to hold the XZ decoder state */ struct xz_dec; @@ -240,15 +240,16 @@ XZ_EXTERN void xz_dec_end(struct xz_dec *s); * marked with XZ_EXTERN. This avoids warnings about static functions that * are never defined. */ -/** + +/* * struct xz_dec_microlzma - Opaque type to hold the MicroLZMA decoder state */ struct xz_dec_microlzma; /** * xz_dec_microlzma_alloc() - Allocate memory for the MicroLZMA decoder - * @mode XZ_SINGLE or XZ_PREALLOC - * @dict_size LZMA dictionary size. This must be at least 4 KiB and + * @mode: XZ_SINGLE or XZ_PREALLOC + * @dict_size: LZMA dictionary size. This must be at least 4 KiB and * at most 3 GiB. * * In contrast to xz_dec_init(), this function only allocates the memory @@ -276,15 +277,15 @@ extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, /** * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state - * @s Decoder state allocated using xz_dec_microlzma_alloc() - * @comp_size Compressed size of the input stream - * @uncomp_size Uncompressed size of the input stream. A value smaller + * @s: Decoder state allocated using xz_dec_microlzma_alloc() + * @comp_size: Compressed size of the input stream + * @uncomp_size: Uncompressed size of the input stream. A value smaller * than the real uncompressed size of the input stream can * be specified if uncomp_size_is_exact is set to false. * uncomp_size can never be set to a value larger than the * expected real uncompressed size because it would eventually * result in XZ_DATA_ERROR. - * @uncomp_size_is_exact This is an int instead of bool to avoid + * @uncomp_size_is_exact: This is an int instead of bool to avoid * requiring stdbool.h. This should normally be set to true. * When this is set to false, error detection is weaker. */ @@ -294,7 +295,7 @@ extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, /** * xz_dec_microlzma_run() - Run the MicroLZMA decoder - * @s Decoder state initialized using xz_dec_microlzma_reset() + * @s: Decoder state initialized using xz_dec_microlzma_reset() * @b: Input and output buffers * * This works similarly to xz_dec_run() with a few important differences. -- cgit v1.2.3 From 0f2c5996340b69d167d3f6ca38d3012204787ac1 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:21 +0300 Subject: xz: improve the MicroLZMA kernel-doc in xz.h Move the description of the format into a "DOC:" comment. Emphasize that MicroLZMA functions aren't usually needed. Link: https://lkml.kernel.org/r/20240721133633.47721-7-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Reviewed-by: Sam James Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joel Stanley Cc: Jonathan Corbet Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/xz.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xz.h b/include/linux/xz.h index af1e075d9add..701d62c02b9a 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -232,9 +232,18 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s); */ XZ_EXTERN void xz_dec_end(struct xz_dec *s); -/* - * Decompressor for MicroLZMA, an LZMA variant with a very minimal header. - * See xz_dec_microlzma_alloc() below for details. +/** + * DOC: MicroLZMA decompressor + * + * This MicroLZMA header format was created for use in EROFS but may be used + * by others too. **In most cases one needs the XZ APIs above instead.** + * + * The compressed format supported by this decoder is a raw LZMA stream + * whose first byte (always 0x00) has been replaced with bitwise-negation + * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is + * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00. + * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream + * marker must not be used. The unused values are reserved for future use. * * These functions aren't used or available in preboot code and thus aren't * marked with XZ_EXTERN. This avoids warnings about static functions that @@ -262,15 +271,6 @@ struct xz_dec_microlzma; * On success, xz_dec_microlzma_alloc() returns a pointer to * struct xz_dec_microlzma. If memory allocation fails or * dict_size is invalid, NULL is returned. - * - * The compressed format supported by this decoder is a raw LZMA stream - * whose first byte (always 0x00) has been replaced with bitwise-negation - * of the LZMA properties (lc/lp/pb) byte. For example, if lc/lp/pb is - * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00. - * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream - * marker must not be used. The unused values are reserved for future use. - * This MicroLZMA header format was created for use in EROFS but may be used - * by others too. */ extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, uint32_t dict_size); -- cgit v1.2.3 From c6f371bab25edccd39caa5dd452b50d9dfdf4ff0 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 24 Jul 2024 14:05:41 +0300 Subject: xz: remove XZ_EXTERN and extern from functions XZ_EXTERN was used to make internal functions static in the preboot code. However, in other decompressors this hasn't been done. On x86-64, this makes no difference to the kernel image size. Omit XZ_EXTERN and let some of the internal functions be extern in the preboot code. Omitting XZ_EXTERN from include/linux/xz.h fixes warnings in "make htmldocs" and makes the intradocument links to xz_dec functions work in Documentation/staging/xz.rst. The alternative would have been to add "XZ_EXTERN" to c_id_attributes in Documentation/conf.py but omitting XZ_EXTERN seemed cleaner. Link: https://lore.kernel.org/lkml/20240723205437.3c0664b0@kaneli/ Link: https://lkml.kernel.org/r/20240724110544.16430-1-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Tested-by: Michael Ellerman (powerpc) Cc: Jonathan Corbet Cc: Sam James Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joel Stanley Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/xz.h | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/xz.h b/include/linux/xz.h index 701d62c02b9a..58ae1d746c6f 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -18,11 +18,6 @@ # include #endif -/* In Linux, this is used to make extern functions static when needed. */ -#ifndef XZ_EXTERN -# define XZ_EXTERN extern -#endif - /** * enum xz_mode - Operation mode * @@ -190,7 +185,7 @@ struct xz_dec; * ready to be used with xz_dec_run(). If memory allocation fails, * xz_dec_init() returns NULL. */ -XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max); +struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max); /** * xz_dec_run() - Run the XZ decoder @@ -210,7 +205,7 @@ XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max); * get that amount valid data from the beginning of the stream. You must use * the multi-call decoder if you don't want to uncompress the whole stream. */ -XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b); +enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b); /** * xz_dec_reset() - Reset an already allocated decoder state @@ -223,14 +218,14 @@ XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b); * xz_dec_run(). Thus, explicit call to xz_dec_reset() is useful only in * multi-call mode. */ -XZ_EXTERN void xz_dec_reset(struct xz_dec *s); +void xz_dec_reset(struct xz_dec *s); /** * xz_dec_end() - Free the memory allocated for the decoder state * @s: Decoder state allocated using xz_dec_init(). If s is NULL, * this function does nothing. */ -XZ_EXTERN void xz_dec_end(struct xz_dec *s); +void xz_dec_end(struct xz_dec *s); /** * DOC: MicroLZMA decompressor @@ -244,10 +239,6 @@ XZ_EXTERN void xz_dec_end(struct xz_dec *s); * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00. * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream * marker must not be used. The unused values are reserved for future use. - * - * These functions aren't used or available in preboot code and thus aren't - * marked with XZ_EXTERN. This avoids warnings about static functions that - * are never defined. */ /* @@ -272,8 +263,8 @@ struct xz_dec_microlzma; * struct xz_dec_microlzma. If memory allocation fails or * dict_size is invalid, NULL is returned. */ -extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, - uint32_t dict_size); +struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size); /** * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state @@ -289,9 +280,8 @@ extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, * requiring stdbool.h. This should normally be set to true. * When this is set to false, error detection is weaker. */ -extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, - uint32_t comp_size, uint32_t uncomp_size, - int uncomp_size_is_exact); +void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size, + uint32_t uncomp_size, int uncomp_size_is_exact); /** * xz_dec_microlzma_run() - Run the MicroLZMA decoder @@ -329,15 +319,14 @@ extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, * may be changed normally like with XZ_PREALLOC. This way input data can be * provided from non-contiguous memory. */ -extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, - struct xz_buf *b); +enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, struct xz_buf *b); /** * xz_dec_microlzma_end() - Free the memory allocated for the decoder state * @s: Decoder state allocated using xz_dec_microlzma_alloc(). * If s is NULL, this function does nothing. */ -extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s); +void xz_dec_microlzma_end(struct xz_dec_microlzma *s); /* * Standalone build (userspace build or in-kernel build for boot time use) @@ -358,13 +347,13 @@ extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s); * This must be called before any other xz_* function to initialize * the CRC32 lookup table. */ -XZ_EXTERN void xz_crc32_init(void); +void xz_crc32_init(void); /* * Update CRC32 value using the polynomial from IEEE-802.3. To start a new * calculation, the third argument must be zero. To continue the calculation, * the previously returned value is passed as the third argument. */ -XZ_EXTERN uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc); +uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc); #endif #endif -- cgit v1.2.3 From 9a42bfd255b288dad2d1a9df0a1fe58394d5da12 Mon Sep 17 00:00:00 2001 From: Deshan Zhang Date: Thu, 25 Jul 2024 17:30:45 +0800 Subject: lib/lru_cache: fix spelling mistake "colision"->"collision" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a spelling mistake in a literal string and in cariable names. Fix these. Link: https://lkml.kernel.org/r/20240725093044.1742842-1-deshan@nfschina.com Signed-off-by: Deshan Zhang Cc: Christoph Böhmwalder Cc: Lars Ellenberg Cc: Philipp Reisner Signed-off-by: Andrew Morton --- include/linux/lru_cache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index c9afcdd9324c..ff82ef85a084 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -119,7 +119,7 @@ write intent log information, three of which are mentioned here. */ /* this defines an element in a tracked set - * .colision is for hash table lookup. + * .collision is for hash table lookup. * When we process a new IO request, we know its sector, thus can deduce the * region number (label) easily. To do the label -> object lookup without a * full list walk, we use a simple hash table. @@ -145,7 +145,7 @@ write intent log information, three of which are mentioned here. * But it avoids high order page allocations in kmalloc. */ struct lc_element { - struct hlist_node colision; + struct hlist_node collision; struct list_head list; /* LRU list or free list */ unsigned refcnt; /* back "pointer" into lc_cache->element[index], -- cgit v1.2.3 From 6ce2082fd3a25d5a8c756120959237cace0379f1 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 13 Aug 2024 15:12:35 +0300 Subject: fault-inject: improve build for CONFIG_FAULT_INJECTION=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fault-inject.h users across the kernel need to add a lot of #ifdef CONFIG_FAULT_INJECTION to cater for shortcomings in the header. Make fault-inject.h self-contained for CONFIG_FAULT_INJECTION=n, and add stubs for DECLARE_FAULT_ATTR(), setup_fault_attr(), should_fail_ex(), and should_fail() to allow removal of conditional compilation. [akpm@linux-foundation.org: repair fallout from no longer including debugfs.h into fault-inject.h] [akpm@linux-foundation.org: fix drivers/misc/xilinx_tmr_inject.c] [akpm@linux-foundation.org: Add debugfs.h inclusion to more files, per Stephen] Link: https://lkml.kernel.org/r/20240813121237.2382534-1-jani.nikula@intel.com Fixes: 6ff1cb355e62 ("[PATCH] fault-injection capabilities infrastructure") Signed-off-by: Jani Nikula Cc: Akinobu Mita Cc: Abhinav Kumar Cc: Dmitry Baryshkov Cc: Himal Prasad Ghimiray Cc: Lucas De Marchi Cc: Rob Clark Cc: Rodrigo Vivi Cc: Thomas Hellström Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/fault-inject.h | 36 +++++++++++++++++++++++++++++------- include/linux/mmc/host.h | 1 + 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 354413950d34..8c829d28dcf3 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -2,13 +2,17 @@ #ifndef _LINUX_FAULT_INJECT_H #define _LINUX_FAULT_INJECT_H +#include +#include + +struct dentry; +struct kmem_cache; + #ifdef CONFIG_FAULT_INJECTION -#include -#include +#include #include #include -#include /* * For explanation of the elements of this struct, see @@ -51,6 +55,28 @@ int setup_fault_attr(struct fault_attr *attr, char *str); bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags); bool should_fail(struct fault_attr *attr, ssize_t size); +#else /* CONFIG_FAULT_INJECTION */ + +struct fault_attr { +}; + +#define DECLARE_FAULT_ATTR(name) struct fault_attr name = {} + +static inline int setup_fault_attr(struct fault_attr *attr, char *str) +{ + return 0; /* Note: 0 means error for __setup() handlers! */ +} +static inline bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags) +{ + return false; +} +static inline bool should_fail(struct fault_attr *attr, ssize_t size) +{ + return false; +} + +#endif /* CONFIG_FAULT_INJECTION */ + #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS struct dentry *fault_create_debugfs_attr(const char *name, @@ -87,10 +113,6 @@ static inline void fault_config_init(struct fault_config *config, #endif /* CONFIG_FAULT_INJECTION_CONFIGFS */ -#endif /* CONFIG_FAULT_INJECTION */ - -struct kmem_cache; - #ifdef CONFIG_FAIL_PAGE_ALLOC bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); #else diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 88c6a76042ee..49470188fca7 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From d994c238347d7ba4de15da00985e1bea75e91dc7 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 17 Aug 2024 14:37:54 +0200 Subject: ratelimit: convert flags to int to save 8 bytes in size Only bit 1 is used, making an unsigned long a total overkill. This brings it from 40 to 32 bytes, which in turn shrinks user_struct from 136 to 128 bytes. Since the latter is allocated with hwalign, this means the total usage goes down from 192 to 128 bytes per object. No functional changes. Link: https://lkml.kernel.org/r/20240817123754.240924-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Signed-off-by: Andrew Morton --- include/linux/ratelimit_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h index 002266693e50..765232ce0b5e 100644 --- a/include/linux/ratelimit_types.h +++ b/include/linux/ratelimit_types.h @@ -19,8 +19,8 @@ struct ratelimit_state { int burst; int printed; int missed; + unsigned int flags; unsigned long begin; - unsigned long flags; }; #define RATELIMIT_STATE_INIT_FLAGS(name, interval_init, burst_init, flags_init) { \ -- cgit v1.2.3 From 32cebfe1cc21a84e4a907575bb9fd1c3f6b091fd Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 27 Aug 2024 10:45:15 +0800 Subject: lib/string_choices: add str_true_false()/str_false_true() helper Add str_true_false()/str_false_true() helper to retur a "true" or "false" string literal. We found more than 10 cases currently exist in the tree. So these helpers can be used for these cases. This patch (of 3): Add str_true_false()/str_false_true() helper to return "true" or "false" string literal. Link: https://lkml.kernel.org/r/20240827024517.914100-1-lihongbo22@huawei.com Link: https://lkml.kernel.org/r/20240827024517.914100-2-lihongbo22@huawei.com Signed-off-by: Hongbo Li Cc: Andy Shevchenko Cc: Anna Schumaker Cc: Greg Kroah-Hartman Cc: Kees Cook Cc: Matthew Wilcox Cc: Trond Myklebust Signed-off-by: Andrew Morton --- include/linux/string_choices.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index d9ebe20229f8..4a2432313b8e 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -42,6 +42,12 @@ static inline const char *str_yes_no(bool v) return v ? "yes" : "no"; } +static inline const char *str_true_false(bool v) +{ + return v ? "true" : "false"; +} +#define str_false_true(v) str_true_false(!(v)) + /** * str_plural - Return the simple pluralization based on English counts * @num: Number used for deciding pluralization -- cgit v1.2.3 From c4a315eaf8eff0d3234600e13db7e7c71c0b3405 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 21 Aug 2024 14:14:56 +0200 Subject: gpio: ath79: remove support for platform data There are no more board files defining platform data for this driver so remove the header and support from the driver. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240821121456.19553-4-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/platform_data/gpio-ath79.h | 16 ---------------- 1 file changed, 16 deletions(-) delete mode 100644 include/linux/platform_data/gpio-ath79.h (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-ath79.h b/include/linux/platform_data/gpio-ath79.h deleted file mode 100644 index 3ea6dd942c27..000000000000 --- a/include/linux/platform_data/gpio-ath79.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Atheros AR7XXX/AR9XXX GPIO controller platform data - * - * Copyright (C) 2015 Alban Bedel - */ - -#ifndef __LINUX_PLATFORM_DATA_GPIO_ATH79_H -#define __LINUX_PLATFORM_DATA_GPIO_ATH79_H - -struct ath79_gpio_platform_data { - unsigned ngpios; - bool oe_inverted; -}; - -#endif -- cgit v1.2.3 From 4b2b0a2ce8153d65d0829e45e73bf6acdc291344 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 28 Aug 2024 17:23:57 +0300 Subject: gpiolib: legacy: Kill GPIOF_INIT_* definitions Besides the fact that (old) drivers use wrong definitions, e.g., GPIOF_INIT_HIGH instead of GPIOF_OUT_INIT_HIGH, shrink the legacy definitions by killing those GPIOF_INIT_* completely. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Acked-by: Alexander Sverdlin Reviewed-by: Alexander Sverdlin Link: https://lore.kernel.org/r/20240828142554.2424189-2-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 063f71b18a7c..4af8ad114557 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -20,12 +20,9 @@ struct device; #define GPIOF_DIR_OUT (0 << 0) #define GPIOF_DIR_IN (1 << 0) -#define GPIOF_INIT_LOW (0 << 1) -#define GPIOF_INIT_HIGH (1 << 1) - #define GPIOF_IN (GPIOF_DIR_IN) -#define GPIOF_OUT_INIT_LOW (GPIOF_DIR_OUT | GPIOF_INIT_LOW) -#define GPIOF_OUT_INIT_HIGH (GPIOF_DIR_OUT | GPIOF_INIT_HIGH) +#define GPIOF_OUT_INIT_LOW (GPIOF_DIR_OUT | (0 << 1)) +#define GPIOF_OUT_INIT_HIGH (GPIOF_DIR_OUT | (1 << 1)) /* Gpio pin is active-low */ #define GPIOF_ACTIVE_LOW (1 << 2) -- cgit v1.2.3 From 8c045ca534d03bab1ce4b4de49d29a36276a1e35 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 28 Aug 2024 17:23:58 +0300 Subject: gpiolib: legacy: Kill GPIOF_DIR_* definitions Besides the fact that (old) drivers use wrong definitions, e.g., GPIOF_DIR_IN instead of GPIOF_IN, shrink the legacy definitions by killing those GPIOF_DIR_* completely. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Acked-by: Alexander Sverdlin Reviewed-by: Alexander Sverdlin Link: https://lore.kernel.org/r/20240828142554.2424189-3-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 4af8ad114557..2d105be7bbc3 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -17,12 +17,9 @@ struct device; /* make these flag values available regardless of GPIO kconfig options */ -#define GPIOF_DIR_OUT (0 << 0) -#define GPIOF_DIR_IN (1 << 0) - -#define GPIOF_IN (GPIOF_DIR_IN) -#define GPIOF_OUT_INIT_LOW (GPIOF_DIR_OUT | (0 << 1)) -#define GPIOF_OUT_INIT_HIGH (GPIOF_DIR_OUT | (1 << 1)) +#define GPIOF_IN ((1 << 0)) +#define GPIOF_OUT_INIT_LOW ((0 << 0) | (0 << 1)) +#define GPIOF_OUT_INIT_HIGH ((0 << 0) | (1 << 1)) /* Gpio pin is active-low */ #define GPIOF_ACTIVE_LOW (1 << 2) -- cgit v1.2.3 From e220917fa50774fedb27c075df2261fd664e8ca3 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Thu, 22 Aug 2024 15:50:12 +0200 Subject: mm: split a folio in minimum folio order chunks split_folio() and split_folio_to_list() assume order 0, to support minorder for non-anonymous folios, we must expand these to check the folio mapping order and use that. Set new_order to be at least minimum folio order if it is set in split_huge_page_to_list() so that we can maintain minimum folio order requirement in the page cache. Update the debugfs write files used for testing to ensure the order is respected as well. We simply enforce the min order when a file mapping is used. Signed-off-by: Luis Chamberlain Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240902124931.506061-2-kernel@pankajraghav.com # folded fix Link: https://lore.kernel.org/r/20240822135018.1931258-5-kernel@pankajraghav.com Tested-by: David Howells Reviewed-by: Hannes Reinecke Reviewed-by: Zi Yan Signed-off-by: Christian Brauner --- include/linux/huge_mm.h | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e25d9ebfdf89..7a570e0437c9 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -96,6 +96,8 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) +#define split_folio(f) split_folio_to_list(f, NULL) + #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PUD_SHIFT PUD_SHIFT @@ -317,9 +319,24 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); +int min_order_for_split(struct folio *folio); +int split_folio_to_list(struct folio *folio, struct list_head *list); static inline int split_huge_page(struct page *page) { - return split_huge_page_to_list_to_order(page, NULL, 0); + struct folio *folio = page_folio(page); + int ret = min_order_for_split(folio); + + if (ret < 0) + return ret; + + /* + * split_huge_page() locks the page before splitting and + * expects the same page that has been split to be locked when + * returned. split_folio(page_folio(page)) cannot be used here + * because it converts the page to folio and passes the head + * page to be split. + */ + return split_huge_page_to_list_to_order(page, NULL, ret); } void deferred_split_folio(struct folio *folio); @@ -484,6 +501,12 @@ static inline int split_huge_page(struct page *page) { return 0; } + +static inline int split_folio_to_list(struct folio *folio, struct list_head *list) +{ + return 0; +} + static inline void deferred_split_folio(struct folio *folio) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) @@ -598,7 +621,4 @@ static inline int split_folio_to_order(struct folio *folio, int new_order) return split_folio_to_list_to_order(folio, NULL, new_order); } -#define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) -#define split_folio(f) split_folio_to_order(f, 0) - #endif /* _LINUX_HUGE_MM_H */ -- cgit v1.2.3 From f46a6e16519712651e2304da03ec144cc0bb084c Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 30 Aug 2024 18:26:28 +0300 Subject: usb: Add tunnel_mode parameter to usb device structure Add 'tunnel_mode' enum to usb device structure to describe if a USB3 link is tunneled over USB4, or connected directly using native USB2/USB3 protocols. Tunneled devices depend on USB4 NHI host to maintain the tunnel. Knowledge about tunneled devices is important to ensure correct suspend and resume order between USB4 hosts and tunneled devices. i.e. make sure tunnel is up before the USB device using it resumes. USB hosts such as xHCI may have vendor specific ways to detect tunneled connections. This 'tunnel_mode' parameter can be set by USB3 host driver during hcd->driver->update_device(hcd, udev) callback. tunnel_mode can be set to: USB_LINK_UNKNOWN = 0 USB_LINK_NATIVE USB_LINK_TUNNELED USB_LINK_UNKNOWN is used in case host is not capable of detecting tunneled links. Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20240830152630.3943215-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 832997a9da0a..672d8fc2abdb 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -495,6 +495,12 @@ struct usb_dev_state; struct usb_tt; +enum usb_link_tunnel_mode { + USB_LINK_UNKNOWN = 0, + USB_LINK_NATIVE, + USB_LINK_TUNNELED, +}; + enum usb_port_connect_type { USB_PORT_CONNECT_TYPE_UNKNOWN = 0, USB_PORT_CONNECT_TYPE_HOT_PLUG, @@ -605,6 +611,7 @@ struct usb3_lpm_parameters { * WUSB devices are not, until we authorize them from user space. * FIXME -- complete doc * @authenticated: Crypto authentication passed + * @tunnel_mode: Connection native or tunneled over USB4 * @lpm_capable: device supports LPM * @lpm_devinit_allow: Allow USB3 device initiated LPM, exit latency is in range * @usb2_hw_lpm_capable: device can perform USB2 hardware LPM @@ -714,6 +721,7 @@ struct usb_device { unsigned do_remote_wakeup:1; unsigned reset_resume:1; unsigned port_is_suspended:1; + enum usb_link_tunnel_mode tunnel_mode; int slot_id; struct usb2_lpm_parameters l1_params; -- cgit v1.2.3 From d9c61bb33fbbcbc2d5f69efa10db116dab4b56cc Mon Sep 17 00:00:00 2001 From: Michael Grzeschik Date: Sun, 1 Sep 2024 21:11:16 +0200 Subject: usb: gadget: function: move u_f.h to include/linux/usb/func_utils.h We move the func_utils.h header to include/linux/usb to be able to compile function drivers outside of the drivers/usb/gadget/function directory. Signed-off-by: Michael Grzeschik Link: https://lore.kernel.org/r/20240116-ml-topic-u9p-v12-1-9a27de5160e0@pengutronix.de Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/func_utils.h | 86 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 include/linux/usb/func_utils.h (limited to 'include/linux') diff --git a/include/linux/usb/func_utils.h b/include/linux/usb/func_utils.h new file mode 100644 index 000000000000..c8795c965109 --- /dev/null +++ b/include/linux/usb/func_utils.h @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * func_utils.h + * + * Utility definitions for USB functions + * + * Copyright (c) 2013 Samsung Electronics Co., Ltd. + * http://www.samsung.com + * + * Author: Andrzej Pietrasiewicz + */ + +#ifndef _FUNC_UTILS_H_ +#define _FUNC_UTILS_H_ + +#include +#include + +/* Variable Length Array Macros **********************************************/ +#define vla_group(groupname) size_t groupname##__next = 0 +#define vla_group_size(groupname) groupname##__next + +#define vla_item(groupname, type, name, n) \ + size_t groupname##_##name##__offset = ({ \ + size_t offset = 0; \ + if (groupname##__next != SIZE_MAX) { \ + size_t align_mask = __alignof__(type) - 1; \ + size_t size = array_size(n, sizeof(type)); \ + offset = (groupname##__next + align_mask) & \ + ~align_mask; \ + if (check_add_overflow(offset, size, \ + &groupname##__next)) { \ + groupname##__next = SIZE_MAX; \ + offset = 0; \ + } \ + } \ + offset; \ + }) + +#define vla_item_with_sz(groupname, type, name, n) \ + size_t groupname##_##name##__sz = array_size(n, sizeof(type)); \ + size_t groupname##_##name##__offset = ({ \ + size_t offset = 0; \ + if (groupname##__next != SIZE_MAX) { \ + size_t align_mask = __alignof__(type) - 1; \ + offset = (groupname##__next + align_mask) & \ + ~align_mask; \ + if (check_add_overflow(offset, groupname##_##name##__sz,\ + &groupname##__next)) { \ + groupname##__next = SIZE_MAX; \ + offset = 0; \ + } \ + } \ + offset; \ + }) + +#define vla_ptr(ptr, groupname, name) \ + ((void *) ((char *)ptr + groupname##_##name##__offset)) + +struct usb_ep; +struct usb_request; + +/** + * alloc_ep_req - returns a usb_request allocated by the gadget driver and + * allocates the request's buffer. + * + * @ep: the endpoint to allocate a usb_request + * @len: usb_requests's buffer suggested size + * + * In case @ep direction is OUT, the @len will be aligned to ep's + * wMaxPacketSize. In order to avoid memory leaks or drops, *always* use + * usb_requests's length (req->length) to refer to the allocated buffer size. + * Requests allocated via alloc_ep_req() *must* be freed by free_ep_req(). + */ +struct usb_request *alloc_ep_req(struct usb_ep *ep, size_t len); + +/* Frees a usb_request previously allocated by alloc_ep_req() */ +static inline void free_ep_req(struct usb_ep *ep, struct usb_request *req) +{ + WARN_ON(req->buf == NULL); + kfree(req->buf); + req->buf = NULL; + usb_ep_free_request(ep, req); +} + +#endif /* _FUNC_UTILS_H_ */ -- cgit v1.2.3 From beb5a9bea8239cdf4adf6b62672e30db3e9fa5ce Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 29 Aug 2024 14:33:36 +0200 Subject: netdevice: convert private flags > BIT(31) to bitfields Make dev->priv_flags `u32` back and define bits higher than 31 as bitfield booleans as per Jakub's suggestion. This simplifies code which accesses these bits with no optimization loss (testb both before/after), allows to not extend &netdev_priv_flags each time, but also scales better as bits > 63 in the future would only add a new u64 to the structure with no complications, comparing to that extending ::priv_flags would require converting it to a bitmap. Note that I picked `unsigned long :1` to not lose any potential optimizations comparing to `bool :1` etc. Suggested-by: Jakub Kicinski Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index fce70990b209..d6f35c9d8580 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1613,7 +1613,8 @@ struct net_device_ops { * userspace; this means that the order of these flags can change * during any kernel release. * - * You should have a pretty good reason to be extending these flags. + * You should add bitfield booleans after either net_device::priv_flags + * (hotpath) or ::threaded (slowpath) instead of extending these flags. * * @IFF_802_1Q_VLAN: 802.1Q VLAN device * @IFF_EBRIDGE: Ethernet bridging device @@ -1652,10 +1653,6 @@ struct net_device_ops { * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) - * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN - * @IFF_SEE_ALL_HWTSTAMP_REQUESTS: device wants to see calls to - * ndo_hwtstamp_set() for all timestamp requests regardless of source, - * even if those aren't HWTSTAMP_SOURCE_NETDEV. */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1690,8 +1687,6 @@ enum netdev_priv_flags { IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), - IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), - IFF_SEE_ALL_HWTSTAMP_REQUESTS = BIT_ULL(33), }; /* Specifies the type of the struct net_device::ml_priv pointer */ @@ -1723,6 +1718,9 @@ enum netdev_reg_state { * data with strictly "high-level" data, and it has to know about * almost every data structure used in the INET module. * + * @priv_flags: flags invisible to userspace defined as bits, see + * enum netdev_priv_flags for the definitions + * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name * of the interface. @@ -1789,8 +1787,6 @@ enum netdev_reg_state { * * @flags: Interface flags (a la BSD) * @xdp_features: XDP capability supported by the device - * @priv_flags: Like 'flags' but invisible to userspace, - * see if.h for the definitions * @gflags: Global flags ( kept as legacy ) * @priv_len: Size of the ->priv flexible array * @priv: Flexible array containing private data @@ -1964,6 +1960,12 @@ enum netdev_reg_state { * * @threaded: napi threaded mode is enabled * + * @see_all_hwtstamp_requests: device wants to see calls to + * ndo_hwtstamp_set() for all timestamp requests + * regardless of source, even if those aren't + * HWTSTAMP_SOURCE_NETDEV + * @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN + * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved * to another network namespace. @@ -2014,7 +2016,9 @@ struct net_device { /* TX read-mostly hotpath */ __cacheline_group_begin(net_device_read_tx); - unsigned long long priv_flags; + struct_group(priv_flags_fast, + unsigned long priv_flags:32; + ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; struct netdev_queue *_tx; @@ -2350,6 +2354,10 @@ struct net_device { bool proto_down; bool threaded; + /* priv_flags_slow, ungrouped to save space */ + unsigned long see_all_hwtstamp_requests:1; + unsigned long change_proto_down:1; + struct list_head net_notifier_list; #if IS_ENABLED(CONFIG_MACSEC) -- cgit v1.2.3 From 00d066a4d4edbe559ba6c35153da71d4b2b8a383 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 29 Aug 2024 14:33:37 +0200 Subject: netdev_features: convert NETIF_F_LLTX to dev->lltx NETIF_F_LLTX can't be changed via Ethtool and is not a feature, rather an attribute, very similar to IFF_NO_QUEUE (and hot). Free one netdev_features_t bit and make it a "hot" private flag. Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/linux/netdev_features.h | 6 ++---- include/linux/netdevice.h | 10 +++++++--- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 7c2d77d75a88..a2e20b517584 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -24,8 +24,7 @@ enum { NETIF_F_HW_VLAN_CTAG_FILTER_BIT,/* Receive filtering on VLAN CTAGs */ NETIF_F_VLAN_CHALLENGED_BIT, /* Device cannot handle VLAN packets */ NETIF_F_GSO_BIT, /* Enable software GSO. */ - NETIF_F_LLTX_BIT, /* LockLess TX - deprecated. Please */ - /* do not use LLTX in new drivers */ + __UNUSED_NETIF_F_12, NETIF_F_NETNS_LOCAL_BIT, /* Does not change network namespaces */ NETIF_F_GRO_BIT, /* Generic receive offload */ NETIF_F_LRO_BIT, /* large receive offload */ @@ -120,7 +119,6 @@ enum { #define NETIF_F_HW_VLAN_CTAG_TX __NETIF_F(HW_VLAN_CTAG_TX) #define NETIF_F_IP_CSUM __NETIF_F(IP_CSUM) #define NETIF_F_IPV6_CSUM __NETIF_F(IPV6_CSUM) -#define NETIF_F_LLTX __NETIF_F(LLTX) #define NETIF_F_LOOPBACK __NETIF_F(LOOPBACK) #define NETIF_F_LRO __NETIF_F(LRO) #define NETIF_F_NETNS_LOCAL __NETIF_F(NETNS_LOCAL) @@ -193,7 +191,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ #define NETIF_F_NEVER_CHANGE (NETIF_F_VLAN_CHALLENGED | \ - NETIF_F_LLTX | NETIF_F_NETNS_LOCAL) + NETIF_F_NETNS_LOCAL) /* remember that ((t)1 << t_BITS) is undefined in C99 */ #define NETIF_F_ETHTOOL_BITS ((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d6f35c9d8580..e22ca5e07490 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1720,6 +1720,9 @@ enum netdev_reg_state { * * @priv_flags: flags invisible to userspace defined as bits, see * enum netdev_priv_flags for the definitions + * @lltx: device supports lockless Tx. Deprecated for real HW + * drivers. Mainly used by logical interfaces, such as + * bonding and tunnels * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name @@ -2018,6 +2021,7 @@ struct net_device { __cacheline_group_begin(net_device_read_tx); struct_group(priv_flags_fast, unsigned long priv_flags:32; + unsigned long lltx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; @@ -4433,7 +4437,7 @@ static inline void netif_tx_unlock_bh(struct net_device *dev) } #define HARD_TX_LOCK(dev, txq, cpu) { \ - if ((dev->features & NETIF_F_LLTX) == 0) { \ + if (!(dev)->lltx) { \ __netif_tx_lock(txq, cpu); \ } else { \ __netif_tx_acquire(txq); \ @@ -4441,12 +4445,12 @@ static inline void netif_tx_unlock_bh(struct net_device *dev) } #define HARD_TX_TRYLOCK(dev, txq) \ - (((dev->features & NETIF_F_LLTX) == 0) ? \ + (!(dev)->lltx ? \ __netif_tx_trylock(txq) : \ __netif_tx_acquire(txq)) #define HARD_TX_UNLOCK(dev, txq) { \ - if ((dev->features & NETIF_F_LLTX) == 0) { \ + if (!(dev)->lltx) { \ __netif_tx_unlock(txq); \ } else { \ __netif_tx_release(txq); \ -- cgit v1.2.3 From 05c1280a2bcfca187fe7fa90bb240602cf54af0a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 29 Aug 2024 14:33:38 +0200 Subject: netdev_features: convert NETIF_F_NETNS_LOCAL to dev->netns_local "Interface can't change network namespaces" is rather an attribute, not a feature, and it can't be changed via Ethtool. Make it a "cold" private flag instead of a netdev_feature and free one more bit. Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/linux/netdev_features.h | 6 ++---- include/linux/netdevice.h | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index a2e20b517584..d5a3836f4793 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -25,7 +25,7 @@ enum { NETIF_F_VLAN_CHALLENGED_BIT, /* Device cannot handle VLAN packets */ NETIF_F_GSO_BIT, /* Enable software GSO. */ __UNUSED_NETIF_F_12, - NETIF_F_NETNS_LOCAL_BIT, /* Does not change network namespaces */ + __UNUSED_NETIF_F_13, NETIF_F_GRO_BIT, /* Generic receive offload */ NETIF_F_LRO_BIT, /* large receive offload */ @@ -121,7 +121,6 @@ enum { #define NETIF_F_IPV6_CSUM __NETIF_F(IPV6_CSUM) #define NETIF_F_LOOPBACK __NETIF_F(LOOPBACK) #define NETIF_F_LRO __NETIF_F(LRO) -#define NETIF_F_NETNS_LOCAL __NETIF_F(NETNS_LOCAL) #define NETIF_F_NOCACHE_COPY __NETIF_F(NOCACHE_COPY) #define NETIF_F_NTUPLE __NETIF_F(NTUPLE) #define NETIF_F_RXCSUM __NETIF_F(RXCSUM) @@ -190,8 +189,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ -#define NETIF_F_NEVER_CHANGE (NETIF_F_VLAN_CHALLENGED | \ - NETIF_F_NETNS_LOCAL) +#define NETIF_F_NEVER_CHANGE NETIF_F_VLAN_CHALLENGED /* remember that ((t)1 << t_BITS) is undefined in C99 */ #define NETIF_F_ETHTOOL_BITS ((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e22ca5e07490..a698e2402420 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1968,6 +1968,7 @@ enum netdev_reg_state { * regardless of source, even if those aren't * HWTSTAMP_SOURCE_NETDEV * @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN + * @netns_local: interface can't change network namespaces * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved @@ -2361,6 +2362,7 @@ struct net_device { /* priv_flags_slow, ungrouped to save space */ unsigned long see_all_hwtstamp_requests:1; unsigned long change_proto_down:1; + unsigned long netns_local:1; struct list_head net_notifier_list; -- cgit v1.2.3 From 782dbbf589cd9082effaec522e3f1b4ce1594803 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 29 Aug 2024 14:33:39 +0200 Subject: netdev_features: convert NETIF_F_FCOE_MTU to dev->fcoe_mtu Ability to handle maximum FCoE frames of 2158 bytes can never be changed and thus more of an attribute, not a toggleable feature. Move it from netdev_features_t to "cold" priv flags (bitfield bool) and free yet another feature bit. Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/linux/netdev_features.h | 6 ++---- include/linux/netdevice.h | 2 ++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index d5a3836f4793..37af2c6e7caf 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -58,7 +58,7 @@ enum { NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ - NETIF_F_FCOE_MTU_BIT, /* Supports max FCoE MTU, 2158 bytes*/ + __UNUSED_NETIF_F_37, NETIF_F_NTUPLE_BIT, /* N-tuple filters supported */ NETIF_F_RXHASH_BIT, /* Receive hashing offload */ NETIF_F_RXCSUM_BIT, /* Receive checksumming offload */ @@ -105,7 +105,6 @@ enum { #define __NETIF_F(name) __NETIF_F_BIT(NETIF_F_##name##_BIT) #define NETIF_F_FCOE_CRC __NETIF_F(FCOE_CRC) -#define NETIF_F_FCOE_MTU __NETIF_F(FCOE_MTU) #define NETIF_F_FRAGLIST __NETIF_F(FRAGLIST) #define NETIF_F_FSO __NETIF_F(FSO) #define NETIF_F_GRO __NETIF_F(GRO) @@ -210,8 +209,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | \ NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID) -#define NETIF_F_ALL_FCOE (NETIF_F_FCOE_CRC | NETIF_F_FCOE_MTU | \ - NETIF_F_FSO) +#define NETIF_F_ALL_FCOE (NETIF_F_FCOE_CRC | NETIF_F_FSO) /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a698e2402420..ca5f0dda733b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1969,6 +1969,7 @@ enum netdev_reg_state { * HWTSTAMP_SOURCE_NETDEV * @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN * @netns_local: interface can't change network namespaces + * @fcoe_mtu: device supports maximum FCoE MTU, 2158 bytes * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved @@ -2363,6 +2364,7 @@ struct net_device { unsigned long see_all_hwtstamp_requests:1; unsigned long change_proto_down:1; unsigned long netns_local:1; + unsigned long fcoe_mtu:1; struct list_head net_notifier_list; -- cgit v1.2.3 From a61fec1c87be682b5c0fdba6074649c469c675a3 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 29 Aug 2024 14:33:40 +0200 Subject: netdev_features: remove NETIF_F_ALL_FCOE NETIF_F_ALL_FCOE is used only in vlan_dev.c, 2 times. Now that it's only 2 bits, open-code it and remove the definition from netdev_features.h. Suggested-by: Jakub Kicinski Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- include/linux/netdev_features.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 37af2c6e7caf..66e7d26b70a4 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -209,8 +209,6 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) #define NETIF_F_ALL_TSO (NETIF_F_TSO | NETIF_F_TSO6 | \ NETIF_F_TSO_ECN | NETIF_F_TSO_MANGLEID) -#define NETIF_F_ALL_FCOE (NETIF_F_FCOE_CRC | NETIF_F_FSO) - /* List of features with software fallbacks. */ #define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_GSO_SCTP | \ NETIF_F_GSO_UDP_L4 | NETIF_F_GSO_FRAGLIST) -- cgit v1.2.3 From b45ed06f46737f8c2ee65698f4305409f2386674 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Tue, 13 Aug 2024 22:19:32 +0800 Subject: drivers/base: Introduce device_match_t for device finding APIs There are several drivers/base APIs for finding a specific device, and they currently use the following good type for the @match parameter: int (*match)(struct device *dev, const void *data) Since these operations do not modify the caller-provided @*data, this type is worthy of a dedicated typedef: typedef int (*device_match_t)(struct device *dev, const void *data) Advantages of using device_match_t: - Shorter API declarations and definitions - Prevent further APIs from using a bad type for @match So introduce device_match_t and apply it to the existing (bus|class|driver|auxiliary)_find_device() APIs. Signed-off-by: Zijun Hu Link: https://lore.kernel.org/r/20240813-dev_match_api-v3-1-6c6878a99b9f@quicinc.com Signed-off-by: Greg Kroah-Hartman --- include/linux/auxiliary_bus.h | 2 +- include/linux/device/bus.h | 6 ++++-- include/linux/device/class.h | 2 +- include/linux/device/driver.h | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 662b8ae54b6a..31762324bcc9 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -271,6 +271,6 @@ void auxiliary_driver_unregister(struct auxiliary_driver *auxdrv); struct auxiliary_device *auxiliary_find_device(struct device *start, const void *data, - int (*match)(struct device *dev, const void *data)); + device_match_t match); #endif /* _AUXILIARY_BUS_H_ */ diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index 807831d6bf0f..cdc4757217f9 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -126,6 +126,9 @@ struct bus_attribute { int __must_check bus_create_file(const struct bus_type *bus, struct bus_attribute *attr); void bus_remove_file(const struct bus_type *bus, struct bus_attribute *attr); +/* Matching function type for drivers/base APIs to find a specific device */ +typedef int (*device_match_t)(struct device *dev, const void *data); + /* Generic device matching functions that all busses can use to match with */ int device_match_name(struct device *dev, const void *name); int device_match_of_node(struct device *dev, const void *np); @@ -139,8 +142,7 @@ int device_match_any(struct device *dev, const void *unused); int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *dev, void *data)); struct device *bus_find_device(const struct bus_type *bus, struct device *start, - const void *data, - int (*match)(struct device *dev, const void *data)); + const void *data, device_match_t match); /** * bus_find_device_by_name - device iterator for locating a particular device * of a specific name. diff --git a/include/linux/device/class.h b/include/linux/device/class.h index c576b49c55c2..518c9c83d64b 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -95,7 +95,7 @@ void class_dev_iter_exit(struct class_dev_iter *iter); int class_for_each_device(const struct class *class, const struct device *start, void *data, int (*fn)(struct device *dev, void *data)); struct device *class_find_device(const struct class *class, const struct device *start, - const void *data, int (*match)(struct device *, const void *)); + const void *data, device_match_t match); /** * class_find_device_by_name - device iterator for locating a particular device diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 1fc8b68786de..5c04b8e3833b 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -157,7 +157,7 @@ int __must_check driver_for_each_device(struct device_driver *drv, struct device void *data, int (*fn)(struct device *dev, void *)); struct device *driver_find_device(const struct device_driver *drv, struct device *start, const void *data, - int (*match)(struct device *dev, const void *data)); + device_match_t match); /** * driver_find_device_by_name - device iterator for locating a particular device -- cgit v1.2.3 From 24e041e1e48d06f25a12caaf73728a4ec2e511fe Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Fri, 23 Aug 2024 15:55:44 +0800 Subject: platform: Make platform_bus_type constant Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the platform_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240823075544.144426-1-kunwu.chan@linux.dev Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index d422db6eec63..7132623e4658 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -52,7 +52,7 @@ struct platform_device { extern int platform_device_register(struct platform_device *); extern void platform_device_unregister(struct platform_device *); -extern struct bus_type platform_bus_type; +extern const struct bus_type platform_bus_type; extern struct device platform_bus; extern struct resource *platform_get_resource(struct platform_device *, -- cgit v1.2.3 From 8064952c65045f05ee2671fe437770e50c151776 Mon Sep 17 00:00:00 2001 From: Stuart Hayes Date: Thu, 22 Aug 2024 15:28:04 -0500 Subject: driver core: shut down devices asynchronously Add code to allow asynchronous shutdown of devices, ensuring that each device is shut down before its parents & suppliers. Only devices with drivers that have async_shutdown_enable enabled will be shut down asynchronously. This can dramatically reduce system shutdown/reboot time on systems that have multiple devices that take many seconds to shut down (like certain NVMe drives). On one system tested, the shutdown time went from 11 minutes without this patch to 55 seconds with the patch. Signed-off-by: Stuart Hayes Signed-off-by: David Jeffery Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Keith Busch Tested-by: Keith Busch Link: https://lore.kernel.org/r/20240822202805.6379-4-stuart.w.hayes@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 5c04b8e3833b..14c9211b82d6 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -56,6 +56,7 @@ enum probe_type { * @mod_name: Used for built-in modules. * @suppress_bind_attrs: Disables bind/unbind via sysfs. * @probe_type: Type of the probe (synchronous or asynchronous) to use. + * @async_shutdown_enable: Enables devices to be shutdown asynchronously. * @of_match_table: The open firmware table. * @acpi_match_table: The ACPI match table. * @probe: Called to query the existence of a specific device, @@ -102,6 +103,7 @@ struct device_driver { bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ enum probe_type probe_type; + bool async_shutdown_enable; const struct of_device_id *of_match_table; const struct acpi_device_id *acpi_match_table; -- cgit v1.2.3 From 8ab0f4605d5ce3c0d386b3828b07719f1e8e0505 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Fri, 23 Aug 2024 14:24:40 +0800 Subject: bus: fsl-mc: make fsl_mc_bus_type const Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the fsl_mc_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Kunwu Chan Acked-by: Christophe Leroy # for Link: https://lore.kernel.org/r/20240823062440.113628-1-kunwu.chan@linux.dev Signed-off-by: Greg Kroah-Hartman --- include/linux/fsl/mc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 083c860fd28e..c90ec889bfc2 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -436,7 +436,7 @@ void fsl_mc_free_irqs(struct fsl_mc_device *mc_dev); struct fsl_mc_device *fsl_mc_get_endpoint(struct fsl_mc_device *mc_dev, u16 if_id); -extern struct bus_type fsl_mc_bus_type; +extern const struct bus_type fsl_mc_bus_type; extern struct device_type fsl_mc_bus_dprc_type; extern struct device_type fsl_mc_bus_dpni_type; -- cgit v1.2.3 From 4c0a6a0ac902dbf184ae7be1f1fce225cc0b7380 Mon Sep 17 00:00:00 2001 From: Chanwoo Lee Date: Thu, 29 Aug 2024 11:47:09 +0900 Subject: mmc: core: Replace the argument of mmc_sd_switch() with defines Replace with already defined values for readability. While at it, let's also change the mode-parameter from an int to bool, as the only used values are 0 or 1. Signed-off-by: Chanwoo Lee Link: https://lore.kernel.org/r/20240829024709.402285-1-cw9316.lee@samsung.com Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index ac01cd1622ef..6a31ed02d3ff 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -648,7 +648,8 @@ static inline void mmc_debugfs_err_stats_inc(struct mmc_host *host, host->err_stats[stat] += 1; } -int mmc_sd_switch(struct mmc_card *card, int mode, int group, u8 value, u8 *resp); +int mmc_sd_switch(struct mmc_card *card, bool mode, int group, + u8 value, u8 *resp); int mmc_send_status(struct mmc_card *card, u32 *status); int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error); int mmc_send_abort_tuning(struct mmc_host *host, u32 opcode); -- cgit v1.2.3 From 7df7c204c678e24cd32d33360538670b7b90e330 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 22 Aug 2024 15:50:18 +0200 Subject: xfs: enable block size larger than page size support Page cache now has the ability to have a minimum order when allocating a folio which is a prerequisite to add support for block size > page size. Signed-off-by: Pankaj Raghav Signed-off-by: Luis Chamberlain Link: https://lore.kernel.org/r/20240827-xfs-fix-wformat-bs-gt-ps-v1-1-aec6717609e0@kernel.org # fix folded Link: https://lore.kernel.org/r/20240822135018.1931258-11-kernel@pankajraghav.com Reviewed-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4cc170949e9c..55b254d951da 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -374,6 +374,19 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) +/* + * mapping_max_folio_size_supported() - Check the max folio size supported + * + * The filesystem should call this function at mount time if there is a + * requirement on the folio mapping size in the page cache. + */ +static inline size_t mapping_max_folio_size_supported(void) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER); + return PAGE_SIZE; +} + /* * mapping_set_folio_order_range() - Set the orders supported by a file. * @mapping: The address space of the file. -- cgit v1.2.3 From 31754ea6cbbc08d5bbe1fa290320c3048d8d98a3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 27 Aug 2024 06:51:36 -0400 Subject: iomap: add a private argument for iomap_file_buffered_write In order to switch fuse over to using iomap for buffered writes we need to be able to have the struct file for the original write, in case we have to read in the page to make it uptodate. Handle this by using the existing private field in the iomap_iter, and add the argument to iomap_file_buffered_write. This will allow us to pass the file in through the iomap buffered write path, and is flexible for any other file systems needs. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/r/7f55c7c32275004ba00cddf862d970e6e633f750.1724755651.git.josef@toxicpanda.com Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6fc1c858013d..f792b37f7627 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -257,7 +257,7 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) } ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, - const struct iomap_ops *ops); + const struct iomap_ops *ops, void *private); int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, int (*punch)(struct inode *inode, loff_t pos, loff_t length)); -- cgit v1.2.3 From 6f634eb080161baa4811a39f5ec07923ede092bc Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Tue, 27 Aug 2024 10:42:07 +0200 Subject: filemap: fix htmldoc warning for mapping_align_index() Stephen reported that there is a kernel build warning due to a missing description of a parameter in mapping_align_index(). Add the missing index parameter in the comment description. Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240827084206.106347-2-kernel@pankajraghav.com Fixes: ab95d23bab22 ("filemap: allocate mapping_min_order folios in the page cache") Reported-by: Stephen Rothwell Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 55b254d951da..6c0dde447c98 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -470,6 +470,7 @@ mapping_min_folio_nrpages(struct address_space *mapping) /** * mapping_align_index() - Align index for this mapping. * @mapping: The address_space. + * @index: The page index. * * The index of a folio must be naturally aligned. If you are adding a * new folio to the page cache and need to know what index to give it, -- cgit v1.2.3 From 4686cc598f669dea1b50dde1568e6c65c355bc67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2024 00:25:51 +0200 Subject: sched: Clean up DL server vs core sched Abide by the simple rule: pick_next_task() := pick_task() + set_next_task(.first = true) This allows us to trivially get rid of server_pick_next() and things collapse nicely. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240813224015.837303391@infradead.org --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3709dedbab59..57cf27a3045c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -692,7 +692,6 @@ struct sched_dl_entity { */ struct rq *rq; dl_server_has_tasks_f server_has_tasks; - dl_server_pick_f server_pick_next; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES -- cgit v1.2.3 From 631598c419985327110135efb696d9f9fe67bffd Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 14 Aug 2024 11:26:29 +0200 Subject: iio: dac: ad5449: drop support for platform data There are no longer any users of the platform data struct. Remove support for it from the driver. Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20240814092629.9862-1-brgl@bgdev.pl Signed-off-by: Jonathan Cameron --- include/linux/platform_data/ad5449.h | 39 ------------------------------------ 1 file changed, 39 deletions(-) delete mode 100644 include/linux/platform_data/ad5449.h (limited to 'include/linux') diff --git a/include/linux/platform_data/ad5449.h b/include/linux/platform_data/ad5449.h deleted file mode 100644 index d687ef5726c2..000000000000 --- a/include/linux/platform_data/ad5449.h +++ /dev/null @@ -1,39 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * AD5415, AD5426, AD5429, AD5432, AD5439, AD5443, AD5449 Digital to Analog - * Converter driver. - * - * Copyright 2012 Analog Devices Inc. - * Author: Lars-Peter Clausen - */ - -#ifndef __LINUX_PLATFORM_DATA_AD5449_H__ -#define __LINUX_PLATFORM_DATA_AD5449_H__ - -/** - * enum ad5449_sdo_mode - AD5449 SDO pin configuration - * @AD5449_SDO_DRIVE_FULL: Drive the SDO pin with full strength. - * @AD5449_SDO_DRIVE_WEAK: Drive the SDO pin with not full strength. - * @AD5449_SDO_OPEN_DRAIN: Operate the SDO pin in open-drain mode. - * @AD5449_SDO_DISABLED: Disable the SDO pin, in this mode it is not possible to - * read back from the device. - */ -enum ad5449_sdo_mode { - AD5449_SDO_DRIVE_FULL = 0x0, - AD5449_SDO_DRIVE_WEAK = 0x1, - AD5449_SDO_OPEN_DRAIN = 0x2, - AD5449_SDO_DISABLED = 0x3, -}; - -/** - * struct ad5449_platform_data - Platform data for the ad5449 DAC driver - * @sdo_mode: SDO pin mode - * @hardware_clear_to_midscale: Whether asserting the hardware CLR pin sets the - * outputs to midscale (true) or to zero scale(false). - */ -struct ad5449_platform_data { - enum ad5449_sdo_mode sdo_mode; - bool hardware_clear_to_midscale; -}; - -#endif -- cgit v1.2.3 From 7cb9b5fa218caa899f4288865c4598357bcce4e9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 3 Sep 2024 15:38:25 -0500 Subject: PCI: endpoint: Fix enum pci_epc_bar_type kerneldoc e01c9797c0eb ("PCI: endpoint: Clean up hardware description for BARs") added enum pci_epc_bar_type with incomplete kerneldoc. Add the missing piece. Signed-off-by: Bjorn Helgaas --- include/linux/pci-epc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 85bdf2adb760..697cc190747d 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -149,6 +149,7 @@ struct pci_epc { }; /** + * enum pci_epc_bar_type - configurability of endpoint BAR * @BAR_PROGRAMMABLE: The BAR mask can be configured by the EPC. * @BAR_FIXED: The BAR mask is fixed by the hardware. * @BAR_RESERVED: The BAR should not be touched by an EPF driver. -- cgit v1.2.3 From 364ea7ccaef917a3068236a19a4b31a0623b561a Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 31 Aug 2024 16:20:39 +0200 Subject: power: supply: Change usb_types from an array into a bitmask The bit_types array just hold a list of valid enum power_supply_usb_type values which map to 0 - 9. This can easily be represented as a bitmap. This reduces the size of struct power_supply_desc and further reduces the data section size by drivers no longer needing to store the array. This also unifies how usb_types are handled with charge_behaviours, which allows power_supply_show_usb_type() to be removed. Signed-off-by: Hans de Goede Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20240831142039.28830-7-hdegoede@redhat.com Signed-off-by: Sebastian Reichel --- include/linux/power_supply.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 72dc7e45c90c..910d407ebe63 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -243,8 +243,7 @@ struct power_supply_desc { const char *name; enum power_supply_type type; u8 charge_behaviours; - const enum power_supply_usb_type *usb_types; - size_t num_usb_types; + u32 usb_types; const enum power_supply_property *properties; size_t num_properties; -- cgit v1.2.3 From de6c85bf918ea52d5c680f0d130b37ee2ff152d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Aug 2024 09:02:47 +0300 Subject: dma-mapping: clearly mark DMA ops as an architecture feature DMA ops are a helper for architectures and not for drivers to override the DMA implementation. Unfortunately driver authors keep ignoring this. Make the fact more clear by renaming the symbol to ARCH_HAS_DMA_OPS and having the two drivers overriding their dma_ops depend on that. These drivers should probably be marked broken, but we can give them a bit of a grace period for that. Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Gleixner Acked-by: Sakari Ailus # for IPU6 Acked-by: Robin Murphy --- include/linux/device.h | 2 +- include/linux/dma-map-ops.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 1c5280d28bc3..b4bde8d22697 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -750,7 +750,7 @@ struct device { struct dev_pin_info *pins; #endif struct dev_msi_info msi; -#ifdef CONFIG_DMA_OPS +#ifdef CONFIG_ARCH_HAS_DMA_OPS const struct dma_map_ops *dma_ops; #endif u64 *dma_mask; /* dma mask (if dma'able device) */ diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 077b15c93bb8..9668ddf3696e 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -75,7 +75,7 @@ struct dma_map_ops { unsigned long (*get_merge_boundary)(struct device *dev); }; -#ifdef CONFIG_DMA_OPS +#ifdef CONFIG_ARCH_HAS_DMA_OPS #include static inline const struct dma_map_ops *get_dma_ops(struct device *dev) @@ -90,7 +90,7 @@ static inline void set_dma_ops(struct device *dev, { dev->dma_ops = dma_ops; } -#else /* CONFIG_DMA_OPS */ +#else /* CONFIG_ARCH_HAS_DMA_OPS */ static inline const struct dma_map_ops *get_dma_ops(struct device *dev) { return NULL; @@ -99,7 +99,7 @@ static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *dma_ops) { } -#endif /* CONFIG_DMA_OPS */ +#endif /* CONFIG_ARCH_HAS_DMA_OPS */ #ifdef CONFIG_DMA_CMA extern struct cma *dma_contiguous_default_area; -- cgit v1.2.3 From 73ed0baae66df50359c876f65f41179d6ebd2716 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Tue, 30 Jul 2024 23:49:13 -0700 Subject: mm: swap: swap cluster switch to double link list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: swap: mTHP swap allocator base on swap cluster order", v5. This is the short term solutions "swap cluster order" listed in my "Swap Abstraction" discussion slice 8 in the recent LSF/MM conference. When commit 845982eb264bc "mm: swap: allow storage of all mTHP orders" is introduced, it only allocates the mTHP swap entries from the new empty cluster list.  It has a fragmentation issue reported by Barry. https://lore.kernel.org/all/CAGsJ_4zAcJkuW016Cfi6wicRr8N9X+GJJhgMQdSMp+Ah+NSgNQ@mail.gmail.com/ The reason is that all the empty clusters have been exhausted while there are plenty of free swap entries in the cluster that are not 100% free. Remember the swap allocation order in the cluster. Keep track of the per order non full cluster list for later allocation. This series gives the swap SSD allocation a new separate code path from the HDD allocation. The new allocator use cluster list only and do not global scan swap_map[] without lock any more. This streamline the swap allocation for SSD. The code matches the execution flow much better. User impact: For users that allocate and free mix order mTHP swapping, It greatly improves the success rate of the mTHP swap allocation after the initial phase. It also performs faster when the swapfile is close to full, because the allocator can get the non full cluster from a list rather than scanning a lot of swap_map entries.  With Barry's mthp test program V2: Without: $ ./thp_swap_allocator_test -a Iteration 1: swpout inc: 32, swpout fallback inc: 192, Fallback percentage: 85.71% Iteration 2: swpout inc: 0, swpout fallback inc: 231, Fallback percentage: 100.00% Iteration 3: swpout inc: 0, swpout fallback inc: 227, Fallback percentage: 100.00% ... Iteration 98: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% Iteration 99: swpout inc: 0, swpout fallback inc: 215, Fallback percentage: 100.00% Iteration 100: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% $ ./thp_swap_allocator_test -a -s Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% .. Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% $ ./thp_swap_allocator_test -s Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% .. Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% $ ./thp_swap_allocator_test Iteration 1: swpout inc: 0, swpout fallback inc: 224, Fallback percentage: 100.00% Iteration 2: swpout inc: 0, swpout fallback inc: 218, Fallback percentage: 100.00% Iteration 3: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% .. Iteration 98: swpout inc: 0, swpout fallback inc: 228, Fallback percentage: 100.00% Iteration 99: swpout inc: 0, swpout fallback inc: 230, Fallback percentage: 100.00% Iteration 100: swpout inc: 0, swpout fallback inc: 229, Fallback percentage: 100.00% With: # with all 0.00% filter out $ ./thp_swap_allocator_test -a | grep -v "0.00%" $ # all result are 0.00% $ ./thp_swap_allocator_test -a -s | grep -v "0.00%" ./thp_swap_allocator_test -a -s | grep -v "0.00%" Iteration 14: swpout inc: 223, swpout fallback inc: 3, Fallback percentage: 1.33% Iteration 19: swpout inc: 219, swpout fallback inc: 7, Fallback percentage: 3.10% Iteration 28: swpout inc: 225, swpout fallback inc: 1, Fallback percentage: 0.44% Iteration 29: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% Iteration 34: swpout inc: 220, swpout fallback inc: 8, Fallback percentage: 3.51% Iteration 35: swpout inc: 222, swpout fallback inc: 11, Fallback percentage: 4.72% Iteration 38: swpout inc: 217, swpout fallback inc: 4, Fallback percentage: 1.81% Iteration 40: swpout inc: 222, swpout fallback inc: 6, Fallback percentage: 2.63% Iteration 42: swpout inc: 221, swpout fallback inc: 2, Fallback percentage: 0.90% Iteration 43: swpout inc: 215, swpout fallback inc: 7, Fallback percentage: 3.15% Iteration 47: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% Iteration 49: swpout inc: 217, swpout fallback inc: 1, Fallback percentage: 0.46% Iteration 52: swpout inc: 221, swpout fallback inc: 8, Fallback percentage: 3.49% Iteration 56: swpout inc: 224, swpout fallback inc: 4, Fallback percentage: 1.75% Iteration 58: swpout inc: 214, swpout fallback inc: 5, Fallback percentage: 2.28% Iteration 62: swpout inc: 220, swpout fallback inc: 3, Fallback percentage: 1.35% Iteration 64: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% Iteration 67: swpout inc: 221, swpout fallback inc: 1, Fallback percentage: 0.45% Iteration 75: swpout inc: 220, swpout fallback inc: 9, Fallback percentage: 3.93% Iteration 82: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% Iteration 86: swpout inc: 211, swpout fallback inc: 12, Fallback percentage: 5.38% Iteration 89: swpout inc: 226, swpout fallback inc: 2, Fallback percentage: 0.88% Iteration 93: swpout inc: 220, swpout fallback inc: 1, Fallback percentage: 0.45% Iteration 94: swpout inc: 224, swpout fallback inc: 1, Fallback percentage: 0.44% Iteration 96: swpout inc: 221, swpout fallback inc: 6, Fallback percentage: 2.64% Iteration 98: swpout inc: 227, swpout fallback inc: 1, Fallback percentage: 0.44% Iteration 99: swpout inc: 227, swpout fallback inc: 3, Fallback percentage: 1.30% $ ./thp_swap_allocator_test ./thp_swap_allocator_test Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 2: swpout inc: 131, swpout fallback inc: 101, Fallback percentage: 43.53% Iteration 3: swpout inc: 71, swpout fallback inc: 155, Fallback percentage: 68.58% Iteration 4: swpout inc: 55, swpout fallback inc: 168, Fallback percentage: 75.34% Iteration 5: swpout inc: 35, swpout fallback inc: 191, Fallback percentage: 84.51% Iteration 6: swpout inc: 25, swpout fallback inc: 199, Fallback percentage: 88.84% Iteration 7: swpout inc: 23, swpout fallback inc: 205, Fallback percentage: 89.91% Iteration 8: swpout inc: 9, swpout fallback inc: 219, Fallback percentage: 96.05% Iteration 9: swpout inc: 13, swpout fallback inc: 213, Fallback percentage: 94.25% Iteration 10: swpout inc: 12, swpout fallback inc: 216, Fallback percentage: 94.74% Iteration 11: swpout inc: 16, swpout fallback inc: 213, Fallback percentage: 93.01% Iteration 12: swpout inc: 10, swpout fallback inc: 210, Fallback percentage: 95.45% Iteration 13: swpout inc: 16, swpout fallback inc: 212, Fallback percentage: 92.98% Iteration 14: swpout inc: 12, swpout fallback inc: 212, Fallback percentage: 94.64% Iteration 15: swpout inc: 15, swpout fallback inc: 211, Fallback percentage: 93.36% Iteration 16: swpout inc: 15, swpout fallback inc: 200, Fallback percentage: 93.02% Iteration 17: swpout inc: 9, swpout fallback inc: 220, Fallback percentage: 96.07% $ ./thp_swap_allocator_test -s ./thp_swap_allocator_test -s Iteration 1: swpout inc: 233, swpout fallback inc: 0, Fallback percentage: 0.00% Iteration 2: swpout inc: 97, swpout fallback inc: 135, Fallback percentage: 58.19% Iteration 3: swpout inc: 42, swpout fallback inc: 192, Fallback percentage: 82.05% Iteration 4: swpout inc: 19, swpout fallback inc: 214, Fallback percentage: 91.85% Iteration 5: swpout inc: 12, swpout fallback inc: 213, Fallback percentage: 94.67% Iteration 6: swpout inc: 11, swpout fallback inc: 217, Fallback percentage: 95.18% Iteration 7: swpout inc: 9, swpout fallback inc: 214, Fallback percentage: 95.96% Iteration 8: swpout inc: 8, swpout fallback inc: 213, Fallback percentage: 96.38% Iteration 9: swpout inc: 2, swpout fallback inc: 223, Fallback percentage: 99.11% Iteration 10: swpout inc: 2, swpout fallback inc: 228, Fallback percentage: 99.13% Iteration 11: swpout inc: 4, swpout fallback inc: 214, Fallback percentage: 98.17% Iteration 12: swpout inc: 5, swpout fallback inc: 226, Fallback percentage: 97.84% Iteration 13: swpout inc: 3, swpout fallback inc: 212, Fallback percentage: 98.60% Iteration 14: swpout inc: 0, swpout fallback inc: 222, Fallback percentage: 100.00% Iteration 15: swpout inc: 3, swpout fallback inc: 222, Fallback percentage: 98.67% Iteration 16: swpout inc: 4, swpout fallback inc: 223, Fallback percentage: 98.24% ========= Kernel compile under tmpfs with cgroup memory.max = 470M. 12 core 24 hyperthreading, 32 jobs. 10 Run each group SSD swap 10 runs average, 20G swap partition: With: user 2929.064 system 1479.381 : 1376.89 1398.22 1444.64 1477.39 1479.04 1497.27 1504.47 1531.4 1532.92 1551.57 real 1441.324 Without: user 2910.872 system 1482.732 : 1440.01 1451.4 1462.01 1467.47 1467.51 1469.3 1470.19 1496.32 1544.1 1559.01 real 1580.822 Two zram swap: zram0 3.0G zram1 20G. The idea is forcing the zram0 almost full then overflow to zram1: With: user 4320.301 system 4272.403 : 4236.24 4262.81 4264.75 4269.13 4269.44 4273.06 4279.85 4285.98 4289.64 4293.13 real 431.759 Without user 4301.393 system 4387.672 : 4374.47 4378.3 4380.95 4382.84 4383.06 4388.05 4389.76 4397.16 4398.23 4403.9 real 433.979 ------ more test result from Kaiui ---------- Test with build linux kernel using a 4G ZRAM, 1G memory.max limit on top of shmem: System info: 32 Core AMD Zen2, 64G total memory. Test 3 times using only 4K pages: ================================= With: ----- 1838.74user 2411.21system 2:37.86elapsed 2692%CPU (0avgtext+0avgdata 847060maxresident)k 1839.86user 2465.77system 2:39.35elapsed 2701%CPU (0avgtext+0avgdata 847060maxresident)k 1840.26user 2454.68system 2:39.43elapsed 2693%CPU (0avgtext+0avgdata 847060maxresident)k Summary (~4.6% improment of system time): User: 1839.62 System: 2443.89: 2465.77 2454.68 2411.21 Real: 158.88 Without: -------- 1837.99user 2575.95system 2:43.09elapsed 2706%CPU (0avgtext+0avgdata 846520maxresident)k 1838.32user 2555.15system 2:42.52elapsed 2709%CPU (0avgtext+0avgdata 846520maxresident)k 1843.02user 2561.55system 2:43.35elapsed 2702%CPU (0avgtext+0avgdata 846520maxresident)k Summary: User: 1839.78 System: 2564.22: 2575.95 2555.15 2561.55 Real: 162.99 Test 5 times using enabled all mTHP pages: ========================================== With: ----- 1796.44user 2937.33system 2:59.09elapsed 2643%CPU (0avgtext+0avgdata 846936maxresident)k 1802.55user 3002.32system 2:54.68elapsed 2750%CPU (0avgtext+0avgdata 847072maxresident)k 1806.59user 2986.53system 2:55.17elapsed 2736%CPU (0avgtext+0avgdata 847092maxresident)k 1803.27user 2982.40system 2:54.49elapsed 2742%CPU (0avgtext+0avgdata 846796maxresident)k 1807.43user 3036.08system 2:56.06elapsed 2751%CPU (0avgtext+0avgdata 846488maxresident)k Summary (~8.4% improvement of system time): User: 1803.25 System: 2988.93: 2937.33 3002.32 2986.53 2982.40 3036.08 Real: 175.90 mTHP swapout status: /sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout:347721 /sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpout_fallback:3110 /sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout:3365 /sys/kernel/mm/transparent_hugepage/hugepages-512kB/stats/swpout_fallback:8269 /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout:24 /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/swpout_fallback:3341 /sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout:145 /sys/kernel/mm/transparent_hugepage/hugepages-1024kB/stats/swpout_fallback:5038 /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout:322737 /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpout_fallback:36808 /sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout:380455 /sys/kernel/mm/transparent_hugepage/hugepages-16kB/stats/swpout_fallback:1010 /sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout:24973 /sys/kernel/mm/transparent_hugepage/hugepages-256kB/stats/swpout_fallback:13223 /sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout:197348 /sys/kernel/mm/transparent_hugepage/hugepages-128kB/stats/swpout_fallback:80541 Without: -------- 1794.41user 3151.29system 3:05.97elapsed 2659%CPU (0avgtext+0avgdata 846704maxresident)k 1810.27user 3304.48system 3:05.38elapsed 2759%CPU (0avgtext+0avgdata 846636maxresident)k 1809.84user 3254.85system 3:03.83elapsed 2755%CPU (0avgtext+0avgdata 846952maxresident)k 1813.54user 3259.56system 3:04.28elapsed 2752%CPU (0avgtext+0avgdata 846848maxresident)k 1829.97user 3338.40system 3:07.32elapsed 2759%CPU (0avgtext+0avgdata 847024maxresident)k Summary: User: 1811.61 System: 3261.72 : 3151.29 3304.48 3254.85 3259.56 3338.40 Real: 185.356 mTHP swapout status: hugepages-32kB/stats/swpout:35630 hugepages-32kB/stats/swpout_fallback:1809908 hugepages-512kB/stats/swpout:523 hugepages-512kB/stats/swpout_fallback:55235 hugepages-2048kB/stats/swpout:53 hugepages-2048kB/stats/swpout_fallback:17264 hugepages-1024kB/stats/swpout:85 hugepages-1024kB/stats/swpout_fallback:24979 hugepages-64kB/stats/swpout:30117 hugepages-64kB/stats/swpout_fallback:1825399 hugepages-16kB/stats/swpout:42775 hugepages-16kB/stats/swpout_fallback:1951123 hugepages-256kB/stats/swpout:2326 hugepages-256kB/stats/swpout_fallback:170165 hugepages-128kB/stats/swpout:17925 hugepages-128kB/stats/swpout_fallback:1309757 This patch (of 9): Previously, the swap cluster used a cluster index as a pointer to construct a custom single link list type "swap_cluster_list". The next cluster pointer is shared with the cluster->count. It prevents puting the non free cluster into a list. Change the cluster to use the standard double link list instead. This allows tracing the nonfull cluster in the follow up patch. That way, it is faster to get to the nonfull cluster of that order. Remove the cluster getter/setter for accessing the cluster struct member. The list operation is protected by the swap_info_struct->lock. Change cluster code to use "struct swap_cluster_info *" to reference the cluster rather than by using index. That is more consistent with the list manipulation. It avoids the repeat adding index to the cluser_info. The code is easier to understand. Remove the cluster next pointer is NULL flag, the double link list can handle the empty list pretty well. The "swap_cluster_info" struct is two pointer bigger, because 512 swap entries share one swap_cluster_info struct, it has very little impact on the average memory usage per swap entry. For 1TB swapfile, the swap cluster data structure increases from 8MB to 24MB. Other than the list conversion, there is no real function change in this patch. Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-1-cb9c148b9297@kernel.org Signed-off-by: Chris Li Reported-by: Barry Song <21cnbao@gmail.com> Reviewed-by: "Huang, Ying" Cc: Hugh Dickins Cc: Kairui Song Cc: Kalesh Singh Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/swap.h | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 5b920fa2315b..ead568f04f81 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -243,22 +243,20 @@ enum { * free clusters are organized into a list. We fetch an entry from the list to * get a free cluster. * - * The data field stores next cluster if the cluster is free or cluster usage - * counter otherwise. The flags field determines if a cluster is free. This is - * protected by swap_info_struct.lock. + * The flags field determines if a cluster is free. This is + * protected by cluster lock. */ struct swap_cluster_info { spinlock_t lock; /* * Protect swap_cluster_info fields - * and swap_info_struct->swap_map - * elements correspond to the swap - * cluster + * other than list, and swap_info_struct->swap_map + * elements corresponding to the swap cluster. */ - unsigned int data:24; - unsigned int flags:8; + u16 count; + u8 flags; + struct list_head list; }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ -#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ /* * The first page in the swap file is the swap header, which is always marked @@ -283,11 +281,6 @@ struct percpu_cluster { unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ }; -struct swap_cluster_list { - struct swap_cluster_info head; - struct swap_cluster_info tail; -}; - /* * The in-memory structure used to track swap areas. */ @@ -300,7 +293,7 @@ struct swap_info_struct { unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ - struct swap_cluster_list free_clusters; /* free clusters list */ + struct list_head free_clusters; /* free clusters list */ unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ @@ -331,7 +324,7 @@ struct swap_info_struct { * list. */ struct work_struct discard_work; /* discard worker */ - struct swap_cluster_list discard_clusters; /* discard clusters list */ + struct list_head discard_clusters; /* discard clusters list */ struct plist_node avail_lists[]; /* * entries in swap_avail_heads, one * entry per node. -- cgit v1.2.3 From d07a46a4ac18786e7f4c98fb08525ed80dd1f642 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Tue, 30 Jul 2024 23:49:14 -0700 Subject: mm: swap: mTHP allocate swap entries from nonfull list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track the nonfull cluster as well as the empty cluster on lists. Each order has one nonfull cluster list. The cluster will remember which order it was used during new cluster allocation. When the cluster has free entry, add to the nonfull[order] list.  When the free cluster list is empty, also allocate from the nonempty list of that order. This improves the mTHP swap allocation success rate. There are limitations if the distribution of numbers of different orders of mTHP changes a lot. e.g. there are a lot of nonfull cluster assign to order A while later time there are a lot of order B allocation while very little allocation in order A. Currently the cluster used by order A will not reused by order B unless the cluster is 100% empty. Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-2-cb9c148b9297@kernel.org Signed-off-by: Chris Li Reported-by: Barry Song <21cnbao@gmail.com> Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kairui Song Cc: Kalesh Singh Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index ead568f04f81..7007a3c2336c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -254,9 +254,11 @@ struct swap_cluster_info { */ u16 count; u8 flags; + u8 order; struct list_head list; }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +#define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ /* * The first page in the swap file is the swap header, which is always marked @@ -294,6 +296,8 @@ struct swap_info_struct { unsigned char *swap_map; /* vmalloc'ed array of usage counts */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ + struct list_head nonfull_clusters[SWAP_NR_ORDERS]; + /* list of cluster that contains at least one free slot */ unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ -- cgit v1.2.3 From 477cb7ba28892eda112c79d8f75d10edabfc3050 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 30 Jul 2024 23:49:19 -0700 Subject: mm: swap: add a fragment cluster list Now swap cluster allocator arranges the clusters in LRU style, so the "cold" cluster stay at the head of nonfull lists are the ones that were used for allocation long time ago and still partially occupied. So if allocator can't find enough contiguous slots to satisfy an high order allocation, it's unlikely there will be slot being free on them to satisfy the allocation, at least in a short period. As a result, nonfull cluster scanning will waste time repeatly scanning the unusable head of the list. Also, multiple CPUs could content on the same head cluster of nonfull list. Unlike free clusters which are removed from the list when any CPU starts using it, nonfull cluster stays on the head. So introduce a new list frag list, all scanned nonfull clusters will be moved to this list. Both for avoiding repeated scanning and contention. Frag list is still used as fallback for allocations, so if one CPU failed to allocate one order of slots, it can still steal other CPU's clusters. And order 0 will favor the fragmented clusters to better protect nonfull clusters If any slots on a fragment list are being freed, move the fragment list back to nonfull list indicating it worth another scan on the cluster. Compared to scan upon freeing a slot, this keep the scanning lazy and save some CPU if there are still other clusters to use. It may seems unneccessay to keep the fragmented cluster on list at all if they can't be used for specific order allocation. But this will start to make sense once reclaim dring scanning is ready. Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-7-cb9c148b9297@kernel.org Signed-off-by: Kairui Song Reported-by: Barry Song <21cnbao@gmail.com> Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kalesh Singh Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7007a3c2336c..c526cd3e7b31 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -259,6 +259,7 @@ struct swap_cluster_info { }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ +#define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ /* * The first page in the swap file is the swap header, which is always marked @@ -298,6 +299,8 @@ struct swap_info_struct { struct list_head free_clusters; /* free clusters list */ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; /* list of cluster that contains at least one free slot */ + struct list_head frag_clusters[SWAP_NR_ORDERS]; + /* list of cluster that are fragmented or contented */ unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ -- cgit v1.2.3 From 661383c6111a38c88df61af6bfbcfacd2ff20a67 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 30 Jul 2024 23:49:20 -0700 Subject: mm: swap: relaim the cached parts that got scanned This commit implements reclaim during scan for cluster allocator. Cluster scanning were unable to reuse SWAP_HAS_CACHE slots, which could result in low allocation success rate or early OOM. So to ensure maximum allocation success rate, integrate reclaiming with scanning. If found a range of suitable swap slots but fragmented due to HAS_CACHE, just try to reclaim the slots. Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org Signed-off-by: Kairui Song Reported-by: Barry Song <21cnbao@gmail.com> Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kalesh Singh Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index c526cd3e7b31..e372122e4f45 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,6 +301,7 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ + unsigned int frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ -- cgit v1.2.3 From 2cacbdfdee65b18f9952620e762eab043d71b564 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 30 Jul 2024 23:49:21 -0700 Subject: mm: swap: add a adaptive full cluster cache reclaim Link all full cluster with one full list, and reclaim from it when the allocation have ran out of all usable clusters. There are many reason a folio can end up being in the swap cache while having no swap count reference. So the best way to search for such slots is still by iterating the swap clusters. With the list as an LRU, iterating from the oldest cluster and keep them rotating is a very doable and clean way to free up potentially not inuse clusters. When any allocation failure, try reclaim and rotate only one cluster. This is adaptive for high order allocations they can tolerate fallback. So this avoids latency, and give the full cluster list an fair chance to get reclaimed. It release the usage stress for the fallback order 0 allocation or following up high order allocation. If the swap device is getting very full, reclaim more aggresively to ensure no OOM will happen. This ensures order 0 heavy workload won't go OOM as order 0 won't fail if any cluster still have any space. [ryncsn@gmail.com: fix discard of full cluster] Link: https://lkml.kernel.org/r/CAMgjq7CWwK75_2Zi5P40K08pk9iqOcuWKL6khu=x4Yg_nXaQag@mail.gmail.com Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-9-cb9c148b9297@kernel.org Signed-off-by: Kairui Song Reported-by: Barry Song <21cnbao@gmail.com> Cc: Chris Li Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kalesh Singh Cc: Ryan Roberts Cc: David Hildenbrand Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index e372122e4f45..1c8f844a9f0f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -260,6 +260,7 @@ struct swap_cluster_info { #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NONFULL 2 /* This cluster is on nonfull list */ #define CLUSTER_FLAG_FRAG 4 /* This cluster is on nonfull list */ +#define CLUSTER_FLAG_FULL 8 /* This cluster is on full list */ /* * The first page in the swap file is the swap header, which is always marked @@ -297,6 +298,7 @@ struct swap_info_struct { unsigned char *swap_map; /* vmalloc'ed array of usage counts */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ + struct list_head full_clusters; /* full clusters list */ struct list_head nonfull_clusters[SWAP_NR_ORDERS]; /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; -- cgit v1.2.3 From 46bcce503197d1019ee5c49ccde978e31298e35f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 7 Aug 2024 09:40:51 +0300 Subject: arch, mm: move definition of node_data to generic code Every architecture that supports NUMA defines node_data in the same way: struct pglist_data *node_data[MAX_NUMNODES]; No reason to keep multiple copies of this definition and its forward declarations, especially when such forward declaration is the only thing in include/asm/mmzone.h for many architectures. Add definition and declaration of node_data to generic code and drop architecture-specific versions. Link: https://lkml.kernel.org/r/20240807064110.1003856-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Acked-by: Davidlohr Bueso Tested-by: Zi Yan # for x86_64 and arm64 Tested-by: Jonathan Cameron [arm64 + CXL via QEMU] Acked-by: Dan Williams Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Rafael J. Wysocki Cc: Rob Herring (Arm) Cc: Samuel Holland Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/numa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index eb19503604fe..e5841d4057ab 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -30,6 +30,9 @@ static inline bool numa_valid_node(int nid) #ifdef CONFIG_NUMA #include +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); -- cgit v1.2.3 From ec164cf1dd3df4ae58931b9b491b06a90d5c22d3 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 7 Aug 2024 09:40:52 +0300 Subject: mm: drop CONFIG_HAVE_ARCH_NODEDATA_EXTENSION There are no users of HAVE_ARCH_NODEDATA_EXTENSION left, so arch_alloc_nodedata() and arch_refresh_nodedata() are not needed anymore. Replace the call to arch_alloc_nodedata() in free_area_init() with a new helper alloc_offline_node_data(), remove arch_refresh_nodedata() and cleanup include/linux/memory_hotplug.h from the associated ifdefery. Link: https://lkml.kernel.org/r/20240807064110.1003856-9-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Tested-by: Zi Yan # for x86_64 and arm64 Acked-by: Dan Williams Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: John Paul Adrian Glaubitz Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Rafael J. Wysocki Cc: Rob Herring (Arm) Cc: Samuel Holland Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 48 ------------------------------------------ include/linux/numa.h | 4 ++++ 2 files changed, 4 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ebe876930e78..b27ddce5d324 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,54 +16,6 @@ struct resource; struct vmem_altmap; struct dev_pagemap; -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION -/* - * For supporting node-hotadd, we have to allocate a new pgdat. - * - * If an arch has generic style NODE_DATA(), - * node_data[nid] = kzalloc() works well. But it depends on the architecture. - * - * In general, generic_alloc_nodedata() is used. - * - */ -extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); - -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) - -#ifdef CONFIG_NUMA -/* - * XXX: node aware allocation can't work well to get new node's memory at this time. - * Because, pgdat for the new node is not allocated/initialized yet itself. - * To use new node's memory, more consideration will be necessary. - */ -#define generic_alloc_nodedata(nid) \ -({ \ - memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ -}) - -extern pg_data_t *node_data[]; -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - node_data[nid] = pgdat; -} - -#else /* !CONFIG_NUMA */ - -/* never called */ -static inline pg_data_t *generic_alloc_nodedata(int nid) -{ - BUG(); - return NULL; -} -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - #ifdef CONFIG_MEMORY_HOTPLUG struct page *pfn_to_online_page(unsigned long pfn); diff --git a/include/linux/numa.h b/include/linux/numa.h index e5841d4057ab..b41b1569781b 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -33,6 +33,8 @@ static inline bool numa_valid_node(int nid) extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) +void __init alloc_offline_node_data(int nid); + /* Generic implementation available */ int numa_nearest_node(int node, unsigned int state); @@ -60,6 +62,8 @@ static inline int phys_to_target_node(u64 start) { return 0; } + +static inline void alloc_offline_node_data(int nid) {} #endif #define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE) -- cgit v1.2.3 From 3515863d9f29dd476cdad875fbd558fc85b4c511 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 7 Aug 2024 09:40:53 +0300 Subject: arch, mm: pull out allocation of NODE_DATA to generic code Architectures that support NUMA duplicate the code that allocates NODE_DATA on the node-local memory with slight variations in reporting of the addresses where the memory was allocated. Use x86 version as the basis for the generic alloc_node_data() function and call this function in architecture specific numa initialization. Round up node data size to SMP_CACHE_BYTES rather than to PAGE_SIZE like x86 used to do since the bootmem era when allocation granularity was PAGE_SIZE anyway. Link: https://lkml.kernel.org/r/20240807064110.1003856-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Reviewed-by: Jonathan Cameron Tested-by: Zi Yan # for x86_64 and arm64 Tested-by: Jonathan Cameron [arm64 + CXL via QEMU] Acked-by: Dan Williams Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Rafael J. Wysocki Cc: Rob Herring (Arm) Cc: Samuel Holland Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/numa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index b41b1569781b..3567e40329eb 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -33,6 +33,7 @@ static inline bool numa_valid_node(int nid) extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) +void __init alloc_node_data(int nid); void __init alloc_offline_node_data(int nid); /* Generic implementation available */ -- cgit v1.2.3 From 87482708210ff3333adab4dc5f1a491793159d43 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 7 Aug 2024 09:41:01 +0300 Subject: mm: introduce numa_memblks Move code dealing with numa_memblks from arch/x86 to mm/ and add Kconfig options to let x86 select it in its Kconfig. This code will be later reused by arch_numa. No functional changes. Link: https://lkml.kernel.org/r/20240807064110.1003856-18-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Tested-by: Zi Yan # for x86_64 and arm64 Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron [arm64 + CXL via QEMU] Acked-by: Dan Williams Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Rafael J. Wysocki Cc: Rob Herring (Arm) Cc: Samuel Holland Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/numa_memblks.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 include/linux/numa_memblks.h (limited to 'include/linux') diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h new file mode 100644 index 000000000000..6981cf97d2c9 --- /dev/null +++ b/include/linux/numa_memblks.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NUMA_MEMBLKS_H +#define __NUMA_MEMBLKS_H + +#ifdef CONFIG_NUMA_MEMBLKS +#include + +#define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) + +struct numa_memblk { + u64 start; + u64 end; + int nid; +}; + +struct numa_meminfo { + int nr_blks; + struct numa_memblk blk[NR_NODE_MEMBLKS]; +}; + +extern struct numa_meminfo numa_meminfo __initdata_or_meminfo; +extern struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; + +int __init numa_add_memblk(int nodeid, u64 start, u64 end); +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); + +int __init numa_cleanup_meminfo(struct numa_meminfo *mi); +int __init numa_register_meminfo(struct numa_meminfo *mi); + +void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi); + +#endif /* CONFIG_NUMA_MEMBLKS */ + +#endif /* __NUMA_MEMBLKS_H */ -- cgit v1.2.3 From 75f9d4cc4eb5e15dd5ce843e33de7f6346a0d4fa Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 7 Aug 2024 09:41:02 +0300 Subject: mm: move numa_distance and related code from x86 to numa_memblks Move code dealing with numa_distance array from arch/x86 to mm/numa_memblks.c This code will be later reused by arch_numa. No functional changes. Link: https://lkml.kernel.org/r/20240807064110.1003856-19-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Tested-by: Zi Yan # for x86_64 and arm64 Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron [arm64 + CXL via QEMU] Acked-by: Dan Williams Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Rafael J. Wysocki Cc: Rob Herring (Arm) Cc: Samuel Holland Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/numa_memblks.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index 6981cf97d2c9..968a590535ac 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -7,6 +7,10 @@ #define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) +extern int numa_distance_cnt; +void __init numa_set_distance(int from, int to, int distance); +void __init numa_reset_distance(void); + struct numa_memblk { u64 start; u64 end; -- cgit v1.2.3 From b0c4e27c687177aa36048110a12cba94bf291452 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 7 Aug 2024 09:41:03 +0300 Subject: mm: introduce numa_emulation Move numa_emulation code from arch/x86 to mm/numa_emulation.c This code will be later reused by arch_numa. No functional changes. Link: https://lkml.kernel.org/r/20240807064110.1003856-20-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Tested-by: Zi Yan # for x86_64 and arm64 Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron [arm64 + CXL via QEMU] Acked-by: Dan Williams Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Rafael J. Wysocki Cc: Rob Herring (Arm) Cc: Samuel Holland Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/numa_memblks.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index 968a590535ac..f81f98678074 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -34,6 +34,23 @@ int __init numa_register_meminfo(struct numa_meminfo *mi); void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, const struct numa_meminfo *mi); +#ifdef CONFIG_NUMA_EMU +int numa_emu_cmdline(char *str); +void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, + unsigned int nr_emu_nids); +u64 __init numa_emu_dma_end(void); +void __init numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt); +#else +static inline void numa_emulation(struct numa_meminfo *numa_meminfo, + int numa_dist_cnt) +{ } +static inline int numa_emu_cmdline(char *str) +{ + return -EINVAL; +} +#endif /* CONFIG_NUMA_EMU */ + #endif /* CONFIG_NUMA_MEMBLKS */ #endif /* __NUMA_MEMBLKS_H */ -- cgit v1.2.3 From 692d73d2f0f75e58828ab05872b3789ae1f486ef Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 7 Aug 2024 09:41:04 +0300 Subject: mm: numa_memblks: introduce numa_memblks_init Move most of x86::numa_init() to numa_memblks so that the latter will be more self-contained. With this numa_memblk data structures should not be exposed to the architecture specific code. Link: https://lkml.kernel.org/r/20240807064110.1003856-21-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Tested-by: Zi Yan # for x86_64 and arm64 Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron [arm64 + CXL via QEMU] Acked-by: Dan Williams Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Rafael J. Wysocki Cc: Rob Herring (Arm) Cc: Samuel Holland Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/numa_memblks.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index f81f98678074..07381320848f 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -34,6 +34,9 @@ int __init numa_register_meminfo(struct numa_meminfo *mi); void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, const struct numa_meminfo *mi); +int __init numa_memblks_init(int (*init_func)(void), + bool memblock_force_top_down); + #ifdef CONFIG_NUMA_EMU int numa_emu_cmdline(char *str); void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys, -- cgit v1.2.3 From 317ef4598bdc48c0196adc02ed4edaf82af279f2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 7 Aug 2024 09:41:05 +0300 Subject: mm: numa_memblks: make several functions and variables static Make functions and variables that are exclusively used by numa_memblks static. Move numa_nodemask_from_meminfo() before its callers to avoid forward declaration. Link: https://lkml.kernel.org/r/20240807064110.1003856-22-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Tested-by: Zi Yan # for x86_64 and arm64 Reviewed-by: Jonathan Cameron Tested-by: Jonathan Cameron [arm64 + CXL via QEMU] Acked-by: Dan Williams Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Rafael J. Wysocki Cc: Rob Herring (Arm) Cc: Samuel Holland Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/numa_memblks.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index 07381320848f..5c6e12ad0b7a 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -7,7 +7,6 @@ #define NR_NODE_MEMBLKS (MAX_NUMNODES * 2) -extern int numa_distance_cnt; void __init numa_set_distance(int from, int to, int distance); void __init numa_reset_distance(void); @@ -22,17 +21,10 @@ struct numa_meminfo { struct numa_memblk blk[NR_NODE_MEMBLKS]; }; -extern struct numa_meminfo numa_meminfo __initdata_or_meminfo; -extern struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; - int __init numa_add_memblk(int nodeid, u64 start, u64 end); void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct numa_meminfo *mi); -int __init numa_register_meminfo(struct numa_meminfo *mi); - -void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi); int __init numa_memblks_init(int (*init_func)(void), bool memblock_force_top_down); -- cgit v1.2.3 From 1b5695b02444660297cc54cab44f123ff28de2cc Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 7 Aug 2024 09:41:09 +0300 Subject: mm: make range-to-target_node lookup facility a part of numa_memblks The x86 implementation of range-to-target_node lookup (i.e. phys_to_target_node() and memory_add_physaddr_to_nid()) relies on numa_memblks. Since numa_memblks are now part of the generic code, move these functions from x86 to mm/numa_memblks.c and select CONFIG_NUMA_KEEP_MEMINFO when CONFIG_NUMA_MEMBLKS=y for dax and cxl. [rppt@kernel.org: fix build] Link: https://lkml.kernel.org/r/ZtVfSt_zloPdDqVB@kernel.org Link: https://lkml.kernel.org/r/20240807064110.1003856-26-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Jonathan Cameron Tested-by: Zi Yan # for x86_64 and arm64 Tested-by: Jonathan Cameron [arm64 + CXL via QEMU] Reviewed-by: Dan Williams Acked-by: David Hildenbrand Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Davidlohr Bueso Cc: David S. Miller Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Rafael J. Wysocki Cc: Rob Herring (Arm) Cc: Samuel Holland Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/numa_memblks.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa_memblks.h b/include/linux/numa_memblks.h index 5c6e12ad0b7a..cfad6ce7e1bd 100644 --- a/include/linux/numa_memblks.h +++ b/include/linux/numa_memblks.h @@ -46,6 +46,13 @@ static inline int numa_emu_cmdline(char *str) } #endif /* CONFIG_NUMA_EMU */ +#ifdef CONFIG_NUMA_KEEP_MEMINFO +extern int phys_to_target_node(u64 start); +#define phys_to_target_node phys_to_target_node +extern int memory_add_physaddr_to_nid(u64 start); +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +#endif /* CONFIG_NUMA_KEEP_MEMINFO */ + #endif /* CONFIG_NUMA_MEMBLKS */ #endif /* __NUMA_MEMBLKS_H */ -- cgit v1.2.3 From 650180760be6bb448609d4d155eef3b728ace641 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:02 +0800 Subject: mm: swap: extend swap_shmem_alloc() to support batch SWAP_MAP_SHMEM flag setting Patch series "support large folio swap-out and swap-in for shmem", v5. Shmem will support large folio allocation [1] [2] to get a better performance, however, the memory reclaim still splits the precious large folios when trying to swap-out shmem, which may lead to the memory fragmentation issue and can not take advantage of the large folio for shmeme. Moreover, the swap code already supports for swapping out large folio without split, and large folio swap-in[3] series is queued into mm-unstable branch. Hence this patch set also supports the large folio swap-out and swap-in for shmem. This patch (of 9): To support shmem large folio swap operations, add a new parameter to swap_shmem_alloc() that allows batch SWAP_MAP_SHMEM flag setting for shmem swap entries. While we are at it, using folio_nr_pages() to get the number of pages of the folio as a preparation. Link: https://lkml.kernel.org/r/cover.1723434324.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/99f64115d04b285e009580eb177352c57119ffd0.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Barry Song Cc: Chris Li Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1c8f844a9f0f..248db1dd7812 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -481,7 +481,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern void swap_shmem_alloc(swp_entry_t); +extern void swap_shmem_alloc(swp_entry_t, int); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t entry, int nr); extern void swap_free_nr(swp_entry_t entry, int nr_pages); @@ -548,7 +548,7 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline void swap_shmem_alloc(swp_entry_t swp) +static inline void swap_shmem_alloc(swp_entry_t swp, int nr) { } -- cgit v1.2.3 From 809bc86517cc408b5b8cb8e08e69096639432bc8 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 12 Aug 2024 15:42:10 +0800 Subject: mm: shmem: support large folio swap out Shmem will support large folio allocation [1] [2] to get a better performance, however, the memory reclaim still splits the precious large folios when trying to swap out shmem, which may lead to the memory fragmentation issue and can not take advantage of the large folio for shmeme. Moreover, the swap code already supports for swapping out large folio without split, hence this patch set supports the large folio swap out for shmem. Note the i915_gem_shmem driver still need to be split when swapping, thus add a new flag 'split_large_folio' for writeback_control to indicate spliting the large folio. [1] https://lore.kernel.org/all/cover.1717495894.git.baolin.wang@linux.alibaba.com/ [2] https://lore.kernel.org/all/20240515055719.32577-1-da.gomez@samsung.com/ [hughd@google.com: shmem_writepage() split folio at EOF before swapout] Link: https://lkml.kernel.org/r/aef55f8d-6040-692d-65e3-16150cce4440@google.com [baolin.wang@linux.alibaba.com: remove the wbc->split_large_folio per Hugh] Link: https://lkml.kernel.org/r/1236a002daa301b3b9ba73d6c0fab348427cf295.1724833399.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/d80c21abd20e1b0f5ca66b330f074060fb2f082d.1723434324.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Hugh Dickins Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Lance Yang Cc: Matthew Wilcox Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/writeback.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 1a54676d843a..51278327b3c6 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -79,6 +79,9 @@ struct writeback_control { */ struct swap_iocb **swap_plug; + /* Target list for splitting a large folio */ + struct list_head *list; + /* internal fields used by the ->writepages implementation: */ struct folio_batch fbatch; pgoff_t index; -- cgit v1.2.3 From f77f0c7514789577125c1b2df145703161736359 Mon Sep 17 00:00:00 2001 From: Kaiyang Zhao Date: Wed, 14 Aug 2024 17:42:27 +0000 Subject: mm,memcg: provide per-cgroup counters for NUMA balancing operations The ability to observe the demotion and promotion decisions made by the kernel on a per-cgroup basis is important for monitoring and tuning containerized workloads on machines equipped with tiered memory. Different containers in the system may experience drastically different memory tiering actions that cannot be distinguished from the global counters alone. For example, a container running a workload that has a much hotter memory accesses will likely see more promotions and fewer demotions, potentially depriving a colocated container of top tier memory to such an extent that its performance degrades unacceptably. For another example, some containers may exhibit longer periods between data reuse, causing much more numa_hint_faults than numa_pages_migrated. In this case, tuning hot_threshold_ms may be appropriate, but the signal can easily be lost if only global counters are available. In the long term, we hope to introduce per-cgroup control of promotion and demotion actions to implement memory placement policies in tiering. This patch set adds seven counters to memory.stat in a cgroup: numa_pages_migrated, numa_pte_updates, numa_hint_faults, pgdemote_kswapd, pgdemote_khugepaged, pgdemote_direct and pgpromote_success. pgdemote_* and pgpromote_success are also available in memory.numa_stat. count_memcg_events_mm() is added to count multiple event occurrences at once, and get_mem_cgroup_from_folio() is added because we need to get a reference to the memcg of a folio before it's migrated to track numa_pages_migrated. The accounting of PGDEMOTE_* is moved to shrink_inactive_list() before being changed to per-cgroup. [kaiyang2@cs.cmu.edu: add documentation of the memcg counters in cgroup-v2.rst] Link: https://lkml.kernel.org/r/20240814235122.252309-1-kaiyang2@cs.cmu.edu Link: https://lkml.kernel.org/r/20240814174227.30639-1-kaiyang2@cs.cmu.edu Signed-off-by: Kaiyang Zhao Cc: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Wei Xu Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 24 +++++++++++++++++++++--- include/linux/vmstat.h | 1 + 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ed170399179a..fe05fdb92779 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -784,6 +784,8 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_current(void); +struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio); + struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, @@ -1028,8 +1030,8 @@ static inline void count_memcg_folio_events(struct folio *folio, count_memcg_events(memcg, idx, nr); } -static inline void count_memcg_event_mm(struct mm_struct *mm, - enum vm_event_item idx) +static inline void count_memcg_events_mm(struct mm_struct *mm, + enum vm_event_item idx, unsigned long count) { struct mem_cgroup *memcg; @@ -1039,10 +1041,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) - count_memcg_events(memcg, idx, 1); + count_memcg_events(memcg, idx, count); rcu_read_unlock(); } +static inline void count_memcg_event_mm(struct mm_struct *mm, + enum vm_event_item idx) +{ + count_memcg_events_mm(mm, idx, 1); +} + static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { @@ -1262,6 +1270,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_current(void) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio) +{ + return NULL; +} + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { @@ -1484,6 +1497,11 @@ static inline void count_memcg_folio_events(struct folio *folio, { } +static inline void count_memcg_events_mm(struct mm_struct *mm, + enum vm_event_item idx, unsigned long count) +{ +} + static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 9eb77c9007e6..d2761bf8ff32 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -32,6 +32,7 @@ struct reclaim_stat { unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; + unsigned nr_demoted; }; /* Stat data for system wide items */ -- cgit v1.2.3 From e98337d11bbdfa3e3f0fb99aa93e40f97549e0cd Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Tue, 13 Aug 2024 21:54:49 -0600 Subject: mm/contig_alloc: support __GFP_COMP Patch series "mm/hugetlb: alloc/free gigantic folios", v2. Use __GFP_COMP for gigantic folios can greatly reduce not only the amount of code but also the allocation and free time. Approximate LOC to mm/hugetlb.c: +60, -240 Allocate and free 500 1GB hugeTLB memory without HVO by: time echo 500 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages time echo 0 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages Before After Alloc ~13s ~10s Free ~15s <1s The above magnitude generally holds for multiple x86 and arm64 CPU models. Perf profile before: Alloc - 99.99% alloc_pool_huge_folio - __alloc_fresh_hugetlb_folio - 83.23% alloc_contig_pages_noprof - 47.46% alloc_contig_range_noprof - 20.96% isolate_freepages_range 16.10% split_page - 14.10% start_isolate_page_range - 12.02% undo_isolate_page_range Free - update_and_free_pages_bulk - 87.71% free_contig_range - 76.02% free_unref_page - 41.30% free_unref_page_commit - 32.58% free_pcppages_bulk - 24.75% __free_one_page 13.96% _raw_spin_trylock 12.27% __update_and_free_hugetlb_folio Perf profile after: Alloc - 99.99% alloc_pool_huge_folio alloc_gigantic_folio - alloc_contig_pages_noprof - 59.15% alloc_contig_range_noprof - 20.72% start_isolate_page_range 20.64% prep_new_page - 17.13% undo_isolate_page_range Free - update_and_free_pages_bulk - __folio_put - __free_pages_ok 7.46% free_tail_page_prepare - 1.97% free_one_page 1.86% __free_one_page This patch (of 3): Support __GFP_COMP in alloc_contig_range(). When the flag is set, upon success the function returns a large folio prepared by prep_new_page(), rather than a range of order-0 pages prepared by split_free_pages() (which is renamed from split_map_pages()). alloc_contig_range() can be used to allocate folios larger than MAX_PAGE_ORDER, e.g., gigantic hugeTLB folios. So on the free path, free_one_page() needs to handle that by split_large_buddy(). [akpm@linux-foundation.org: fix folio_alloc_gigantic_noprof() WARN expression, per Yu Liao] Link: https://lkml.kernel.org/r/20240814035451.773331-1-yuzhao@google.com Link: https://lkml.kernel.org/r/20240814035451.773331-2-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Zi Yan Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Frank van der Linden Signed-off-by: Andrew Morton --- include/linux/gfp.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f53f76e0b17e..03ba9563c6db 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -446,4 +446,27 @@ extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_ #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); +#ifdef CONFIG_CONTIG_ALLOC +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, + int nid, nodemask_t *node) +{ + struct page *page; + + if (WARN_ON(!order || !(gfp & __GFP_COMP))) + return NULL; + + page = alloc_contig_pages_noprof(1 << order, gfp, nid, node); + + return page ? page_folio(page) : NULL; +} +#else +static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp, + int nid, nodemask_t *node) +{ + return NULL; +} +#endif +/* This should be paired with folio_put() rather than free_contig_range(). */ +#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__)) + #endif /* __LINUX_GFP_H */ -- cgit v1.2.3 From 463586e9ff398f951a9a63d98a3b93f44434d20c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Tue, 13 Aug 2024 21:54:50 -0600 Subject: mm/cma: add cma_{alloc,free}_folio() With alloc_contig_range() and free_contig_range() supporting large folios, CMA can allocate and free large folios too, by cma_alloc_folio() and cma_free_folio(). [yuzhao@google.com: fix WARN in cma_alloc_folio()] Link: https://lkml.kernel.org/r/Zsd0PgAQmbpR8jS6@google.com Link: https://lkml.kernel.org/r/20240814035451.773331-3-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Zi Yan Cc: Frank van der Linden Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/cma.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index 9db877506ea8..d15b64f51336 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -52,4 +52,20 @@ extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); extern void cma_reserve_pages_on_error(struct cma *cma); + +#ifdef CONFIG_CMA +struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp); +bool cma_free_folio(struct cma *cma, const struct folio *folio); +#else +static inline struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp) +{ + return NULL; +} + +static inline bool cma_free_folio(struct cma *cma, const struct folio *folio) +{ + return false; +} +#endif + #endif -- cgit v1.2.3 From cf54f310d0d313bce6505d010a555948b0ae0e2a Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Tue, 13 Aug 2024 21:54:51 -0600 Subject: mm/hugetlb: use __GFP_COMP for gigantic folios Use __GFP_COMP for gigantic folios to greatly reduce not only the amount of code but also the allocation and free time. LOC (approximately): +60, -240 Allocate and free 500 1GB hugeTLB memory without HVO by: time echo 500 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages time echo 0 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages Before After Alloc ~13s ~10s Free ~15s <1s The above magnitude generally holds for multiple x86 and arm64 CPU models. Link: https://lkml.kernel.org/r/20240814035451.773331-4-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: Frank van der Linden Acked-by: Zi Yan Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3100a52ceb73..98c47c394b89 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -896,10 +896,11 @@ static inline bool hugepage_movable_supported(struct hstate *h) /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepage_movable_supported(h)) - return GFP_HIGHUSER_MOVABLE; - else - return GFP_HIGHUSER; + gfp_t gfp = __GFP_COMP | __GFP_NOWARN; + + gfp |= hugepage_movable_supported(h) ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER; + + return gfp; } static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask) -- cgit v1.2.3 From d0b003ce97ad6518dea4b0ed70081df95de04179 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 16 Aug 2024 12:32:46 +0200 Subject: mm/rmap: use folio->_mapcount for small folios We have some cases left whereby we operate on small folios and still refer to page->_mapcount. Let's just use folio->_mapcount instead, which currently still overlays page->_mapcount, so no change. This change will make it easier to later spot any remaining users of page->_mapcount that target tail pages. Link: https://lkml.kernel.org/r/20240816103246.719209-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 0978c64f49d8..91b5935e8485 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -331,7 +331,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, switch (level) { case RMAP_LEVEL_PTE: if (!folio_test_large(folio)) { - atomic_inc(&page->_mapcount); + atomic_inc(&folio->_mapcount); break; } @@ -425,7 +425,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, if (!folio_test_large(folio)) { if (PageAnonExclusive(page)) ClearPageAnonExclusive(page); - atomic_inc(&page->_mapcount); + atomic_inc(&folio->_mapcount); break; } -- cgit v1.2.3 From 489a744e5fb16503b495ad1fa3dd1abf25b224e7 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Tue, 13 Aug 2024 00:34:35 +0200 Subject: mm: krealloc: clarify valid usage of __GFP_ZERO Properly document that if __GFP_ZERO logic is requested, callers must ensure that, starting with the initial memory allocation, every subsequent call to this API for the same memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that __GFP_ZERO is not fully honored by this API. Link: https://lkml.kernel.org/r/20240812223707.32049-2-dakr@kernel.org Signed-off-by: Danilo Krummrich Acked-by: David Rientjes Cc: Christoph Lameter Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/slab.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index c9cb42203183..2282e67a01c7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -733,6 +733,16 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz * @new_n: new number of elements to alloc * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * See krealloc_noprof() for further details. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. */ static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p, size_t new_n, -- cgit v1.2.3 From a759e37fb46708029c9c3c56c3b62e6f24d85cf5 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 18 Aug 2024 23:01:51 +0200 Subject: err.h: add ERR_PTR_PCPU(), PTR_ERR_PCPU() and IS_ERR_PCPU() macros Add ERR_PTR_PCPU(), PTR_ERR_PCPU() and IS_ERR_PCPU() macros that operate on pointers in the percpu address space. These macros remove the need for (__force void *) function argument casts (to avoid sparse -Wcast-from-as warnings). The patch will also avoid future build errors due to pointer address space mismatch with enabled strict percpu address space checks. Link: https://lkml.kernel.org/r/20240818210235.33481-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Acked-by: Catalin Marinas Signed-off-by: Andrew Morton --- include/linux/err.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/err.h b/include/linux/err.h index b5d9bb2a2349..a4dacd745fcf 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -41,6 +41,9 @@ static inline void * __must_check ERR_PTR(long error) return (void *) error; } +/* Return the pointer in the percpu address space. */ +#define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) + /** * PTR_ERR - Extract the error code from an error pointer. * @ptr: An error pointer. @@ -51,6 +54,9 @@ static inline long __must_check PTR_ERR(__force const void *ptr) return (long) ptr; } +/* Read an error pointer from the percpu address space. */ +#define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr))) + /** * IS_ERR - Detect an error pointer. * @ptr: The pointer to check. @@ -61,6 +67,9 @@ static inline bool __must_check IS_ERR(__force const void *ptr) return IS_ERR_VALUE((unsigned long)ptr); } +/* Read an error pointer from the percpu address space. */ +#define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr))) + /** * IS_ERR_OR_NULL - Detect an error pointer or a null pointer. * @ptr: The pointer to check. -- cgit v1.2.3 From ef5f379de302884b9b7ad9b62587a942a9f0bb55 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 20 Aug 2024 14:22:10 +0200 Subject: mm: always inline _compound_head() with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y We already force-inline page_fixed_fake_head(), page_is_fake_head() and PageTail(), however the compiler might decide that _compound_head() is not worthy to be inlined, because of page_fixed_fake_head(). The result is that, for example, PageAnonExclusive() now might involve a function call when checking PageHuge(), which performs a page_folio()->_compound_head() call. This can lead to a slight regression of the stress-ng.clone benchmark. This is not super-urgent to fix, but always inlining _compound_head() seems like the obvious thing to do for this primitive, similar to the other ones. This change restores the slight regression and a compilation with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP=y shows no relevant bloat [2]: add/remove: 15/14 grow/shrink: 79/87 up/down: 12836/-13917 (-1081) ... Total: Before=32786363, After=32785282, chg -0.00% [1] https://lkml.kernel.org/r/817150f2-abf7-430f-9973-540bd6cdd26f@intel.com [2] https://lore.kernel.org/all/116e117c-2821-401d-8e62-b85cdec37f4a@redhat.com/ Link: https://lkml.kernel.org/r/20240820122210.660140-1-david@redhat.com Fixes: c0bff412e67b ("mm: allow anon exclusive check over hugetlb tail pages") Signed-off-by: David Hildenbrand Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202407301049.5051dc19-oliver.sang@intel.com Tested-by: Yin Fengwei Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 17175a328e6b..2515ae85f191 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -232,7 +232,7 @@ static __always_inline int page_is_fake_head(const struct page *page) return page_fixed_fake_head(page) != page; } -static inline unsigned long _compound_head(const struct page *page) +static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); -- cgit v1.2.3 From 0a2d82946be67e02fdf85a4010606bdc0546ba44 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 20 Aug 2024 10:26:39 +0800 Subject: mm: allow read-ahead with IOCB_NOWAIT set Readahead support for IOCB_NOWAIT was introduced in commit 2e85abf053b9 ("mm: allow read-ahead with IOCB_NOWAIT set"). However, this implementation broke the semantics of IOCB_NOWAIT by potentially causing it to wait on I/O during memory reclamation. This behavior was later modified in commit efa8480a8316 ("fs: RWF_NOWAIT should imply IOCB_NOIO"). To resolve the blocking issue during memory reclamation, we can use memalloc_noio_{save,restore} to ensure non-blocking behavior. This change restores the original functionality, allowing preadv2(IOCB_NOWAIT) to trigger readahead if the file content is not present in the page cache. While this process may trigger direct memory reclamation, the __GFP_NORETRY flag is set in the readahead GFP flags, ensuring it won't block. A use case for this change is when we want to trigger readahead in the preadv2(2) syscall if the file cache is absent, but without waiting for certain filesystem locks, like xfs_ilock. A simple example is as follows: retry: if (preadv2(fd, iovec, cnt, offset, RWF_NOWAIT) < 0) { do_other_work(); goto retry; } Link: https://lore.gnuweeb.org/io-uring/20200624164127.GP21350@casper.infradead.org/ Link: https://lkml.kernel.org/r/20240820022639.89562-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Cc: Jens Axboe Cc: Matthew Wilcox Cc: Dave Chinner Cc: Jan Kara Cc: Christian Brauner Signed-off-by: Andrew Morton --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6ca11e241a24..8610242ec2ab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3461,7 +3461,6 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; - kiocb_flags |= IOCB_NOIO; } if (flags & RWF_ATOMIC) { if (rw_type != WRITE) -- cgit v1.2.3 From e880034cf7182aa35962bd30dfc9fa60476d56a4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 18:39:10 +0100 Subject: mm: introduce page_mapcount_is_type() Resolve the awkward "and add one to this opaque constant" test into a self-documenting inline function. Link: https://lkml.kernel.org/r/20240821173914.2270383-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +-- include/linux/page-flags.h | 12 +++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0fde51c0532f..6f6053d7ea6f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1228,8 +1228,7 @@ static inline int folio_mapcount(const struct folio *folio) if (likely(!folio_test_large(folio))) { mapcount = atomic_read(&folio->_mapcount) + 1; - /* Handle page_has_type() pages */ - if (mapcount < PAGE_MAPCOUNT_RESERVE + 1) + if (page_mapcount_is_type(mapcount)) mapcount = 0; return mapcount; } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2515ae85f191..0b5484fdbca1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -956,12 +956,18 @@ enum pagetype { #define folio_test_type(folio, flag) \ ((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) -static inline int page_type_has_type(unsigned int page_type) +static inline bool page_type_has_type(int page_type) { - return (int)page_type < PAGE_MAPCOUNT_RESERVE; + return page_type < PAGE_MAPCOUNT_RESERVE; } -static inline int page_has_type(const struct page *page) +/* This takes a mapcount which is one more than page->_mapcount */ +static inline bool page_mapcount_is_type(unsigned int mapcount) +{ + return page_type_has_type(mapcount - 1); +} + +static inline bool page_has_type(const struct page *page) { return page_type_has_type(READ_ONCE(page->page_type)); } -- cgit v1.2.3 From 4ffca5a96678c9fe1a39ec1e9c8fd326b57ea8a7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 18:39:11 +0100 Subject: mm: support only one page_type per page By using a few values in the top byte, users of page_type can store up to 24 bits of additional data in page_type. It also reduces the code size as (with replacement of READ_ONCE() with data_race()), the kernel can check just a single byte. eg: ffffffff811e3a79: 8b 47 30 mov 0x30(%rdi),%eax ffffffff811e3a7c: 55 push %rbp ffffffff811e3a7d: 48 89 e5 mov %rsp,%rbp ffffffff811e3a80: 25 00 00 00 82 and $0x82000000,%eax ffffffff811e3a85: 3d 00 00 00 80 cmp $0x80000000,%eax ffffffff811e3a8a: 74 4d je ffffffff811e3ad9 becomes: ffffffff811e3a69: 80 7f 33 f5 cmpb $0xf5,0x33(%rdi) ffffffff811e3a6d: 55 push %rbp ffffffff811e3a6e: 48 89 e5 mov %rsp,%rbp ffffffff811e3a71: 74 4d je ffffffff811e3ac0 replacing three instructions with one. [wangkefeng.wang@huawei.com: fix ubsan warnings] Link: https://lkml.kernel.org/r/2d19c48a-c550-4345-bf36-d05cd303c5de@huawei.com Link: https://lkml.kernel.org/r/20240821173914.2270383-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 68 +++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b5484fdbca1..7a1b2948b075 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -923,42 +923,29 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif /* - * For pages that are never mapped to userspace, - * page_type may be used. Because it is initialised to -1, we invert the - * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and - * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and - * low bits so that an underflow or overflow of _mapcount won't be - * mistaken for a page type value. + * For pages that do not use mapcount, page_type may be used. + * The low 24 bits of pagetype may be used for your own purposes, as long + * as you are careful to not affect the top 8 bits. The low bits of + * pagetype will be overwritten when you clear the page_type from the page. */ - enum pagetype { - PG_buddy = 0x40000000, - PG_offline = 0x20000000, - PG_table = 0x10000000, - PG_guard = 0x08000000, - PG_hugetlb = 0x04000000, - PG_slab = 0x02000000, - PG_zsmalloc = 0x01000000, - PG_unaccepted = 0x00800000, - - PAGE_TYPE_BASE = 0x80000000, - - /* - * Reserve 0xffff0000 - 0xfffffffe to catch _mapcount underflows and - * allow owners that set a type to reuse the lower 16 bit for their own - * purposes. - */ - PAGE_MAPCOUNT_RESERVE = ~0x0000ffff, + /* 0x00-0x7f are positive numbers, ie mapcount */ + /* Reserve 0x80-0xef for mapcount overflow. */ + PGTY_buddy = 0xf0, + PGTY_offline = 0xf1, + PGTY_table = 0xf2, + PGTY_guard = 0xf3, + PGTY_hugetlb = 0xf4, + PGTY_slab = 0xf5, + PGTY_zsmalloc = 0xf6, + PGTY_unaccepted = 0xf7, + + PGTY_mapcount_underflow = 0xff }; -#define PageType(page, flag) \ - ((READ_ONCE(page->page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) -#define folio_test_type(folio, flag) \ - ((READ_ONCE(folio->page.page_type) & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) - static inline bool page_type_has_type(int page_type) { - return page_type < PAGE_MAPCOUNT_RESERVE; + return page_type < (PGTY_mapcount_underflow << 24); } /* This takes a mapcount which is one more than page->_mapcount */ @@ -969,40 +956,41 @@ static inline bool page_mapcount_is_type(unsigned int mapcount) static inline bool page_has_type(const struct page *page) { - return page_type_has_type(READ_ONCE(page->page_type)); + return page_mapcount_is_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ -static __always_inline bool folio_test_##fname(const struct folio *folio)\ +static __always_inline bool folio_test_##fname(const struct folio *folio) \ { \ - return folio_test_type(folio, PG_##lname); \ + return data_race(folio->page.page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ - VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ - folio->page.page_type &= ~PG_##lname; \ + VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ + folio); \ + folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ - folio->page.page_type |= PG_##lname; \ + folio->page.page_type = UINT_MAX; \ } #define PAGE_TYPE_OPS(uname, lname, fname) \ FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ - return PageType(page, PG_##lname); \ + return data_race(page->page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ - VM_BUG_ON_PAGE(!PageType(page, 0), page); \ - page->page_type &= ~PG_##lname; \ + VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ + page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ - page->page_type |= PG_##lname; \ + page->page_type = UINT_MAX; \ } /* -- cgit v1.2.3 From bf03c806993091d8fb2fc0e98f07a8ef251c78f3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:34 +0100 Subject: mm: remove PageActive Patch series "Simplify the page flags a little". In the course of our folio conversions, we have made many page flags only used on folios, so we can now remove the page-based accessors. This should cut down compile time a little, and prevent new users from cropping up. There is more that could be done in this area, but it would produce merge conflicts, so I'll sit on those patches until next merge window. We now have line of sight to removing PG_private_2 and PG_private. This patch (of 10): This flag is now only used on folios, so we can remove all the page accessors. [akpm@linux-foundation.org: fix arch/powerpc/mm/pgtable-frag.c] Link: https://lkml.kernel.org/r/20240821193445.2294269-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240821193445.2294269-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7a1b2948b075..34347bee1217 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -510,8 +510,9 @@ PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) -PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) - TESTCLEARFLAG(Active, active, PF_HEAD) +FOLIO_FLAG(active, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ -- cgit v1.2.3 From 0b7582803649e0e58e5f8c9a4ede362777bc7eec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:35 +0100 Subject: mm: remove PageSwapBacked This flag is now only used on folios, so we can remove all the page accessors. Link: https://lkml.kernel.org/r/20240821193445.2294269-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 34347bee1217..cebb3dc0fd0c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -528,9 +528,9 @@ PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) -PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) - __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) - __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) +FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE) + __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE) /* * Private page markings that may be used by the filesystem that owns the page -- cgit v1.2.3 From 6f394ee9ddb4bd1879e42c9c8648a37f650bea99 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:36 +0100 Subject: mm: remove PageReadahead This flag is now only used on folios, so we can remove all the page accessors. Link: https://lkml.kernel.org/r/20240821193445.2294269-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cebb3dc0fd0c..71444223d630 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -553,8 +553,8 @@ PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) -PAGEFLAG(Readahead, readahead, PF_NO_COMPOUND) - TESTCLEARFLAG(Readahead, readahead, PF_NO_COMPOUND) +FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) #ifdef CONFIG_HIGHMEM /* -- cgit v1.2.3 From 32f51ead3d7771cdec29f75e08d50a76d2c6253d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:37 +0100 Subject: mm: remove PageSwapCache This flag is now only used on folios, so we can remove all the page accessors and reword the comments that refer to them. Link: https://lkml.kernel.org/r/20240821193445.2294269-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 +- include/linux/page-flags.h | 11 +++-------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2419e60c9a7f..6e3bdf8e38bc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -109,7 +109,7 @@ struct page { /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. - * Used for swp_entry_t if PageSwapCache. + * Used for swp_entry_t if swapcache flag set. * Indicates order in the buddy system if PageBuddy. */ unsigned long private; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 71444223d630..b31d2c50b6f0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -574,15 +574,10 @@ static __always_inline bool folio_test_swapcache(const struct folio *folio) test_bit(PG_swapcache, const_folio_flags(folio, 0)); } -static __always_inline bool PageSwapCache(const struct page *page) -{ - return folio_test_swapcache(page_folio(page)); -} - -SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) -CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) +FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE) +FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) #else -PAGEFLAG_FALSE(SwapCache, swapcache) +FOLIO_FLAG_FALSE(swapcache) #endif PAGEFLAG(Unevictable, unevictable, PF_HEAD) -- cgit v1.2.3 From cb29e7941d5d57576392a52c08077ef327ec8e17 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:38 +0100 Subject: mm: remove PageUnevictable There is only one caller of PageUnevictable() left; convert it to call folio_test_unevictable() and remove all the page accessors. Link: https://lkml.kernel.org/r/20240821193445.2294269-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b31d2c50b6f0..2150502195d5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -580,9 +580,9 @@ FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) FOLIO_FLAG_FALSE(swapcache) #endif -PAGEFLAG(Unevictable, unevictable, PF_HEAD) - __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) - TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) +FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) -- cgit v1.2.3 From 99f86bbda31790f2ca0e365f02650585536929ab Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:39 +0100 Subject: mm: remove PageMlocked This flag is now only used on folios, so we can remove all the page accessors. Link: https://lkml.kernel.org/r/20240821193445.2294269-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2150502195d5..08cce16c82aa 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -585,12 +585,15 @@ FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU -PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) - __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) - TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) +FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) + __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) + FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE) #else -PAGEFLAG_FALSE(Mlocked, mlocked) __CLEARPAGEFLAG_NOOP(Mlocked, mlocked) - TESTSCFLAG_FALSE(Mlocked, mlocked) +FOLIO_FLAG_FALSE(mlocked) + __FOLIO_CLEAR_FLAG_NOOP(mlocked) + FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked) + FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED -- cgit v1.2.3 From 3026bc1e82b695d69cde21712512922abe74aab9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:40 +0100 Subject: mm: remove PageOwnerPriv1 While there are many aliases for this flag, nobody actually uses the *PageOwnerPriv1() nor folio_*_owner_priv_1() accessors. Remove them. Link: https://lkml.kernel.org/r/20240821193445.2294269-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 08cce16c82aa..458d2dc5124d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -539,8 +539,6 @@ FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) */ PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) -PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) - TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) /* * Only test-and-set exist for PG_writeback. The unconditional operators are -- cgit v1.2.3 From 6dc151388e44957d471633ce70d72e3d27ae836d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:41 +0100 Subject: mm: remove page_has_private() This function has no more callers, except folio_has_private(). Combine the two functions. Link: https://lkml.kernel.org/r/20240821193445.2294269-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 458d2dc5124d..017205048cab 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1175,20 +1175,15 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** - * page_has_private - Determine if page has private stuff - * @page: The page to be checked + * folio_has_private - Determine if folio has private stuff + * @folio: The folio to be checked * - * Determine if a page has private stuff, indicating that release routines + * Determine if a folio has private stuff, indicating that release routines * should be invoked upon it. */ -static inline int page_has_private(const struct page *page) +static inline int folio_has_private(const struct folio *folio) { - return !!(page->flags & PAGE_FLAGS_PRIVATE); -} - -static inline bool folio_has_private(const struct folio *folio) -{ - return page_has_private(&folio->page); + return !!(folio->flags & PAGE_FLAGS_PRIVATE); } #undef PF_ANY -- cgit v1.2.3 From 02e1960aafac33721401dcd92e915325fdb524b2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:42 +0100 Subject: mm: rename PG_mappedtodisk to PG_owner_2 This flag has similar constraints to PG_owner_priv_1 -- it is ignored by core code, and is entirely for the use of the code which allocated the folio. Since the pagecache does not use it, individual filesystems can use it. The bufferhead code does use it, so filesystems which use the buffer cache must not use it for another purpose. Link: https://lkml.kernel.org/r/20240821193445.2294269-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/kernel-page-flags.h | 2 +- include/linux/page-flags.h | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index 859f4b0c1b2b..7c587a711be1 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -10,7 +10,7 @@ */ #define KPF_RESERVED 32 #define KPF_MLOCKED 33 -#define KPF_MAPPEDTODISK 34 +#define KPF_OWNER_2 34 #define KPF_PRIVATE 35 #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 017205048cab..1ff3d172c22c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -101,12 +101,12 @@ enum pageflags { PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, - PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ + PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ + PG_owner_2, /* Owner use. If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ - PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ @@ -131,6 +131,11 @@ enum pageflags { PG_readahead = PG_reclaim, + /* Anonymous memory (and shmem) */ + PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* Some filesystems */ + PG_checked = PG_owner_priv_1, + /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped @@ -138,13 +143,13 @@ enum pageflags { * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ - PG_anon_exclusive = PG_mappedtodisk, - - /* Filesystems */ - PG_checked = PG_owner_priv_1, + PG_anon_exclusive = PG_owner_2, - /* SwapBacked */ - PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ + /* + * Set if all buffer heads in the folio are mapped. + * Filesystems which do not use BHs can use it for their own purpose. + */ + PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes @@ -540,6 +545,9 @@ FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) PAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) +/* owner_2 can be set on tail pages for anon memory */ +FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) + /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. -- cgit v1.2.3 From 7a87225ae2c6c317c7b80cf599e5cf0eee699196 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 20:34:43 +0100 Subject: x86: remove PG_uncached Convert x86 to use PG_arch_2 instead of PG_uncached and remove PG_uncached. Link: https://lkml.kernel.org/r/20240821193445.2294269-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/kernel-page-flags.h | 1 - include/linux/page-flags.h | 13 +++---------- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel-page-flags.h b/include/linux/kernel-page-flags.h index 7c587a711be1..196778a087c4 100644 --- a/include/linux/kernel-page-flags.h +++ b/include/linux/kernel-page-flags.h @@ -15,7 +15,6 @@ #define KPF_PRIVATE_2 36 #define KPF_OWNER_PRIVATE 37 #define KPF_ARCH 38 -#define KPF_UNCACHED 39 #define KPF_SOFTDIRTY 40 #define KPF_ARCH_2 41 #define KPF_ARCH_3 42 diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1ff3d172c22c..2175ebceb41c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -113,9 +113,6 @@ enum pageflags { #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - PG_uncached, /* Page has been mapped as uncached */ -#endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif @@ -123,8 +120,10 @@ enum pageflags { PG_young, PG_idle, #endif -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, @@ -602,12 +601,6 @@ FOLIO_FLAG_FALSE(mlocked) FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED -PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) -#else -PAGEFLAG_FALSE(Uncached, uncached) -#endif - #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) -- cgit v1.2.3 From 0ca0c24e3211586d44a31b3c4767fe3f43a008a7 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 23 Aug 2024 20:04:39 +0100 Subject: mm: store zero pages to be swapped out in a bitmap Patch series "mm: store zero pages to be swapped out in a bitmap", v8. As shown in the patch series that introduced the zswap same-filled optimization [1], 10-20% of the pages stored in zswap are same-filled. This is also observed across Meta's server fleet. By using VM counters in swap_writepage (not included in this patchseries) it was found that less than 1% of the same-filled pages to be swapped out are non-zero pages. For conventional swap setup (without zswap), rather than reading/writing these pages to flash resulting in increased I/O and flash wear, a bitmap can be used to mark these pages as zero at write time, and the pages can be filled at read time if the bit corresponding to the page is set. When using zswap with swap, this also means that a zswap_entry does not need to be allocated for zero filled pages resulting in memory savings which would offset the memory used for the bitmap. A similar attempt was made earlier in [2] where zswap would only track zero-filled pages instead of same-filled. This patchseries adds zero-filled pages optimization to swap (hence it can be used even if zswap is disabled) and removes the same-filled code from zswap (as only 1% of the same-filled pages are non-zero), simplifying code. [1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/ [2] https://lore.kernel.org/lkml/20240325235018.2028408-1-yosryahmed@google.com/ This patch (of 2): Approximately 10-20% of pages to be swapped out are zero pages [1]. Rather than reading/writing these pages to flash resulting in increased I/O and flash wear, a bitmap can be used to mark these pages as zero at write time, and the pages can be filled at read time if the bit corresponding to the page is set. With this patch, NVMe writes in Meta server fleet decreased by almost 10% with conventional swap setup (zswap disabled). [1] https://lore.kernel.org/all/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1/ Link: https://lkml.kernel.org/r/20240823190545.979059-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20240823190545.979059-2-usamaarif642@gmail.com Signed-off-by: Usama Arif Reviewed-by: Chengming Zhou Reviewed-by: Yosry Ahmed Reviewed-by: Nhat Pham Acked-by: Johannes Weiner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Cc: Usama Arif Cc: Andi Kleen Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 248db1dd7812..ca533b478c21 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -296,6 +296,7 @@ struct swap_info_struct { signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ struct list_head full_clusters; /* full clusters list */ -- cgit v1.2.3 From 63fc66f5b6b18f39269a66cf34d8cb7a24fbfe88 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 30 Aug 2024 00:00:58 -0400 Subject: ipc/shm, mm: drop do_vma_munmap() The do_vma_munmap() wrapper existed for callers that didn't have a vma iterator and needed to check the vma mseal status prior to calling the underlying munmap(). All callers now use a vma iterator and since the mseal check has been moved to do_vmi_align_munmap() and the vmas are aligned, this function can just be called instead. do_vmi_align_munmap() can no longer be static as ipc/shm is using it and it is exported via the mm.h header. Link: https://lkml.kernel.org/r/20240830040101.822209-19-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Cc: Bert Karwatzki Cc: Jeff Xu Cc: Jiri Olsa Cc: Kees Cook Cc: Lorenzo Stoakes Cc: Mark Brown Cc: Matthew Wilcox Cc: "Paul E. McKenney" Cc: Paul Moore Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6f6053d7ea6f..b0ff06d18c71 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3287,14 +3287,14 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); +int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU -extern int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf, bool unlock); extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) -- cgit v1.2.3 From f1264e9531b0fbadf88e0b9c8143c546343b481c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 27 Aug 2024 19:47:27 +0800 Subject: mm: migrate: add isolate_folio_to_list() Add isolate_folio_to_list() helper to try to isolate HugeTLB, no-LRU movable and LRU folios to a list, which will be reused by do_migrate_range() from memory hotplug soon, also drop the mf_isolate_folio() since we could directly use new helper in the soft_offline_in_use_page(). Link: https://lkml.kernel.org/r/20240827114728.3212578-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Acked-by: Miaohe Lin Tested-by: Miaohe Lin Cc: Dan Carpenter Cc: Jonathan Cameron Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/migrate.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 644be30b69c8..002e49b2ebd9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -70,6 +70,7 @@ int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free, unsigned int *ret_succeeded); struct folio *alloc_migration_target(struct folio *src, unsigned long private); bool isolate_movable_page(struct page *page, isolate_mode_t mode); +bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); @@ -91,6 +92,8 @@ static inline struct folio *alloc_migration_target(struct folio *src, { return NULL; } static inline bool isolate_movable_page(struct page *page, isolate_mode_t mode) { return false; } +static inline bool isolate_folio_to_list(struct folio *folio, struct list_head *list) + { return false; } static inline int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) -- cgit v1.2.3 From d29e741cad3f8f41df1834bf74df79380c1c6c6d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 19 Aug 2024 17:17:04 +0200 Subject: gpio: davinci: drop platform data support There are no more any board files that use the platform data for gpio-davinci. We can remove the header defining it and port the code to no longer store any context in pdata. Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240819151705.37258-1-brgl@bgdev.pl Signed-off-by: Bartosz Golaszewski --- include/linux/platform_data/gpio-davinci.h | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 include/linux/platform_data/gpio-davinci.h (limited to 'include/linux') diff --git a/include/linux/platform_data/gpio-davinci.h b/include/linux/platform_data/gpio-davinci.h deleted file mode 100644 index b82e44662efe..000000000000 --- a/include/linux/platform_data/gpio-davinci.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * DaVinci GPIO Platform Related Defines - * - * Copyright (C) 2013 Texas Instruments Incorporated - https://www.ti.com/ - */ - -#ifndef __DAVINCI_GPIO_PLATFORM_H -#define __DAVINCI_GPIO_PLATFORM_H - -struct davinci_gpio_platform_data { - bool no_auto_base; - u32 base; - u32 ngpio; - u32 gpio_unbanked; -}; - -/* Convert GPIO signal to GPIO pin number */ -#define GPIO_TO_PIN(bank, gpio) (16 * (bank) + (gpio)) - -#endif -- cgit v1.2.3 From 8b1451f2f723f845c05b8bad3d4c45de284338b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 3 Sep 2024 21:54:28 -1000 Subject: sched_ext: Replace SCX_TASK_BAL_KEEP with SCX_RQ_BAL_KEEP SCX_TASK_BAL_KEEP is used by balance_one() to tell pick_next_task_scx() to keep running the current task. It's not really a task property. Replace it with SCX_RQ_BAL_KEEP which resides in rq->scx.flags and is a better fit for the usage. Also, the existing clearing rule is unnecessarily strict and makes it difficult to use with core-sched. Just clear it on entry to balance_one(). Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 69f68e2121a8..db2a266113ac 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -71,7 +71,6 @@ struct scx_dispatch_q { /* scx_entity.flags */ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ - SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ -- cgit v1.2.3 From 47f218d108950984b24af81f66356ceda380eb74 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 29 Aug 2024 21:06:17 -0300 Subject: iommu/amd: Store the nid in io_pgtable_cfg instead of the domain We already have memory in the union here that is being wasted in AMD's case, use it to store the nid. Putting the nid here further isolates the io_pgtable code from the struct protection_domain. Fixup protection_domain_alloc so that the NID from the device is provided, at this point dev is never NULL for AMD so this will now allocate the first table pointer on the correct NUMA node. Signed-off-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/8-v2-831cdc4d00f3+1a315-amd_iopgtbl_jgg@nvidia.com Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index f9a81761bfce..b1ecfc3cd5bc 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -171,6 +171,10 @@ struct io_pgtable_cfg { u64 ttbr[4]; u32 n_ttbrs; } apple_dart_cfg; + + struct { + int nid; + } amd; }; }; -- cgit v1.2.3 From 9f82f15ddfdd60bb9820f09737333b2345e22ab3 Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:46 +0100 Subject: mm: use ARCH_PKEY_BITS to define VM_PKEY_BITN Use the new CONFIG_ARCH_PKEY_BITS to simplify setting these bits for different architectures. Signed-off-by: Joey Gouly Cc: Andrew Morton Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Acked-by: Dave Hansen Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20240822151113.1479789-4-joey.gouly@arm.com Signed-off-by: Will Deacon --- include/linux/mm.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76..fb6ccd93f589 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -330,12 +330,16 @@ extern unsigned int kobjsize(const void *objp); #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS -# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 -# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ -# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */ -# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 -# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 -#ifdef CONFIG_PPC +# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 +# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 +# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 +# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 +#if CONFIG_ARCH_PKEY_BITS > 3 +# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 +#else +# define VM_PKEY_BIT3 0 +#endif +#if CONFIG_ARCH_PKEY_BITS > 4 # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 -- cgit v1.2.3 From facaa1373c9aabf8e34109a9cb205ad0f3a8584e Mon Sep 17 00:00:00 2001 From: Joey Gouly Date: Thu, 22 Aug 2024 16:10:55 +0100 Subject: arm64: re-order MTE VM_ flags VM_PKEY_BIT[012] will use VM_HIGH_ARCH_[012], move the MTE VM flags to accommodate this. Signed-off-by: Joey Gouly Cc: Catalin Marinas Cc: Will Deacon Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20240822151113.1479789-13-joey.gouly@arm.com Signed-off-by: Will Deacon --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fb6ccd93f589..406512c16471 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -378,8 +378,8 @@ extern unsigned int kobjsize(const void *objp); #endif #if defined(CONFIG_ARM64_MTE) -# define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ -# define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ +# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ +# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE -- cgit v1.2.3 From c6fb88a5270f4ba24859f5ecb61cfc731ef16300 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 4 Sep 2024 21:51:50 +0900 Subject: firewire: core: allocate workqueue to handle isochronous contexts in card This commit adds a workqueue dedicated for isochronous context processing. The workqueue is allocated per instance of fw_card structure to satisfy the following characteristics descending from 1394 OHCI specification: In 1394 OHCI specification, memory pages are reserved to each isochronous context dedicated to DMA transmission. It allows to operate these per-context pages concurrently. Software can schedule hardware interrupt for several isochronous context to the same cycle, thus WQ_UNBOUND is specified. Additionally, it is sleepable to operate the content of pages, thus WQ_BH is not used. The isochronous context delivers the packets with time stamp, thus WQ_HIGHPRI is specified for semi real-time data such as IEC 61883-1/6 protocol implemented by ALSA firewire stack. The isochronous context is not used by the implementation of SCSI over IEEE1394 protocol (sbp2), thus WQ_MEM_RECLAIM is not specified. It is useful for users to adjust cpu affinity of the workqueue depending on their work loads, thus WQ_SYS is specified to expose the attributes to user space. Tested-by: Edmund Raile Link: https://lore.kernel.org/r/20240904125155.461886-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 1cca14cf5652..10e135d60824 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -134,6 +134,8 @@ struct fw_card { __be32 topology_map[(CSR_TOPOLOGY_MAP_END - CSR_TOPOLOGY_MAP) / 4]; __be32 maint_utility_register; + + struct workqueue_struct *isoc_wq; }; static inline struct fw_card *fw_card_get(struct fw_card *card) -- cgit v1.2.3 From 4f55ad754d6b7b6fa4ebcfd8c50f7e53e661a78e Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Wed, 4 Sep 2024 21:51:51 +0900 Subject: firewire: core: add local API to queue work item to workqueue specific to isochronous contexts In the previous commit, the workqueue is added per the instance of fw_card structure for isochronous contexts. The workqueue is designed to be used by the implementation of fw_card_driver structure underlying the fw_card. This commit adds some local APIs to be used by the implementation. Tested-by: Edmund Raile Link: https://lore.kernel.org/r/20240904125155.461886-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 10e135d60824..72f497b61739 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -511,6 +511,7 @@ union fw_iso_callback { struct fw_iso_context { struct fw_card *card; + struct work_struct work; int type; int channel; int speed; -- cgit v1.2.3 From bd07d864522e7c3e4ee364e91aee8754992f5855 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 4 Sep 2024 14:11:20 +0206 Subject: printk: nbcon: Add function for printers to reacquire ownership Since ownership can be lost at any time due to handover or takeover, a printing context _must_ be prepared to back out immediately and carefully. However, there are scenarios where the printing context must reacquire ownership in order to finalize or revert hardware changes. One such example is when interrupts are disabled during printing. No other context will automagically re-enable the interrupts. For this case, the disabling context _must_ reacquire nbcon ownership so that it can re-enable the interrupts. Provide nbcon_reacquire_nobuf() for exactly this purpose. It allows a printing context to reacquire ownership using the same priority as its previous ownership. Note that after a successful reacquire the printing context will have no output buffer because that has been lost. This function cannot be used to resume printing. Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240904120536.115780-2-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/console.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 9a13f91b0c43..88050d30a9cc 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -366,6 +366,10 @@ struct console { * * The callback should allow the takeover whenever it is safe. It * increases the chance to see messages when the system is in trouble. + * If the driver must reacquire ownership in order to finalize or + * revert hardware changes, nbcon_reacquire_nobuf() can be used. + * However, on reacquire the buffer content is no longer available. A + * reacquire cannot be used to resume printing. * * The callback can be called from any context (including NMI). * Therefore it must avoid usage of any locking and instead rely @@ -558,12 +562,14 @@ extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); +extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); #else static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } +static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } #endif extern int console_set_on_cmdline; -- cgit v1.2.3 From 76f258bf3f2aae570209319944703a92ac64e29e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Sep 2024 14:11:25 +0206 Subject: printk: nbcon: Introduce printer kthreads Provide the main implementation for running a printer kthread per nbcon console that is takeover/handover aware. This includes: - new mandatory write_thread() callback - kthread creation - kthread main printing loop - kthread wakeup mechanism - kthread shutdown kthread creation is a bit tricky because consoles may register before kthreads can be created. In such cases, registration will succeed, even though no kthread exists. Once kthreads can be created, an early_initcall will set @printk_kthreads_ready. If there are no registered boot consoles, the early_initcall creates the kthreads for all registered nbcon consoles. If kthread creation fails, the related console is unregistered. If there are registered boot consoles when @printk_kthreads_ready is set, no kthreads are created until the final boot console unregisters. Once kthread creation finally occurs, @printk_kthreads_running is set so that the system knows kthreads are available for all registered nbcon consoles. If @printk_kthreads_running is already set when the console is registering, the kthread is created during registration. If kthread creation fails, the registration will fail. Until @printk_kthreads_running is set, console printing occurs directly via the console_lock. kthread shutdown on system shutdown/reboot is necessary to ensure the printer kthreads finish their printing so that the system can cleanly transition back to direct printing via the console_lock in order to reliably push out the final shutdown/reboot messages. @printk_kthreads_running is cleared before shutting down the individual kthreads. The kthread uses a new mandatory write_thread() callback that is called with both device_lock() and the console context acquired. The console ownership handling is necessary for synchronization against write_atomic() which is synchronized only via the console context ownership. The device_lock() serializes acquiring the console context with NBCON_PRIO_NORMAL. It is needed in case the device_lock() does not disable preemption. It prevents the following race: CPU0 CPU1 [ task A ] nbcon_context_try_acquire() # success with NORMAL prio # .unsafe == false; // safe for takeover [ schedule: task A -> B ] WARN_ON() nbcon_atomic_flush_pending() nbcon_context_try_acquire() # success with EMERGENCY prio # flushing nbcon_context_release() # HERE: con->nbcon_state is free # to take by anyone !!! nbcon_context_try_acquire() # success with NORMAL prio [ task B ] [ schedule: task B -> A ] nbcon_enter_unsafe() nbcon_context_can_proceed() BUG: nbcon_context_can_proceed() returns "true" because the console is owned by a context on CPU0 with NBCON_PRIO_NORMAL. But it should return "false". The console is owned by a context from task B and we do the check in a context from task A. Note that with these changes, the printer kthreads do not yet take over full responsibility for nbcon printing during normal operation. These changes only focus on the lifecycle of the kthreads. Co-developed-by: John Ogness Signed-off-by: John Ogness Signed-off-by: Thomas Gleixner (Intel) Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240904120536.115780-7-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/console.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 88050d30a9cc..788ce9c829f6 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -16,7 +16,9 @@ #include #include +#include #include +#include #include #include @@ -324,6 +326,9 @@ struct nbcon_write_context { * @nbcon_seq: Sequence number of the next record for nbcon to print * @nbcon_device_ctxt: Context available for non-printing operations * @pbufs: Pointer to nbcon private buffer + * @kthread: Printer kthread for this console + * @rcuwait: RCU-safe wait object for @kthread waking + * @irq_work: Defer @kthread waking to IRQ work context */ struct console { char name[16]; @@ -377,6 +382,37 @@ struct console { */ void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt); + /** + * @write_thread: + * + * NBCON callback to write out text in task context. + * + * This callback must be called only in task context with both + * device_lock() and the nbcon console acquired with + * NBCON_PRIO_NORMAL. + * + * The same rules for console ownership verification and unsafe + * sections handling applies as with write_atomic(). + * + * The console ownership handling is necessary for synchronization + * against write_atomic() which is synchronized only via the context. + * + * The device_lock() provides the primary serialization for operations + * on the device. It might be as relaxed (mutex)[*] or as tight + * (disabled preemption and interrupts) as needed. It allows + * the kthread to operate in the least restrictive mode[**]. + * + * [*] Standalone nbcon_context_try_acquire() is not safe with + * the preemption enabled, see nbcon_owner_matches(). But it + * can be safe when always called in the preemptive context + * under the device_lock(). + * + * [**] The device_lock() makes sure that nbcon_context_try_acquire() + * would never need to spin which is important especially with + * PREEMPT_RT. + */ + void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt); + /** * @device_lock: * @@ -423,7 +459,11 @@ struct console { atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct nbcon_context __private nbcon_device_ctxt; + struct printk_buffers *pbufs; + struct task_struct *kthread; + struct rcuwait rcuwait; + struct irq_work irq_work; }; #ifdef CONFIG_LOCKDEP -- cgit v1.2.3 From 5102981d5e2a5a7a12424b5d26c7524ac2899898 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 4 Sep 2024 14:11:30 +0206 Subject: printk: nbcon: Show replay message on takeover An emergency or panic context can takeover console ownership while the current owner was printing a printk message. The atomic printer will re-print the message that the previous owner was printing. However, this can look confusing to the user and may even seem as though a message was lost. [3430014.1 [3430014.181123] usb 1-2: Product: USB Audio Add a new field @nbcon_prev_seq to struct console to track the sequence number to print that was assigned to the previous console owner. If this matches the sequence number to print that the current owner is assigned, then a takeover must have occurred. In this case, print an additional message to inform the user that the previous message is being printed again. [3430014.1 ** replaying previous printk message ** [3430014.181123] usb 1-2: Product: USB Audio Signed-off-by: John Ogness Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20240904120536.115780-12-john.ogness@linutronix.de Signed-off-by: Petr Mladek --- include/linux/console.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 788ce9c829f6..eba367bf605d 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -325,6 +325,7 @@ struct nbcon_write_context { * @nbcon_state: State for nbcon consoles * @nbcon_seq: Sequence number of the next record for nbcon to print * @nbcon_device_ctxt: Context available for non-printing operations + * @nbcon_prev_seq: Seq num the previous nbcon owner was assigned to print * @pbufs: Pointer to nbcon private buffer * @kthread: Printer kthread for this console * @rcuwait: RCU-safe wait object for @kthread waking @@ -459,6 +460,7 @@ struct console { atomic_t __private nbcon_state; atomic_long_t __private nbcon_seq; struct nbcon_context __private nbcon_device_ctxt; + atomic_long_t __private nbcon_prev_seq; struct printk_buffers *pbufs; struct task_struct *kthread; -- cgit v1.2.3 From a4d398a573d0f0c98c39d4af1866a3edaec4959c Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Mon, 2 Sep 2024 07:43:23 +0100 Subject: ARM: 9416/1: amba: make amba_bustype constant Since commit d492cc2573a0 ("driver core: device.h: make struct bus_type a const *"), the driver core can properly handle constant struct bus_type, move the amba_bustype variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Kunwu Chan Signed-off-by: Russell King (Oracle) --- include/linux/amba/bus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 958a55bcc708..dda2f3ea89cb 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -105,7 +105,7 @@ enum amba_vendor { AMBA_VENDOR_LSI = 0xb6, }; -extern struct bus_type amba_bustype; +extern const struct bus_type amba_bustype; #define to_amba_device(d) container_of_const(d, struct amba_device, dev) -- cgit v1.2.3 From 2097154a10c6ee78be8796411e5d0ad81ee06ed6 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Tue, 3 Sep 2024 17:16:12 +0200 Subject: namespace: introduce SB_I_NOIDMAP flag Right now we determine if filesystem support vfs idmappings or not basing on the FS_ALLOW_IDMAP flag presence. This "static" way works perfecly well for local filesystems like ext4, xfs, btrfs, etc. But for network-like filesystems like fuse, cephfs this approach is not ideal, because sometimes proper support of vfs idmaps requires some extensions for the on-wire protocol, which implies that changes have to be made not only in the Linux kernel code but also in the 3rd party components like libfuse, cephfs MDS server and so on. We have seen that issue during our work on cephfs idmapped mounts [1] with Christian, but right now I'm working on the idmapped mounts support for fuse/virtiofs and I think that it is a right time for this extension. [1] 5ccd8530dd7 ("ceph: handle idmapped mounts in create_request_message()") Suggested-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..6ff547ef21f2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1189,6 +1189,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ +#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ /* Possible states of 'frozen' field */ enum { -- cgit v1.2.3 From 940ce73bdec5a020fec058ba947d2bf627462c53 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 4 Sep 2024 11:08:44 -0700 Subject: bpf: Remove the insn_buf array stack usage from the inline_bpf_loop() This patch removes the insn_buf array stack usage from the inline_bpf_loop(). Instead, the env->insn_buf is used. The usage in inline_bpf_loop() needs more than 16 insn, so the INSN_BUF_SIZE needs to be increased from 16 to 32. The compiler stack size warning on the verifier is gone after this change. Cc: Eduard Zingerman Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240904180847.56947-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e20207315a9..8458632824a4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -24,7 +24,7 @@ */ #define TMP_STR_BUF_LEN 320 /* Patch buffer size */ -#define INSN_BUF_SIZE 16 +#define INSN_BUF_SIZE 32 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that -- cgit v1.2.3 From a8532fac7b5d27b8d62008a89593dccb6f9786ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Aug 2024 22:02:34 -1000 Subject: sched_ext: TASK_DEAD tasks must be switched into SCX on ops_enable During scx_ops_enable(), SCX needs to invoke the sleepable ops.init_task() on every task. To do this, it does get_task_struct() on each iterated task, drop the lock and then call ops.init_task(). However, a TASK_DEAD task may already have lost all its usage count and be waiting for RCU grace period to be freed. If get_task_struct() is called on such task, use-after-free can happen. To avoid such situations, scx_ops_enable() skips initialization of TASK_DEAD tasks, which seems safe as they are never going to be scheduled again. Unfortunately, a racing sched_setscheduler(2) can grab the task before the task is unhashed and then continue to e.g. move the task from RT to SCX after TASK_DEAD is set and ops_enable skipped the task. As the task hasn't gone through scx_ops_init_task(), scx_ops_enable_task() called from switching_to_scx() triggers the following warning: sched_ext: Invalid task state transition 0 -> 3 for stress-ng-race-[2872] WARNING: CPU: 6 PID: 2367 at kernel/sched/ext.c:3327 scx_ops_enable_task+0x18f/0x1f0 ... RIP: 0010:scx_ops_enable_task+0x18f/0x1f0 ... switching_to_scx+0x13/0xa0 __sched_setscheduler+0x84e/0xa50 do_sched_setscheduler+0x104/0x1c0 __x64_sys_sched_setscheduler+0x18/0x30 do_syscall_64+0x7b/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e As in the ops_disable path, it just doesn't seem like a good idea to leave any task in an inconsistent state, even when the task is dead. The root cause is ops_enable not being able to tell reliably whether a task is truly dead (no one else is looking at it and it's about to be freed) and was testing TASK_DEAD instead. Fix it by testing the task's usage count directly. - ops_init no longer ignores TASK_DEAD tasks. As now all users iterate all tasks, @include_dead is removed from scx_task_iter_next_locked() along with dead task filtering. - tryget_task_struct() is added. Tasks are skipped iff tryget_task_struct() fails. Signed-off-by: Tejun Heo Cc: David Vernet Cc: Peter Zijlstra --- include/linux/sched/task.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 4df2f9055587..0f2aeb37bbb0 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -120,6 +120,11 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) return t; } +static inline struct task_struct *tryget_task_struct(struct task_struct *t) +{ + return refcount_inc_not_zero(&t->usage) ? t : NULL; +} + extern void __put_task_struct(struct task_struct *t); extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); -- cgit v1.2.3 From 8195136669661fdfe54e9a8923c33b31c92fc1da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Sep 2024 10:24:59 -1000 Subject: sched_ext: Add cgroup support Add sched_ext_ops operations to init/exit cgroups, and track task migrations and config changes. A BPF scheduler may not implement or implement only subset of cgroup features. The implemented features can be indicated using %SCX_OPS_HAS_CGOUP_* flags. If cgroup configuration makes use of features that are not implemented, a warning is triggered. While a BPF scheduler is being enabled and disabled, relevant cgroup operations are locked out using scx_cgroup_rwsem. This avoids situations like task prep taking place while the task is being moved across cgroups, making things easier for BPF schedulers. v7: - cgroup interface file visibility toggling is dropped in favor just warning messages. Dynamically changing interface visiblity caused more confusion than helping. v6: - Updated to reflect the removal of SCX_KF_SLEEPABLE. - Updated to use CONFIG_GROUP_SCHED_WEIGHT and fixes for !CONFIG_FAIR_GROUP_SCHED && CONFIG_EXT_GROUP_SCHED. v5: - Flipped the locking order between scx_cgroup_rwsem and cpus_read_lock() to avoid locking order conflict w/ cpuset. Better documentation around locking. - sched_move_task() takes an early exit if the source and destination are identical. This triggered the warning in scx_cgroup_can_attach() as it left p->scx.cgrp_moving_from uncleared. Updated the cgroup migration path so that ops.cgroup_prep_move() is skipped for identity migrations so that its invocations always match ops.cgroup_move() one-to-one. v4: - Example schedulers moved into their own patches. - Fix build failure when !CONFIG_CGROUP_SCHED, reported by Andrea Righi. v3: - Make scx_example_pair switch all tasks by default. - Convert to BPF inline iterators. - scx_bpf_task_cgroup() is added to determine the current cgroup from CPU controller's POV. This allows BPF schedulers to accurately track CPU cgroup membership. - scx_example_flatcg added. This demonstrates flattened hierarchy implementation of CPU cgroup control and shows significant performance improvement when cgroups which are nested multiple levels are under competition. v2: - Build fixes for different CONFIG combinations. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Reported-by: kernel test robot Cc: Andrea Righi --- include/linux/sched/ext.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index db2a266113ac..dc646cca4fcf 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -188,6 +188,9 @@ struct sched_ext_entity { bool disallow; /* reject switching into SCX */ /* cold fields */ +#ifdef CONFIG_EXT_GROUP_SCHED + struct cgroup *cgrp_moving_from; +#endif /* must be the last field, see init_scx_entity() */ struct list_head tasks_node; }; -- cgit v1.2.3 From 4e893545ef8712d25f3176790ebb95beb073637e Mon Sep 17 00:00:00 2001 From: Mariusz Tkaczyk Date: Wed, 4 Sep 2024 12:48:47 +0200 Subject: PCI/NPEM: Add Native PCIe Enclosure Management support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Native PCIe Enclosure Management (NPEM, PCIe r6.1 sec 6.28) allows managing LEDs in storage enclosures. NPEM is indication oriented and it does not give direct access to LEDs. Although each indication *could* represent an individual LED, multiple indications could also be represented as a single, multi-color LED or a single LED blinking in a specific interval. The specification leaves that open. Each enabled indication (capability register bit on) is represented as a ledclass_dev which can be controlled through sysfs. For every ledclass device only 2 brightness states are allowed: LED_ON (1) or LED_OFF (0). This corresponds to the NPEM control register (Indication bit on/off). Ledclass devices appear in sysfs as child devices (subdirectory) of PCI device which has an NPEM Extended Capability and indication is enabled in NPEM capability register. For example, these are LEDs created for pcieport "10000:02:05.0" on my setup: leds/ ├── 10000:02:05.0:enclosure:fail ├── 10000:02:05.0:enclosure:locate ├── 10000:02:05.0:enclosure:ok └── 10000:02:05.0:enclosure:rebuild They can be also found in "/sys/class/leds" directory. The parent PCIe device domain/bus/device/function address is used to guarantee uniqueness across leds subsystem. To enable/disable a "fail" indication, the "brightness" file can be edited: echo 1 > ./leds/10000:02:05.0:enclosure:fail/brightness echo 0 > ./leds/10000:02:05.0:enclosure:fail/brightness PCIe r6.1, sec 7.9.19.2 defines the possible indications. Multiple indications for same parent PCIe device can conflict and hardware may update them when processing new request. To avoid issues, driver refresh all indications by reading back control register. This driver expects to be the exclusive NPEM extended capability manager. It waits up to 1 second after imposing new request, it doesn't verify if controller is busy before write, and it assumes the mutex lock gives protection from concurrent updates. If _DSM LED management is available, we assume the platform may be using NPEM for its own purposes (see PCI Firmware Spec r3.3 sec 4.7), so the driver does not use NPEM. A future patch will add _DSM support; an info message notes whether NPEM or _DSM is being used. NPEM is a PCIe extended capability so it should be registered in pcie_init_capabilities() but it is not possible due to LED dependency. The parent pci_device must be added earlier for led_classdev_register() to be successful. NPEM does not require configuration on kernel side, so it is safe to register LED devices later. Link: https://lore.kernel.org/r/20240904104848.23480-3-mariusz.tkaczyk@linux.intel.com Suggested-by: Lukas Wunner Signed-off-by: Mariusz Tkaczyk Signed-off-by: Bjorn Helgaas Tested-by: Stuart Hayes Reviewed-by: Christoph Hellwig Reviewed-by: Ilpo Järvinen --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4cf89a4b4cbc..c9db853e269f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -516,6 +516,9 @@ struct pci_dev { #endif #ifdef CONFIG_PCI_DOE struct xarray doe_mbs; /* Data Object Exchange mailboxes */ +#endif +#ifdef CONFIG_PCI_NPEM + struct npem *npem; /* Native PCIe Enclosure Management */ #endif u16 acs_cap; /* ACS Capability offset */ phys_addr_t rom; /* Physical address if not from BAR */ -- cgit v1.2.3 From 73425800ac948cb78063b897a0c345d788f3594d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 5 Jun 2024 11:26:24 +0100 Subject: netfs, cifs: Move CIFS_INO_MODIFIED_ATTR to netfs_inode Move CIFS_INO_MODIFIED_ATTR to netfs_inode as NETFS_ICTX_MODIFIED_ATTR and then make netfs_perform_write() set it. This means that cifs doesn't need to implement the ->post_modify() hook. Signed-off-by: David Howells cc: Jeff Layton cc: Steve French cc: Paulo Alcantara cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-7-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c47443e7a97e..71c748c53a1f 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -73,6 +73,7 @@ struct netfs_inode { #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ +#define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ }; /* -- cgit v1.2.3 From 52d55922e0f1db1f580c9f91c174d2392bfad481 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Jun 2024 09:02:58 +0100 Subject: netfs: Move max_len/max_nr_segs from netfs_io_subrequest to netfs_io_stream Move max_len/max_nr_segs from struct netfs_io_subrequest to struct netfs_io_stream as we only issue one subreq at a time and then don't need these values again for that subreq unless and until we have to retry it - in which case we want to renegotiate them. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-8-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 71c748c53a1f..eea5ea2842f6 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -134,6 +134,8 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio) struct netfs_io_stream { /* Submission tracking */ struct netfs_io_subrequest *construct; /* Op being constructed */ + size_t sreq_max_len; /* Maximum size of a subrequest */ + unsigned int sreq_max_segs; /* 0 or max number of segments in an iterator */ unsigned int submit_off; /* Folio offset we're submitting from */ unsigned int submit_len; /* Amount of data left to submit */ unsigned int submit_max_len; /* Amount I/O can be rounded up to */ @@ -177,14 +179,12 @@ struct netfs_io_subrequest { struct list_head rreq_link; /* Link in rreq->subrequests */ struct iov_iter io_iter; /* Iterator for this subrequest */ unsigned long long start; /* Where to start the I/O */ - size_t max_len; /* Maximum size of the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ - unsigned int max_nr_segs; /* 0 or max number of segments in an iterator */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned long flags; -- cgit v1.2.3 From 51e9a86a4f75c6dc8531407b2ab2ccc4ad137e5a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Jul 2024 09:45:46 +0100 Subject: netfs: Reserve netfs_sreq_source 0 as unset/unknown Reserve the 0-valued netfs_sreq_source to mean unset or unknown so that it can be seen in the trace as such rather than appearing as download-from-server when it's going to get switched to something else. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-9-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index eea5ea2842f6..37f8ce9c0284 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -43,6 +43,7 @@ static inline void folio_start_private_2(struct folio *folio) #define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ enum netfs_io_source { + NETFS_SOURCE_UNKNOWN, NETFS_FILL_WITH_ZEROES, NETFS_DOWNLOAD_FROM_SERVER, NETFS_READ_FROM_CACHE, -- cgit v1.2.3 From c57de2a9259d7dc18a7a425fca91c77502263d8a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 9 Jul 2024 09:43:38 +0100 Subject: netfs: Remove NETFS_COPY_TO_CACHE Remove NETFS_COPY_TO_CACHE as it isn't used anymore. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-10-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 37f8ce9c0284..63ab2c775a21 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -207,11 +207,10 @@ enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ - NETFS_COPY_TO_CACHE, /* This write is to copy a read to the cache */ + NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ - NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_DIO_WRITE, /* This is a direct I/O write */ nr__netfs_io_origin } __mode(byte); -- cgit v1.2.3 From f9ecc2febf6fd6ad53208a1c0e1b5066ee65dd8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 12 Jul 2024 19:18:20 +0200 Subject: pwm: Don't export pwm_capture() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is only a single caller of this function, and that's in drivers/pwm/core.c itself. So don't export the function. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240712171821.1470833-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index f8c2dc12dbd3..8acd60b53f58 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -394,9 +394,6 @@ static inline bool pwm_might_sleep(struct pwm_device *pwm) } /* PWM provider APIs */ -int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result, - unsigned long timeout); - void pwmchip_put(struct pwm_chip *chip); struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t sizeof_priv); struct pwm_chip *devm_pwmchip_alloc(struct device *parent, unsigned int npwm, size_t sizeof_priv); @@ -462,13 +459,6 @@ static inline void pwm_disable(struct pwm_device *pwm) might_sleep(); } -static inline int pwm_capture(struct pwm_device *pwm, - struct pwm_capture *result, - unsigned long timeout) -{ - return -EINVAL; -} - static inline void pwmchip_put(struct pwm_chip *chip) { } -- cgit v1.2.3 From 4356d575ef0f39a3e8e0ce0c40d84ce900ac3b61 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 28 Aug 2024 20:19:43 +1000 Subject: fhandle: expose u64 mount id to name_to_handle_at(2) Now that we provide a unique 64-bit mount ID interface in statx(2), we can now provide a race-free way for name_to_handle_at(2) to provide a file handle and corresponding mount without needing to worry about racing with /proc/mountinfo parsing or having to open a file just to do statx(2). While this is not necessary if you are using AT_EMPTY_PATH and don't care about an extra statx(2) call, users that pass full paths into name_to_handle_at(2) need to know which mount the file handle comes from (to make sure they don't try to open_by_handle_at a file handle from a different filesystem) and switching to AT_EMPTY_PATH would require allocating a file for every name_to_handle_at(2) call, turning err = name_to_handle_at(-EBADF, "/foo/bar/baz", &handle, &mntid, AT_HANDLE_MNT_ID_UNIQUE); into int fd = openat(-EBADF, "/foo/bar/baz", O_PATH | O_CLOEXEC); err1 = name_to_handle_at(fd, "", &handle, &unused_mntid, AT_EMPTY_PATH); err2 = statx(fd, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &statxbuf); mntid = statxbuf.stx_mnt_id; close(fd); Reviewed-by: Jeff Layton Signed-off-by: Aleksa Sarai Link: https://lore.kernel.org/r/20240828-exportfs-u64-mount-id-v3-2-10c2c4c16708@cyphar.com Reviewed-by: Jan Kara Reviewed-by: Josef Bacik Signed-off-by: Christian Brauner --- include/linux/syscalls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 4bcf6754738d..5758104921e6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -870,7 +870,7 @@ asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, #endif asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name, struct file_handle __user *handle, - int __user *mnt_id, int flag); + void __user *mnt_id, int flag); asmlinkage long sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); -- cgit v1.2.3 From 19156263cb1f24128a9ba6ef7340be5cbacc3d22 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 5 Sep 2024 10:14:05 +0300 Subject: dma-mapping: use IOMMU DMA calls for common alloc/free page calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Common alloca and free pages routines are called when IOMMU DMA is used, and internally it calls to DMA ops structure which is not available for default IOMMU. This patch adds necessary if checks to call IOMMU DMA. It fixes the following crash: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000040 Mem abort info: ESR = 0x0000000096000006 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x06: level 2 translation fault Data abort info: ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000 CM = 0, WnR = 0, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=00000000d20bb000 [0000000000000040] pgd=08000000d20c1003 , p4d=08000000d20c1003 , pud=08000000d20c2003, pmd=0000000000000000 Internal error: Oops: 0000000096000006 [#1] PREEMPT SMP Modules linked in: ipv6 hci_uart venus_core btqca v4l2_mem2mem btrtl qcom_spmi_adc5 sbs_battery btbcm qcom_vadc_common cros_ec_typec videobuf2_v4l2 leds_cros_ec cros_kbd_led_backlight cros_ec_chardev videodev elan_i2c videobuf2_common qcom_stats mc bluetooth coresight_stm stm_core ecdh_generic ecc pwrseq_core panel_edp icc_bwmon ath10k_snoc ath10k_core ath mac80211 phy_qcom_qmp_combo aux_bridge libarc4 coresight_replicator coresight_etm4x coresight_tmc coresight_funnel cfg80211 rfkill coresight qcom_wdt cbmem ramoops reed_solomon pwm_bl coreboot_table backlight crct10dif_ce CPU: 7 UID: 0 PID: 70 Comm: kworker/u32:4 Not tainted 6.11.0-rc6-next-20240903-00003-gdfc6015d0711 #660 Hardware name: Google Lazor Limozeen without Touchscreen (rev5 - rev8) (DT) Workqueue: events_unbound deferred_probe_work_func hub 2-1:1.0: 4 ports detected pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : dma_common_alloc_pages+0x54/0x1b4 lr : dma_common_alloc_pages+0x4c/0x1b4 sp : ffff8000807d3730 x29: ffff8000807d3730 x28: ffff02a7d312f880 x27: 0000000000000001 x26: 000000000000c000 x25: 0000000000000000 x24: 0000000000000001 x23: ffff02a7d23b6898 x22: 0000000000006cc0 x21: 000000000000c000 x20: ffff02a7858bf410 x19: fffffe0a60006000 x18: 0000000000000001 x17: 00000000000000d5 x16: 1fffe054f0bcc261 x15: 0000000000000001 x14: ffff02a7844dc680 x13: 0000000000100180 x12: dead000000000100 x11: dead000000000122 x10: 00000000001001ff x9 : ffff02a87f7b7b00 x8 : ffff02a87f7b7b00 x7 : ffff405977d6b000 x6 : ffff8000807d3310 x5 : ffff02a87f6b6398 x4 : 0000000000000001 x3 : ffff405977d6b000 x2 : ffff02a7844dc600 x1 : 0000000100000000 x0 : fffffe0a60006000 Call trace: dma_common_alloc_pages+0x54/0x1b4 __dma_alloc_pages+0x68/0x90 dma_alloc_pages+0x10/0x1c snd_dma_noncoherent_alloc+0x28/0x8c __snd_dma_alloc_pages+0x30/0x50 snd_dma_alloc_dir_pages+0x40/0x80 do_alloc_pages+0xb8/0x13c preallocate_pcm_pages+0x6c/0xf8 preallocate_pages+0x160/0x1a4 snd_pcm_set_managed_buffer_all+0x64/0xb0 lpass_platform_pcm_new+0xc0/0xe8 snd_soc_pcm_component_new+0x3c/0xc8 soc_new_pcm+0x4fc/0x668 snd_soc_bind_card+0xabc/0xbac snd_soc_register_card+0xf0/0x108 devm_snd_soc_register_card+0x4c/0xa4 sc7180_snd_platform_probe+0x180/0x224 platform_probe+0x68/0xc0 really_probe+0xbc/0x298 __driver_probe_device+0x78/0x12c driver_probe_device+0x3c/0x15c __device_attach_driver+0xb8/0x134 bus_for_each_drv+0x84/0xe0 __device_attach+0x9c/0x188 device_initial_probe+0x14/0x20 bus_probe_device+0xac/0xb0 deferred_probe_work_func+0x88/0xc0 process_one_work+0x14c/0x28c worker_thread+0x2cc/0x3d4 kthread+0x114/0x118 ret_from_fork+0x10/0x20 Code: f9411c19 940000c9 aa0003f3 b4000460 (f9402326) ---[ end trace 0000000000000000 ]--- Fixes: b5c58b2fdc42 ("dma-mapping: direct calls for dma-iommu") Closes: https://lore.kernel.org/all/10431dfd-ce04-4e0f-973b-c78477303c18@notapiano Reported-by: Nícolas F. R. A. Prado #KernelCI Signed-off-by: Leon Romanovsky Tested-by: Nícolas F. R. A. Prado Signed-off-by: Christoph Hellwig --- include/linux/iommu-dma.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index d30a58bf00fd..1bb55ca1ab79 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -10,6 +10,10 @@ #include #ifdef CONFIG_IOMMU_DMA +static inline bool use_dma_iommu(struct device *dev) +{ + return dev->dma_iommu; +} dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); @@ -49,6 +53,10 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir); #else +static inline bool use_dma_iommu(struct device *dev) +{ + return false; +} static inline dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) -- cgit v1.2.3 From 59da880afed211c989ef65da577b24215ce57774 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 3 Sep 2024 10:45:58 -0700 Subject: uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks It serves no purpose beyond adding unnecessray argument passed to the filter callback. Just get rid of it, no one is actually using it. Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-4-andrii@kernel.org --- include/linux/uprobes.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f50df1fa93e7..63ae2ade3487 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -28,20 +28,12 @@ struct page; #define MAX_URETPROBE_DEPTH 64 -enum uprobe_filter_ctx { - UPROBE_FILTER_REGISTER, - UPROBE_FILTER_UNREGISTER, - UPROBE_FILTER_MMAP, -}; - struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); int (*ret_handler)(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs); - bool (*filter)(struct uprobe_consumer *self, - enum uprobe_filter_ctx ctx, - struct mm_struct *mm); + bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); struct uprobe_consumer *next; }; -- cgit v1.2.3 From cc01bd044e6a521d2cd128f685ee8d23ef0067f2 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 3 Sep 2024 10:45:59 -0700 Subject: uprobes: travers uprobe's consumer list locklessly under SRCU protection uprobe->register_rwsem is one of a few big bottlenecks to scalability of uprobes, so we need to get rid of it to improve uprobe performance and multi-CPU scalability. First, we turn uprobe's consumer list to a typical doubly-linked list and utilize existing RCU-aware helpers for traversing such lists, as well as adding and removing elements from it. For entry uprobes we already have SRCU protection active since before uprobe lookup. For uretprobe we keep refcount, guaranteeing that uprobe won't go away from under us, but we add SRCU protection around consumer list traversal. Lastly, to keep handler_chain()'s UPROBE_HANDLER_REMOVE handling simple, we remember whether any removal was requested during handler calls, but then we double-check the decision under a proper register_rwsem using consumers' filter callbacks. Handler removal is very rare, so this extra lock won't hurt performance, overall, but we also avoid the need for any extra protection (e.g., seqcount locks). Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-5-andrii@kernel.org --- include/linux/uprobes.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 63ae2ade3487..f112b56e9479 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -29,13 +29,21 @@ struct page; #define MAX_URETPROBE_DEPTH 64 struct uprobe_consumer { + /* + * handler() can return UPROBE_HANDLER_REMOVE to signal the need to + * unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is + * returned, filter() callback has to be implemented as well and it + * should return false to "confirm" the decision to uninstall uprobe + * for the current process. If filter() is omitted or returns true, + * UPROBE_HANDLER_REMOVE is effectively ignored. + */ int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); int (*ret_handler)(struct uprobe_consumer *self, unsigned long func, struct pt_regs *regs); bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); - struct uprobe_consumer *next; + struct list_head cons_node; }; #ifdef CONFIG_UPROBES -- cgit v1.2.3 From 04b01625da130c7521b768996cd5e48052198b97 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Sep 2024 10:46:00 -0700 Subject: perf/uprobe: split uprobe_unregister() With uprobe_unregister() having grown a synchronize_srcu(), it becomes fairly slow to call. Esp. since both users of this API call it in a loop. Peel off the sync_srcu() and do it once, after the loop. We also need to add uprobe_unregister_sync() into uprobe_register()'s error handling path, as we need to be careful about returning to the caller before we have a guarantee that partially attached consumer won't be called anymore. This is an unlikely slow path and this should be totally fine to be slow in the case of a failed attach. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: "Peter Zijlstra (Intel)" Co-developed-by: Andrii Nakryiko Signed-off-by: Andrii Nakryiko Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240903174603.3554182-6-andrii@kernel.org --- include/linux/uprobes.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index f112b56e9479..2b294bf1881f 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -115,7 +115,8 @@ extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool); -extern void uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc); +extern void uprobe_unregister_sync(void); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void uprobe_start_dup_mmap(void); @@ -164,7 +165,10 @@ uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add) return -ENOSYS; } static inline void -uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc) +uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc) +{ +} +static inline void uprobe_unregister_sync(void) { } static inline int uprobe_mmap(struct vm_area_struct *vma) -- cgit v1.2.3 From 50a38035ed5ccc2ab8a28eaf70c3c7a87e060345 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 3 Sep 2024 10:46:01 -0700 Subject: rbtree: provide rb_find_rcu() / rb_find_add_rcu() Much like latch_tree, add two RCU methods for the regular RB-tree, which can be used in conjunction with a seqcount to provide lockless lookups. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: "Peter Zijlstra (Intel)" Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Reviewed-by: "Masami Hiramatsu (Google)" Link: https://lore.kernel.org/r/20240903174603.3554182-7-andrii@kernel.org --- include/linux/rbtree.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index f7edca369eda..7c173aa64e1e 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -244,6 +244,42 @@ rb_find_add(struct rb_node *node, struct rb_root *tree, return NULL; } +/** + * rb_find_add_rcu() - find equivalent @node in @tree, or add @node + * @node: node to look-for / insert + * @tree: tree to search / modify + * @cmp: operator defining the node order + * + * Adds a Store-Release for link_node. + * + * Returns the rb_node matching @node, or NULL when no match is found and @node + * is inserted. + */ +static __always_inline struct rb_node * +rb_find_add_rcu(struct rb_node *node, struct rb_root *tree, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node **link = &tree->rb_node; + struct rb_node *parent = NULL; + int c; + + while (*link) { + parent = *link; + c = cmp(node, parent); + + if (c < 0) + link = &parent->rb_left; + else if (c > 0) + link = &parent->rb_right; + else + return parent; + } + + rb_link_node_rcu(node, parent, link); + rb_insert_color(node, tree); + return NULL; +} + /** * rb_find() - find @key in tree @tree * @key: key to match @@ -272,6 +308,37 @@ rb_find(const void *key, const struct rb_root *tree, return NULL; } +/** + * rb_find_rcu() - find @key in tree @tree + * @key: key to match + * @tree: tree to search + * @cmp: operator defining the node order + * + * Notably, tree descent vs concurrent tree rotations is unsound and can result + * in false-negatives. + * + * Returns the rb_node matching @key or NULL. + */ +static __always_inline struct rb_node * +rb_find_rcu(const void *key, const struct rb_root *tree, + int (*cmp)(const void *key, const struct rb_node *)) +{ + struct rb_node *node = tree->rb_node; + + while (node) { + int c = cmp(key, node); + + if (c < 0) + node = rcu_dereference_raw(node->rb_left); + else if (c > 0) + node = rcu_dereference_raw(node->rb_right); + else + return node; + } + + return NULL; +} + /** * rb_find_first() - find the first @key in @tree * @key: key to match -- cgit v1.2.3 From e04e2b760ddbe3d7b283a05898c3a029085cd8cd Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 1 Sep 2024 05:10:52 +0200 Subject: platform/x86: wmi: Pass event data directly to legacy notify handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current legacy WMI handlers are susceptible to picking up wrong WMI event data on systems where different WMI devices share some notification IDs. Prevent this by letting the WMI driver core taking care of retrieving the event data. This also simplifies the legacy WMI handlers and their implementation inside the WMI driver core. Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20240901031055.3030-3-W_Armin@gmx.de Signed-off-by: Hans de Goede --- include/linux/acpi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 0687a442fec7..eed105b1fbfb 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -386,7 +386,7 @@ extern bool acpi_is_pnp_device(struct acpi_device *); #if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE) -typedef void (*wmi_notify_handler) (u32 value, void *context); +typedef void (*wmi_notify_handler) (union acpi_object *data, void *context); int wmi_instance_count(const char *guid); -- cgit v1.2.3 From 79a56f4c8fa629ac79ada8f4b746195bd91f09c7 Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Sun, 1 Sep 2024 05:10:53 +0200 Subject: platform/x86: wmi: Remove wmi_get_event_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the WMI driver core now takes care of retrieving the WMI event data even for legacy WMI notify handlers, this function is no longer used. Remove it to prevent WMI drivers from messing up the ACPI firmware on some machines. Reviewed-by: Ilpo Järvinen Reviewed-by: Hans de Goede Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20240901031055.3030-4-W_Armin@gmx.de Signed-off-by: Hans de Goede --- include/linux/acpi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index eed105b1fbfb..3cbe4b57bc73 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -401,7 +401,6 @@ extern acpi_status wmi_set_block(const char *guid, u8 instance, extern acpi_status wmi_install_notify_handler(const char *guid, wmi_notify_handler handler, void *data); extern acpi_status wmi_remove_notify_handler(const char *guid); -extern acpi_status wmi_get_event_data(u32 event, struct acpi_buffer *out); extern bool wmi_has_guid(const char *guid); extern char *wmi_get_acpi_device_uid(const char *guid); -- cgit v1.2.3 From 3e6a7e3cda773adacc2fa4b9623d0a8c0f904d50 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 27 Aug 2024 09:59:38 -0700 Subject: iommufd: Reorder struct forward declarations Reorder struct forward declarations to alphabetic order to simplify maintenance, as upcoming patches will add more to the list. No functional change intended. Link: https://patch.msgid.link/r/c5dd87100f6f01389b838c63237e28c5dd373358.1724776335.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe --- include/linux/iommufd.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index c2f2f6b9148e..30f832a60ccb 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -11,12 +11,12 @@ #include struct device; -struct iommufd_device; -struct page; -struct iommufd_ctx; -struct iommufd_access; struct file; struct iommu_group; +struct iommufd_access; +struct iommufd_ctx; +struct iommufd_device; +struct page; struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); -- cgit v1.2.3 From 6ff4cd1160afafc12ad1603e3d2f39256e4b708d Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 27 Aug 2024 10:45:15 +0800 Subject: lib/string_choices: Add str_true_false()/str_false_true() helper Add str_true_false()/str_false_true() helper to return "true" or "false" string literal. Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240827024517.914100-2-lihongbo22@huawei.com Signed-off-by: Kees Cook --- include/linux/string_choices.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index 1320bcdcb89c..ebcc56b28ede 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -48,6 +48,12 @@ static inline const char *str_up_down(bool v) } #define str_down_up(v) str_up_down(!(v)) +static inline const char *str_true_false(bool v) +{ + return v ? "true" : "false"; +} +#define str_false_true(v) str_true_false(!(v)) + /** * str_plural - Return the simple pluralization based on English counts * @num: Number used for deciding pluralization -- cgit v1.2.3 From c2708ba91c3c1fba424b77de83b6fc45cbf38c46 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 5 Sep 2024 17:25:39 +0800 Subject: lib/string_choices: Introduce several opposite string choice helpers Similar to the exists helper: str_enable_disable/ str_enabled_disabled/str_on_off/str_yes_no helpers, we can add the opposite helpers. That's str_disable_enable, str_disabled_enabled, str_off_on and str_no_yes. There are more than 10 cases currently (expect str_disable_enable now has 3 use cases) exist in the code can be replaced with these helper. Signed-off-by: Hongbo Li Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240905092540.2962122-2-lihongbo22@huawei.com Signed-off-by: Kees Cook --- include/linux/string_choices.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index ebcc56b28ede..fd067992260a 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -8,11 +8,13 @@ static inline const char *str_enable_disable(bool v) { return v ? "enable" : "disable"; } +#define str_disable_enable(v) str_enable_disable(!(v)) static inline const char *str_enabled_disabled(bool v) { return v ? "enabled" : "disabled"; } +#define str_disabled_enabled(v) str_enabled_disabled(!(v)) static inline const char *str_hi_lo(bool v) { @@ -36,11 +38,13 @@ static inline const char *str_on_off(bool v) { return v ? "on" : "off"; } +#define str_off_on(v) str_on_off(!(v)) static inline const char *str_yes_no(bool v) { return v ? "yes" : "no"; } +#define str_no_yes(v) str_yes_no(!(v)) static inline const char *str_up_down(bool v) { -- cgit v1.2.3 From c121d5cc3a993cdbfab46a152bdd50227a4d5e8c Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 5 Sep 2024 17:25:40 +0800 Subject: lib/string_choices: Add some comments to make more clear for string choices helpers. Add some comments to explain why we should use string_choices helpers. Signed-off-by: Hongbo Li Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240905092540.2962122-3-lihongbo22@huawei.com Signed-off-by: Kees Cook --- include/linux/string_choices.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h index fd067992260a..120ca0f28e95 100644 --- a/include/linux/string_choices.h +++ b/include/linux/string_choices.h @@ -2,6 +2,19 @@ #ifndef _LINUX_STRING_CHOICES_H_ #define _LINUX_STRING_CHOICES_H_ +/* + * Here provide a series of helpers in the str_$TRUE_$FALSE format (you can + * also expand some helpers as needed), where $TRUE and $FALSE are their + * corresponding literal strings. These helpers can be used in the printing + * and also in other places where constant strings are required. Using these + * helpers offers the following benefits: + * 1) Reducing the hardcoding of strings, which makes the code more elegant + * through these simple literal-meaning helpers. + * 2) Unifying the output, which prevents the same string from being printed + * in various forms, such as enable/disable, enabled/disabled, en/dis. + * 3) Deduping by the linker, which results in a smaller binary file. + */ + #include static inline const char *str_enable_disable(bool v) -- cgit v1.2.3 From 1ae497c78f01855f3695b58481311ffdd429b028 Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Thu, 5 Sep 2024 13:52:32 +0800 Subject: bpf: use type_may_be_null() helper for nullable-param check Commit 980ca8ceeae6 ("bpf: check bpf_dummy_struct_ops program params for test runs") does bitwise AND between reg_type and PTR_MAYBE_NULL, which is correct, but due to type difference the compiler complains: net/bpf/bpf_dummy_struct_ops.c:118:31: warning: bitwise operation between different enumeration types ('const enum bpf_reg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion] 118 | if (info && (info->reg_type & PTR_MAYBE_NULL)) | ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~ Workaround the warning by moving the type_may_be_null() helper from verifier.c into bpf_verifier.h, and reuse it here to check whether param is nullable. Fixes: 980ca8ceeae6 ("bpf: check bpf_dummy_struct_ops program params for test runs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404241956.HEiRYwWq-lkp@intel.com/ Signed-off-by: Shung-Hsi Yu Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20240905055233.70203-1-shung-hsi.yu@suse.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8458632824a4..4513372c5bc8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -927,6 +927,11 @@ static inline bool type_is_sk_pointer(enum bpf_reg_type type) type == PTR_TO_XDP_SOCK; } +static inline bool type_may_be_null(u32 type) +{ + return type & PTR_MAYBE_NULL; +} + static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) { env->scratched_regs |= 1U << regno; -- cgit v1.2.3 From 706ae6446494b4523d33b0d96ea1fd9d50ca9be6 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Wed, 4 Sep 2024 14:41:07 +0300 Subject: clk: fixed-rate: add devm_clk_hw_register_fixed_rate_parent_data() Add devm_clk_hw_register_fixed_rate_parent_data(), devres-managed helper to register fixed-rate clock with parent_data. Signed-off-by: Nikita Shubin Link: https://lore.kernel.org/r/20240904-devm_clk_hw_register_fixed_rate_parent_data-v1-1-7f14d6b456e5@maquefel.me Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 4a537260f655..7e43caabb54b 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -393,6 +393,20 @@ struct clk *clk_register_fixed_rate(struct device *dev, const char *name, #define devm_clk_hw_register_fixed_rate(dev, name, parent_name, flags, fixed_rate) \ __clk_hw_register_fixed_rate((dev), NULL, (name), (parent_name), NULL, \ NULL, (flags), (fixed_rate), 0, 0, true) +/** + * devm_clk_hw_register_fixed_rate_parent_data - register fixed-rate clock with + * the clock framework + * @dev: device that is registering this clock + * @name: name of this clock + * @parent_data: parent clk data + * @flags: framework-specific flags + * @fixed_rate: non-adjustable clock rate + */ +#define devm_clk_hw_register_fixed_rate_parent_data(dev, name, parent_data, flags, \ + fixed_rate) \ + __clk_hw_register_fixed_rate((dev), NULL, (name), NULL, NULL, \ + (parent_data), (flags), (fixed_rate), 0, \ + 0, true) /** * clk_hw_register_fixed_rate_parent_hw - register fixed-rate clock with * the clock framework -- cgit v1.2.3 From 9934a1bd45b2b03f6d1204a6ae2780d3b009799f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 5 Aug 2024 10:57:31 +0200 Subject: clk: provide devm_clk_get_optional_enabled_with_rate() There are clock users in the kernel that can't use devm_clk_get_optional_enabled() as they need to set rate after getting the clock and before enabling it. Provide a managed helper that wraps these operations in the correct order. Signed-off-by: Bartosz Golaszewski Link: https://lore.kernel.org/r/20240805-clk-new-helper-v2-1-e5fdd1e1d729@linaro.org Signed-off-by: Stephen Boyd --- include/linux/clk.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk.h b/include/linux/clk.h index 0fa56d672532..851a0f2cf42c 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -640,6 +640,32 @@ struct clk *devm_clk_get_optional_prepared(struct device *dev, const char *id); */ struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id); +/** + * devm_clk_get_optional_enabled_with_rate - devm_clk_get_optional() + + * clk_set_rate() + + * clk_prepare_enable() + * @dev: device for clock "consumer" + * @id: clock consumer ID + * @rate: new clock rate + * + * Context: May sleep. + * + * Return: a struct clk corresponding to the clock producer, or + * valid IS_ERR() condition containing errno. The implementation + * uses @dev and @id to determine the clock consumer, and thereby + * the clock producer. If no such clk is found, it returns NULL + * which serves as a dummy clk. That's the only difference compared + * to devm_clk_get_enabled(). + * + * The returned clk (if valid) is prepared and enabled and rate was set. + * + * The clock will automatically be disabled, unprepared and freed + * when the device is unbound from the bus. + */ +struct clk *devm_clk_get_optional_enabled_with_rate(struct device *dev, + const char *id, + unsigned long rate); + /** * devm_get_clk_from_child - lookup and obtain a managed reference to a * clock producer from child node. @@ -982,6 +1008,13 @@ static inline struct clk *devm_clk_get_optional_enabled(struct device *dev, return NULL; } +static inline struct clk * +devm_clk_get_optional_enabled_with_rate(struct device *dev, const char *id, + unsigned long rate) +{ + return NULL; +} + static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, struct clk_bulk_data *clks) { -- cgit v1.2.3 From 08062af0a52107a243f7608fd972edb54ca5b7f8 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 4 Sep 2024 15:34:30 +0000 Subject: net: napi: Prevent overflow of napi_defer_hard_irqs In commit 6f8b12d661d0 ("net: napi: add hard irqs deferral feature") napi_defer_irqs was added to net_device and napi_defer_irqs_count was added to napi_struct, both as type int. This value never goes below zero, so there is not reason for it to be a signed int. Change the type for both from int to u32, and add an overflow check to sysfs to limit the value to S32_MAX. The limit of S32_MAX was chosen because the practical limit before this patch was S32_MAX (anything larger was an overflow) and thus there are no behavioral changes introduced. If the extra bit is needed in the future, the limit can be raised. Before this patch: $ sudo bash -c 'echo 2147483649 > /sys/class/net/eth4/napi_defer_hard_irqs' $ cat /sys/class/net/eth4/napi_defer_hard_irqs -2147483647 After this patch: $ sudo bash -c 'echo 2147483649 > /sys/class/net/eth4/napi_defer_hard_irqs' bash: line 0: echo: write error: Numerical result out of range Similarly, /sys/class/net/XXXXX/tx_queue_len is defined as unsigned: include/linux/netdevice.h: unsigned int tx_queue_len; And has an overflow check: dev_change_tx_queue_len(..., unsigned long new_len): if (new_len != (unsigned int)new_len) return -ERANGE; Suggested-by: Jakub Kicinski Signed-off-by: Joe Damato Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240904153431.307932-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ca5f0dda733b..b47c00657bd0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -356,7 +356,7 @@ struct napi_struct { unsigned long state; int weight; - int defer_hard_irqs_count; + u32 defer_hard_irqs_count; unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL @@ -2075,7 +2075,7 @@ struct net_device { unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; unsigned long gro_flush_timeout; - int napi_defer_hard_irqs; + u32 napi_defer_hard_irqs; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; -- cgit v1.2.3 From de35996d4b364749194c5b473d1912578880833e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sun, 4 Aug 2024 18:47:08 -0700 Subject: Input: matrix_keypad - remove support for platform data There are no more users of struct matrix_keypad_platform_data in the kernel, remove support for it from the driver. Acked-by: Arnd Bergmann Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20240805014710.1961677-6-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- include/linux/input/matrix_keypad.h | 48 ------------------------------------- 1 file changed, 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h index b8d8d69eba29..90867f44ab4d 100644 --- a/include/linux/input/matrix_keypad.h +++ b/include/linux/input/matrix_keypad.h @@ -34,52 +34,6 @@ struct matrix_keymap_data { unsigned int keymap_size; }; -/** - * struct matrix_keypad_platform_data - platform-dependent keypad data - * @keymap_data: pointer to &matrix_keymap_data - * @row_gpios: pointer to array of gpio numbers representing rows - * @col_gpios: pointer to array of gpio numbers reporesenting colums - * @num_row_gpios: actual number of row gpios used by device - * @num_col_gpios: actual number of col gpios used by device - * @col_scan_delay_us: delay, measured in microseconds, that is - * needed before we can keypad after activating column gpio - * @debounce_ms: debounce interval in milliseconds - * @clustered_irq: may be specified if interrupts of all row/column GPIOs - * are bundled to one single irq - * @clustered_irq_flags: flags that are needed for the clustered irq - * @active_low: gpio polarity - * @wakeup: controls whether the device should be set up as wakeup - * source - * @no_autorepeat: disable key autorepeat - * @drive_inactive_cols: drive inactive columns during scan, rather than - * making them inputs. - * - * This structure represents platform-specific data that use used by - * matrix_keypad driver to perform proper initialization. - */ -struct matrix_keypad_platform_data { - const struct matrix_keymap_data *keymap_data; - - const unsigned int *row_gpios; - const unsigned int *col_gpios; - - unsigned int num_row_gpios; - unsigned int num_col_gpios; - - unsigned int col_scan_delay_us; - - /* key debounce interval in milli-second */ - unsigned int debounce_ms; - - unsigned int clustered_irq; - unsigned int clustered_irq_flags; - - bool active_low; - bool wakeup; - bool no_autorepeat; - bool drive_inactive_cols; -}; - int matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data, const char *keymap_name, unsigned int rows, unsigned int cols, @@ -88,6 +42,4 @@ int matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data, int matrix_keypad_parse_properties(struct device *dev, unsigned int *rows, unsigned int *cols); -#define matrix_keypad_parse_of_params matrix_keypad_parse_properties - #endif /* _MATRIX_KEYPAD_H */ -- cgit v1.2.3 From f820b43754cdbd078a0d17122b281f42cbcd2155 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 23 Aug 2024 22:50:27 -0700 Subject: Input: zforce_ts - remove support for platfrom data There are no in-tree users of platform data and any new ones should either use device tree or static device properties, so let's remove platform data support. Tested-by: Andreas Kemnade # Tolino Shine2HD Link: https://lore.kernel.org/r/20240824055047.1706392-4-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- include/linux/platform_data/zforce_ts.h | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 include/linux/platform_data/zforce_ts.h (limited to 'include/linux') diff --git a/include/linux/platform_data/zforce_ts.h b/include/linux/platform_data/zforce_ts.h deleted file mode 100644 index 2463a4a856a6..000000000000 --- a/include/linux/platform_data/zforce_ts.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* drivers/input/touchscreen/zforce.c - * - * Copyright (C) 2012-2013 MundoReader S.L. - */ - -#ifndef _LINUX_INPUT_ZFORCE_TS_H -#define _LINUX_INPUT_ZFORCE_TS_H - -struct zforce_ts_platdata { - unsigned int x_max; - unsigned int y_max; -}; - -#endif /* _LINUX_INPUT_ZFORCE_TS_H */ -- cgit v1.2.3 From 23de126f9248a2f9218175ea0dd1c1403e334440 Mon Sep 17 00:00:00 2001 From: Pieter Van Trappen Date: Wed, 4 Sep 2024 08:27:42 +0200 Subject: net: dsa: microchip: replace unclear KSZ8830 strings Replace ksz8830 with ksz88x3 for CHIP_ID definition and other strings. This due to KSZ8830 not being an actual switch but the Chip ID shared among KSZ8863/8873 switches, impossible to differentiate from their Chip ID or Revision ID registers. Now all KSZ*_CHIP_ID macros refer to actual, existing switches which removes confusion. Signed-off-by: Pieter Van Trappen Acked-by: Arun Ramadoss Signed-off-by: David S. Miller --- include/linux/platform_data/microchip-ksz.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index d074019474f5..2ee1a679e592 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -27,7 +27,7 @@ enum ksz_chip_id { KSZ8795_CHIP_ID = 0x8795, KSZ8794_CHIP_ID = 0x8794, KSZ8765_CHIP_ID = 0x8765, - KSZ8830_CHIP_ID = 0x8830, + KSZ88X3_CHIP_ID = 0x8830, KSZ8864_CHIP_ID = 0x8864, KSZ8895_CHIP_ID = 0x8895, KSZ9477_CHIP_ID = 0x00947700, -- cgit v1.2.3 From 56bd72e9cd8272687aa2a8eccddabc5526cd7845 Mon Sep 17 00:00:00 2001 From: Marek Maslanka Date: Mon, 12 Aug 2024 18:41:42 +0000 Subject: clocksource: acpi_pm: Add external callback for suspend/resume Provides the capability to register an external callback for the ACPI PM timer, which is called during the suspend and resume processes. Signed-off-by: Marek Maslanka Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20240812184150.1079924-1-mmaslanka@google.com Signed-off-by: Daniel Lezcano --- include/linux/acpi_pmtmr.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h index 50d88bf1498d..0ded9220d379 100644 --- a/include/linux/acpi_pmtmr.h +++ b/include/linux/acpi_pmtmr.h @@ -26,6 +26,19 @@ static inline u32 acpi_pm_read_early(void) return acpi_pm_read_verified() & ACPI_PM_MASK; } +/** + * Register callback for suspend and resume event + * + * @cb Callback triggered on suspend and resume + * @data Data passed with the callback + */ +void acpi_pmtmr_register_suspend_resume_callback(void (*cb)(void *data, bool suspend), void *data); + +/** + * Remove registered callback for suspend and resume event + */ +void acpi_pmtmr_unregister_suspend_resume_callback(void); + #else static inline u32 acpi_pm_read_early(void) -- cgit v1.2.3 From 3f4c0ad490cc55db5b375a2e19c67159cb255795 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 26 Aug 2024 12:14:04 +0200 Subject: mtd: nand: Rename the NAND IO iteration helper Soon a helper for iterating over blocks will be needed (for continuous read purposes). In order to clarify the intend of this helper, let's rename it with the "page" wording inside. While at it, improve the doc and fix a typo. Signed-off-by: Miquel Raynal Reviewed-by: Pratyush Yadav Link: https://lore.kernel.org/linux-mtd/20240826101412.20644-2-miquel.raynal@bootlin.com --- include/linux/mtd/nand.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index b2996dc987ff..92e06ac33815 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -906,19 +906,19 @@ static inline void nanddev_pos_next_page(struct nand_device *nand, } /** - * nand_io_iter_init - Initialize a NAND I/O iterator + * nand_io_page_iter_init - Initialize a NAND I/O iterator * @nand: NAND device * @offs: absolute offset * @req: MTD request * @iter: NAND I/O iterator * * Initializes a NAND iterator based on the information passed by the MTD - * layer. + * layer for page jumps. */ -static inline void nanddev_io_iter_init(struct nand_device *nand, - enum nand_page_io_req_type reqtype, - loff_t offs, struct mtd_oob_ops *req, - struct nand_io_iter *iter) +static inline void nanddev_io_page_iter_init(struct nand_device *nand, + enum nand_page_io_req_type reqtype, + loff_t offs, struct mtd_oob_ops *req, + struct nand_io_iter *iter) { struct mtd_info *mtd = nanddev_to_mtd(nand); @@ -990,10 +990,10 @@ static inline bool nanddev_io_iter_end(struct nand_device *nand, * @req: MTD I/O request * @iter: NAND I/O iterator * - * Should be used for iterate over pages that are contained in an MTD request. + * Should be used for iterating over pages that are contained in an MTD request. */ #define nanddev_io_for_each_page(nand, type, start, req, iter) \ - for (nanddev_io_iter_init(nand, type, start, req, iter); \ + for (nanddev_io_page_iter_init(nand, type, start, req, iter); \ !nanddev_io_iter_end(nand, iter); \ nanddev_io_iter_next_page(nand, iter)) -- cgit v1.2.3 From 8adf1ac24ba8ee34b8fd36ebf159004ec472fca0 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 26 Aug 2024 12:14:05 +0200 Subject: mtd: nand: Introduce a block iterator In order to be able to iterate easily across eraseblocks rather than pages, let's introduce a block iterator inspired from the page iterator. The main usage of this iterator will be for continuous/sequential reads, where it is interesting to use a single request rather than split the requests in smaller chunks (ie. pages) that can be hardly optimized. So a "continuous" boolean get's added for this purpose. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20240826101412.20644-3-miquel.raynal@bootlin.com --- include/linux/mtd/nand.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 92e06ac33815..1e4208040956 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -103,6 +103,8 @@ enum nand_page_io_req_type { * @ooblen: the number of OOB bytes to read from/write to this page * @oobbuf: buffer to store OOB data in or get OOB data from * @mode: one of the %MTD_OPS_XXX mode + * @continuous: no need to start over the operation at the end of each page, the + * NAND device will automatically prepare the next one * * This object is used to pass per-page I/O requests to NAND sub-layers. This * way all useful information are already formatted in a useful way and @@ -125,6 +127,7 @@ struct nand_page_io_req { void *in; } oobbuf; int mode; + bool continuous; }; const struct mtd_ooblayout_ops *nand_get_small_page_ooblayout(void); @@ -937,6 +940,43 @@ static inline void nanddev_io_page_iter_init(struct nand_device *nand, iter->req.ooblen = min_t(unsigned int, iter->oobbytes_per_page - iter->req.ooboffs, iter->oobleft); + iter->req.continuous = false; +} + +/** + * nand_io_block_iter_init - Initialize a NAND I/O iterator + * @nand: NAND device + * @offs: absolute offset + * @req: MTD request + * @iter: NAND I/O iterator + * + * Initializes a NAND iterator based on the information passed by the MTD + * layer for block jumps (no OOB) + * + * In practice only reads may leverage this iterator. + */ +static inline void nanddev_io_block_iter_init(struct nand_device *nand, + enum nand_page_io_req_type reqtype, + loff_t offs, struct mtd_oob_ops *req, + struct nand_io_iter *iter) +{ + unsigned int offs_in_eb; + + iter->req.type = reqtype; + iter->req.mode = req->mode; + iter->req.dataoffs = nanddev_offs_to_pos(nand, offs, &iter->req.pos); + iter->req.ooboffs = 0; + iter->oobbytes_per_page = 0; + iter->dataleft = req->len; + iter->oobleft = 0; + iter->req.databuf.in = req->datbuf; + offs_in_eb = (nand->memorg.pagesize * iter->req.pos.page) + iter->req.dataoffs; + iter->req.datalen = min_t(unsigned int, + nanddev_eraseblock_size(nand) - offs_in_eb, + iter->dataleft); + iter->req.oobbuf.in = NULL; + iter->req.ooblen = 0; + iter->req.continuous = true; } /** @@ -962,6 +1002,25 @@ static inline void nanddev_io_iter_next_page(struct nand_device *nand, iter->oobleft); } +/** + * nand_io_iter_next_block - Move to the next block + * @nand: NAND device + * @iter: NAND I/O iterator + * + * Updates the @iter to point to the next block. + * No OOB handling available. + */ +static inline void nanddev_io_iter_next_block(struct nand_device *nand, + struct nand_io_iter *iter) +{ + nanddev_pos_next_eraseblock(nand, &iter->req.pos); + iter->dataleft -= iter->req.datalen; + iter->req.databuf.in += iter->req.datalen; + iter->req.dataoffs = 0; + iter->req.datalen = min_t(unsigned int, nanddev_eraseblock_size(nand), + iter->dataleft); +} + /** * nand_io_iter_end - Should end iteration or not * @nand: NAND device @@ -997,6 +1056,21 @@ static inline bool nanddev_io_iter_end(struct nand_device *nand, !nanddev_io_iter_end(nand, iter); \ nanddev_io_iter_next_page(nand, iter)) +/** + * nand_io_for_each_block - Iterate over all NAND pages contained in an MTD I/O + * request, one block at a time + * @nand: NAND device + * @start: start address to read/write from + * @req: MTD I/O request + * @iter: NAND I/O iterator + * + * Should be used for iterating over blocks that are contained in an MTD request. + */ +#define nanddev_io_for_each_block(nand, type, start, req, iter) \ + for (nanddev_io_block_iter_init(nand, type, start, req, iter); \ + !nanddev_io_iter_end(nand, iter); \ + nanddev_io_iter_next_block(nand, iter)) + bool nanddev_isbad(struct nand_device *nand, const struct nand_pos *pos); bool nanddev_isreserved(struct nand_device *nand, const struct nand_pos *pos); int nanddev_markbad(struct nand_device *nand, const struct nand_pos *pos); -- cgit v1.2.3 From 631cfdd0520d19b7f4fc13b834fd9c8b46c6dbac Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 26 Aug 2024 12:14:07 +0200 Subject: mtd: spi-nand: Add continuous read support A regular page read consist in: - Asking one page of content from the NAND array to be loaded in the chip's SRAM, - Waiting for the operation to be done, - Retrieving the data (I/O phase) from the chip's SRAM. When reading several sequential pages, the above operation is repeated over and over. There is however a way to optimize these accesses, by enabling continuous reads. The feature requires the NAND chip to have a second internal SRAM area plus a bit of additional internal logic to trigger another internal transfer between the NAND array and the second SRAM area while the I/O phase is ongoing. Once the first I/O phase is done, the host can continue reading more data, continuously, as the chip will automatically switch to the second SRAM content (which has already been loaded) and in turns trigger the next load into the first SRAM area again. From an instruction perspective, the command op-codes are different, but the same cycles are required. The only difference is that after a continuous read (which is stopped by a CS deassert), the host must observe a delay of tRST. However, because there is no guarantee in Linux regarding the actual state of the CS pin after a transfer (in order to speed-up the next transfer if targeting the same device), it was necessary to manually end the continuous read with a configuration register write operation. Continuous reads have two main drawbacks: * They only work on full pages (column address ignored) * Only the main data area is pulled, out-of-band bytes are not accessible. Said otherwise, the feature can only be useful with on-die ECC engines. Performance wise, measures have been performed on a Zynq platform using Macronix SPI-NAND controller with a Macronix chip (based on the flash_speed tool modified for testing sequential reads): - 1-1-1 mode: performances improved from +3% (2-pages) up to +10% after a dozen pages. - 1-1-4 mode: performances improved from +15% (2-pages) up to +40% after a dozen pages. This series is based on a previous work from Macronix engineer Jaime Liao. Signed-off-by: Miquel Raynal Reviewed-by: Pratyush Yadav Link: https://lore.kernel.org/linux-mtd/20240826101412.20644-5-miquel.raynal@bootlin.com --- include/linux/mtd/spinand.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 5c19ead60499..14dce347dc46 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -336,6 +336,7 @@ struct spinand_ondie_ecc_conf { * @op_variants.update_cache: variants of the update-cache operation * @select_target: function used to select a target/die. Required only for * multi-die chips + * @set_cont_read: enable/disable continuous cached reads * * Each SPI NAND manufacturer driver should have a spinand_info table * describing all the chips supported by the driver. @@ -354,6 +355,8 @@ struct spinand_info { } op_variants; int (*select_target)(struct spinand_device *spinand, unsigned int target); + int (*set_cont_read)(struct spinand_device *spinand, + bool enable); }; #define SPINAND_ID(__method, ...) \ @@ -379,6 +382,9 @@ struct spinand_info { #define SPINAND_SELECT_TARGET(__func) \ .select_target = __func, +#define SPINAND_CONT_READ(__set_cont_read) \ + .set_cont_read = __set_cont_read, + #define SPINAND_INFO(__model, __id, __memorg, __eccreq, __op_variants, \ __flags, ...) \ { \ @@ -422,6 +428,12 @@ struct spinand_dirmap { * passed in spi_mem_op be DMA-able, so we can't based the bufs on * the stack * @manufacturer: SPI NAND manufacturer information + * @cont_read_possible: Field filled by the core once the whole system + * configuration is known to tell whether continuous reads are + * suitable to use or not in general with this chip/configuration. + * A per-transfer check must of course be done to ensure it is + * actually relevant to enable this feature. + * @set_cont_read: Enable/disable the continuous read feature * @priv: manufacturer private data */ struct spinand_device { @@ -451,6 +463,10 @@ struct spinand_device { u8 *scratchbuf; const struct spinand_manufacturer *manufacturer; void *priv; + + bool cont_read_possible; + int (*set_cont_read)(struct spinand_device *spinand, + bool enable); }; /** -- cgit v1.2.3 From a06f2e7cc4de104b14971e08d37f9d08c256b053 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 26 Aug 2024 12:14:08 +0200 Subject: mtd: spi-nand: Expose spinand_write_reg_op() This helper function will soon be used from a vendor driver, let's export it through the spinand.h header. No need for any export, as there is currently no reason for any module to need it. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20240826101412.20644-6-miquel.raynal@bootlin.com --- include/linux/mtd/spinand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 14dce347dc46..8dbf67ca710a 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -533,6 +533,7 @@ int spinand_match_and_init(struct spinand_device *spinand, enum spinand_readid_method rdid_method); int spinand_upd_cfg(struct spinand_device *spinand, u8 mask, u8 val); +int spinand_write_reg_op(struct spinand_device *spinand, u8 reg, u8 val); int spinand_select_target(struct spinand_device *spinand, unsigned int target); #endif /* __LINUX_MTD_SPINAND_H */ -- cgit v1.2.3 From bb2ac59f8205165fce5aee9135bd1e2ea9b1a74b Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 01:18:23 +0100 Subject: mfd: axp20x: AXP717: Add support for boost regulator The AXP717 also contains a boost regulator, to provide the 5V USB VBUS rail when running on battery. Add the registers to the MFD description to be able to use them from the regulator driver. Signed-off-by: Andre Przywara Reviewed-by: John Watts Acked-by: Lee Jones Reviewed-by: Chen-Yu Tsai Link: https://patch.msgid.link/20240816001824.6028-3-andre.przywara@arm.com Signed-off-by: Mark Brown --- include/linux/mfd/axp20x.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index 8c0a33a2e9ce..3758f986491c 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -115,6 +115,8 @@ enum axp20x_variants { #define AXP313A_IRQ_STATE 0x21 #define AXP717_ON_INDICATE 0x00 +#define AXP717_MODULE_EN_CONTROL_2 0x19 +#define AXP717_BOOST_CONTROL 0x1e #define AXP717_IRQ0_EN 0x40 #define AXP717_IRQ1_EN 0x41 #define AXP717_IRQ2_EN 0x42 -- cgit v1.2.3 From 22dfe2ea1d63f15d8b9661e7690ac0a03159db6a Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Fri, 16 Aug 2024 01:18:24 +0100 Subject: regulator: axp20x: AXP717: Add boost regulator The AXP717 also contains an adjustable boost regulator, to provide the 5V USB VBUS rail when running on battery. Add the regulator description that states the voltage range this regulator can cover. Signed-off-by: Andre Przywara Reviewed-by: John Watts Reviewed-by: Chen-Yu Tsai Link: https://patch.msgid.link/20240816001824.6028-4-andre.przywara@arm.com Signed-off-by: Mark Brown --- include/linux/mfd/axp20x.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index 3758f986491c..e0cd66bd3b6d 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -486,6 +486,7 @@ enum { AXP717_CLDO3, AXP717_CLDO4, AXP717_CPUSLDO, + AXP717_BOOST, AXP717_REG_ID_MAX, }; -- cgit v1.2.3 From 78f76b09c915d7281317d8e082c4c02f325a0366 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 17 Jul 2024 17:48:16 +0900 Subject: ata: libata: Move sata_std_hardreset() definition to libata-sata.c Unlike ata_std_prereset() and ata_std_postreset(), the function sata_std_hardreset() applies only to SATA devices, as its name implies. So move its definition to libata-sata.c. Together with this, also move the definition of sata_port_ops to libata-sata.c, where it belongs. Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Hannes Reinecke --- include/linux/libata.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 6552e90753ae..d52ae7723c05 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1104,8 +1104,6 @@ static inline bool ata_port_is_frozen(const struct ata_port *ap) extern int ata_std_prereset(struct ata_link *link, unsigned long deadline); extern int ata_wait_after_reset(struct ata_link *link, unsigned long deadline, int (*check_ready)(struct ata_link *link)); -extern int sata_std_hardreset(struct ata_link *link, unsigned int *class, - unsigned long deadline); extern void ata_std_postreset(struct ata_link *link, unsigned int *classes); extern struct ata_host *ata_host_alloc(struct device *dev, int n_ports); @@ -1229,6 +1227,8 @@ extern int sata_scr_read(struct ata_link *link, int reg, u32 *val); extern int sata_scr_write(struct ata_link *link, int reg, u32 val); extern int sata_scr_write_flush(struct ata_link *link, int reg, u32 val); extern int sata_set_spd(struct ata_link *link); +int sata_std_hardreset(struct ata_link *link, unsigned int *class, + unsigned long deadline); extern int sata_link_hardreset(struct ata_link *link, const unsigned int *timing, unsigned long deadline, bool *online, int (*check_ready)(struct ata_link *)); @@ -1256,6 +1256,11 @@ static inline int sata_scr_write_flush(struct ata_link *link, int reg, u32 val) return -EOPNOTSUPP; } static inline int sata_set_spd(struct ata_link *link) { return -EOPNOTSUPP; } +static inline int sata_std_hardreset(struct ata_link *link, unsigned int *class, + unsigned long deadline) +{ + return -EOPNOTSUPP; +} static inline int sata_link_hardreset(struct ata_link *link, const unsigned int *timing, unsigned long deadline, -- cgit v1.2.3 From 10e807637f288f9963340a30ae8da9bb06beca3c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 10 Jun 2024 19:11:31 +0900 Subject: ata: libata: Rename ata_eh_read_sense_success_ncq_log() The function ata_eh_read_sense_success_ncq_log() does more that just reading the sense data for successful NCQ commands log page as it also sets the sense data for all commands listed in the log page. Rename this function to ata_eh_get_ncq_success_sense() to better describe what the function does. Furthermore, since this function is only called from ata_eh_get_success_sense() in libata-eh.c, there is no need to export it and its declaration can be moved to drivers/ata/libata.h. To be consistent with this change, the function ata_eh_read_sense_success_non_ncq() is also renamed to ata_eh_get_non_ncq_success_sense(). Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Hannes Reinecke --- include/linux/libata.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index d52ae7723c05..55a6b57742bc 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1234,7 +1234,6 @@ extern int sata_link_hardreset(struct ata_link *link, bool *online, int (*check_ready)(struct ata_link *)); extern int sata_link_resume(struct ata_link *link, const unsigned int *params, unsigned long deadline); -extern int ata_eh_read_sense_success_ncq_log(struct ata_link *link); extern void ata_eh_analyze_ncq_error(struct ata_link *link); #else static inline const unsigned int * @@ -1277,10 +1276,6 @@ static inline int sata_link_resume(struct ata_link *link, { return -EOPNOTSUPP; } -static inline int ata_eh_read_sense_success_ncq_log(struct ata_link *link) -{ - return -EOPNOTSUPP; -} static inline void ata_eh_analyze_ncq_error(struct ata_link *link) { } #endif extern int sata_link_debounce(struct ata_link *link, -- cgit v1.2.3 From da65bbdd3bc1e8d2193e01167a413d90d9988c04 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 28 Aug 2024 11:07:43 +0900 Subject: ata: libata: Move sector_buf from struct ata_port to struct ata_device The 512B buffer sector_buf field of struct ata_port is used for scanning devices as well as during error recovery with ata EH. This buffer is thus useless if a port does not have a device connected to it. And also given that commands using this buffer are issued to devices, and not to ports, move this buffer definition from struct ata_port to struct ata_device. This change slightly increases system memory usage for systems using a port-multiplier as in that case we do not need a per-device buffer for scanning devices (PMP does not allow parallel scanning) nor for EH (as when entering EH we are guaranteed that all commands to all devices connected to the PMP have completed or have been aborted). However, this change reduces memory usage on systems that have many ports with only few devices rives connected, which is a much more common use case than the PMP use case. Suggested-by: Niklas Cassel Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Niklas Cassel --- include/linux/libata.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 55a6b57742bc..aac38dcd2230 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -769,6 +769,9 @@ struct ata_device { int spdn_cnt; /* ering is CLEAR_END, read comment above CLEAR_END */ struct ata_ering ering; + + /* For EH */ + u8 sector_buf[ATA_SECT_SIZE] ____cacheline_aligned; }; /* Fields between ATA_DEVICE_CLEAR_BEGIN and ATA_DEVICE_CLEAR_END are @@ -916,7 +919,6 @@ struct ata_port { #endif /* owned by EH */ u8 *ncq_sense_buf; - u8 sector_buf[ATA_SECT_SIZE] ____cacheline_aligned; }; /* The following initializer overrides a method to NULL whether one of -- cgit v1.2.3 From 602bcf212637633d537ee74bf39c6bc5722efb9b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 17 Jul 2024 17:55:31 +0900 Subject: ata: libata: Improve CDL resource management The ncq_sense_buf buffer field of struct ata_port is allocated and used only for devices that support the Command Duration Limits (CDL) feature. However, the cdl buffer of struct ata_device, which is used to cache the command duration limits log page for devices supporting CDL is always allocated as part of struct ata_device, which is wasteful of memory for devices that do not support this feature. Clean this up by defining both buffers as part of the new ata_cdl structure and allocating this structure only for devices that support the CDL feature. This new structure is attached to struct ata_device using the cdl pointer. The functions ata_dev_init_cdl_resources() and ata_dev_cleanup_cdl_resources() are defined to manage this new structure allocation, initialization and freeing when a port is removed or a device disabled. ata_dev_init_cdl_resources() is called from ata_dev_config_cdl() only for devices that support CDL. ata_dev_cleanup_cdl_resources() is called from ata_dev_free_resources() to free the ata_cdl structure when a device is being disabled by EH. Note that the name of the former cdl log buffer of struct ata_device is changed to desc_log_buf to make it clearer that it is a buffer for the limit descriptors log page. This change reduces the size of struct ata_device, thus reducing memory usage for ATA devices that do not support the CDL feature. Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel --- include/linux/libata.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index aac38dcd2230..9b4a6ff03235 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -700,6 +700,21 @@ struct ata_cpr_log { struct ata_cpr cpr[] __counted_by(nr_cpr); }; +struct ata_cdl { + /* + * Buffer to cache the CDL log page 18h (command duration descriptors) + * for SCSI-ATA translation. + */ + u8 desc_log_buf[ATA_LOG_CDL_SIZE]; + + /* + * Buffer to handle reading the sense data for successful NCQ Commands + * log page for commands using a CDL with one of the limits policy set + * to 0xD (successful completion with sense data available bit set). + */ + u8 ncq_sense_log_buf[ATA_LOG_SENSE_NCQ_SIZE]; +}; + struct ata_device { struct ata_link *link; unsigned int devno; /* 0 or 1 */ @@ -762,8 +777,8 @@ struct ata_device { /* Concurrent positioning ranges */ struct ata_cpr_log *cpr_log; - /* Command Duration Limits log support */ - u8 cdl[ATA_LOG_CDL_SIZE]; + /* Command Duration Limits support */ + struct ata_cdl *cdl; /* error history */ int spdn_cnt; @@ -917,8 +932,6 @@ struct ata_port { #ifdef CONFIG_ATA_ACPI struct ata_acpi_gtm __acpi_init_gtm; /* use ata_acpi_init_gtm() */ #endif - /* owned by EH */ - u8 *ncq_sense_buf; }; /* The following initializer overrides a method to NULL whether one of -- cgit v1.2.3 From 446216bd8e5d4a3ffc9738f6adffc98fbfdcc582 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sun, 8 Sep 2024 13:05:48 +0900 Subject: firewire: core: expose kernel API to schedule work item to process isochronous context In packet-per-buffer mode for isochronous context of 1394 OHCI, software can schedule hardIRQ to the buffer in which the content of isochronous packet is processed. The actual behaviour is different between isochronous receive (IR) and transmit (IT) contexts in respect to isochronous cycle in which the hardIRQ occurs. In IR context, the hardIRQ occurs when the buffer is filled actually by the content of received packet. If there are any isochronous cycles in which the packet transmission is skipped, it is postponed to generate the hardIRQ in respect to the isochronous cycle. In IT context, software can schedule the content of packet every isochronous cycle including skipping, therefore the hardIRQ occurs in the isochronous cycle to which the software scheduled. ALSA firewire stack uses the packet-per-buffer mode for both IR/IT contexts. To process time stamp per packet (or per sample in some cases) steadily for media clock recovery against unexpected transmission skips, it uses an IT context to operate all of isochronous contexts by calls of fw_iso_context_flush_completions() in the bottom-half of hardIRQ for the IT context. Although it looks well to handle all of isochronous contexts in a single bottom-half context, it relatively takes longer time. In the future code integration (not yet), it is possible to apply parallelism method to process these context. In the case, it is useful to allow unit drivers to schedule work items to process these isochronous contexts. As a preparation, this commit exposes fw_iso_context_schedule_flush_completions() as a kernel API available by unit drivers. It is renamed from fw_iso_context_queue_work() since it is a counter part of fw_iso_context_flush_completions(). Link: https://lore.kernel.org/r/20240908040549.75304-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 72f497b61739..f815d12deda0 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -531,6 +531,23 @@ int fw_iso_context_queue(struct fw_iso_context *ctx, unsigned long payload); void fw_iso_context_queue_flush(struct fw_iso_context *ctx); int fw_iso_context_flush_completions(struct fw_iso_context *ctx); + +/** + * fw_iso_context_schedule_flush_completions() - schedule work item to process isochronous context. + * @ctx: the isochronous context + * + * Schedule a work item on workqueue to process the isochronous context. The registered callback + * function is called in the worker if some packets have been already transferred since the last + * time. If it is required to process the context in the current context, + * fw_iso_context_flush_completions() is available instead. + * + * Context: Any context. + */ +static inline void fw_iso_context_schedule_flush_completions(struct fw_iso_context *ctx) +{ + queue_work(ctx->card->isoc_wq, &ctx->work); +} + int fw_iso_context_start(struct fw_iso_context *ctx, int cycle, int sync, int tags); int fw_iso_context_stop(struct fw_iso_context *ctx); -- cgit v1.2.3 From 1d07085402d122f223bda3f8b72bea37a46ee0c9 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Sat, 7 Sep 2024 16:27:20 +0800 Subject: smp: Mark smp_prepare_boot_cpu() __init smp_prepare_boot_cpu() is only called during boot, hence mark it as __init. Signed-off-by: Bibo Mao Signed-off-by: Thomas Gleixner Reviewed-by: Huacai Chen Link: https://lore.kernel.org/all/20240907082720.452148-1-maobibo@loongson.cn --- include/linux/smp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index fcd61dfe2af3..6a0813c905d0 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -109,7 +109,7 @@ static inline void on_each_cpu_cond(smp_cond_func_t cond_func, * Architecture specific boot CPU setup. Defined as empty weak function in * init/main.c. Architectures can override it. */ -void smp_prepare_boot_cpu(void); +void __init smp_prepare_boot_cpu(void); #ifdef CONFIG_SMP -- cgit v1.2.3 From c259acab839e57eab0318f32da4ae803a8d59397 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 4 Sep 2024 07:13:05 -0700 Subject: ptp/ioctl: support MONOTONIC{,_RAW} timestamps for PTP_SYS_OFFSET_EXTENDED The ability to read the PHC (Physical Hardware Clock) alongside multiple system clocks is currently dependent on the specific hardware architecture. This limitation restricts the use of PTP_SYS_OFFSET_PRECISE to certain hardware configurations. The generic soultion which would work across all architectures is to read the PHC along with the latency to perform PHC-read as offered by PTP_SYS_OFFSET_EXTENDED which provides pre and post timestamps. However, these timestamps are currently limited to the CLOCK_REALTIME timebase. Since CLOCK_REALTIME is affected by NTP (or similar time synchronization services), it can experience significant jumps forward or backward. This hinders the precise latency measurements that PTP_SYS_OFFSET_EXTENDED is designed to provide. This problem could be addressed by supporting MONOTONIC_RAW timestamps within PTP_SYS_OFFSET_EXTENDED. Unlike CLOCK_REALTIME or CLOCK_MONOTONIC, the MONOTONIC_RAW timebase is unaffected by NTP adjustments. This enhancement can be implemented by utilizing one of the three reserved words within the PTP_SYS_OFFSET_EXTENDED struct to pass the clock-id for timestamps. The current behavior aligns with clock-id for CLOCK_REALTIME timebase (value of 0), ensuring backward compatibility of the UAPI. Signed-off-by: Mahesh Bandewar Signed-off-by: Vadim Fedorenko Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 6e4b8206c7d0..c892d22ce0a7 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -47,10 +47,12 @@ struct system_device_crosststamp; * struct ptp_system_timestamp - system time corresponding to a PHC timestamp * @pre_ts: system timestamp before capturing PHC * @post_ts: system timestamp after capturing PHC + * @clockid: clock-base used for capturing the system timestamps */ struct ptp_system_timestamp { struct timespec64 pre_ts; struct timespec64 post_ts; + clockid_t clockid; }; /** @@ -457,14 +459,40 @@ static inline ktime_t ptp_convert_timestamp(const ktime_t *hwtstamp, static inline void ptp_read_system_prets(struct ptp_system_timestamp *sts) { - if (sts) - ktime_get_real_ts64(&sts->pre_ts); + if (sts) { + switch (sts->clockid) { + case CLOCK_REALTIME: + ktime_get_real_ts64(&sts->pre_ts); + break; + case CLOCK_MONOTONIC: + ktime_get_ts64(&sts->pre_ts); + break; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(&sts->pre_ts); + break; + default: + break; + } + } } static inline void ptp_read_system_postts(struct ptp_system_timestamp *sts) { - if (sts) - ktime_get_real_ts64(&sts->post_ts); + if (sts) { + switch (sts->clockid) { + case CLOCK_REALTIME: + ktime_get_real_ts64(&sts->post_ts); + break; + case CLOCK_MONOTONIC: + ktime_get_ts64(&sts->post_ts); + break; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(&sts->post_ts); + break; + default: + break; + } + } } #endif -- cgit v1.2.3 From 1fcb932c8b5ce86219d7dedcd63659351a43291c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Jul 2024 00:56:40 +0200 Subject: rcu/nocb: Simplify (de-)offloading state machine Now that the (de-)offloading process can only apply to offline CPUs, there is no more concurrency between rcu_core and nocb kthreads. Also the mutation now happens on empty queues. Therefore the state machine can be reduced to a single bit called SEGCBLIST_OFFLOADED. Simplify the transition as follows: * Upon offloading: queue the rdp to be added to the rcuog list and wait for the rcuog kthread to set the SEGCBLIST_OFFLOADED bit. Unpark rcuo kthread. * Upon de-offloading: Park rcuo kthread. Queue the rdp to be removed from the rcuog list and wait for the rcuog kthread to clear the SEGCBLIST_OFFLOADED bit. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay --- include/linux/rcu_segcblist.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 1ef1bb54853d..2fdc2208f1ca 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -185,9 +185,7 @@ struct rcu_cblist { * ---------------------------------------------------------------------------- */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_LOCKING BIT(1) -#define SEGCBLIST_KTHREAD_GP BIT(2) -#define SEGCBLIST_OFFLOADED BIT(3) +#define SEGCBLIST_OFFLOADED BIT(1) struct rcu_segcblist { struct rcu_head *head; -- cgit v1.2.3 From bd7c8ff9fef4b21a97f9b30a7364845ee6eaaf23 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 4 Sep 2024 15:04:53 +0200 Subject: treewide: Fix wrong singular form of jiffies in comments There are several comments all over the place, which uses a wrong singular form of jiffies. Replace 'jiffie' by 'jiffy'. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: Geert Uytterhoeven # m68k Link: https://lore.kernel.org/all/20240904-devel-anna-maria-b4-timers-flseep-v1-3-e98760256370@linutronix.de --- include/linux/jiffies.h | 2 +- include/linux/timekeeper_internal.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index d9f1435a5a13..1220f0fbe5bf 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -418,7 +418,7 @@ extern unsigned long preset_lpj; #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) /* - * The maximum jiffie value is (MAX_INT >> 1). Here we translate that + * The maximum jiffy value is (MAX_INT >> 1). Here we translate that * into seconds. The 64-bit case will overflow if we are not careful, * so use the messy SH_DIV macro to do it. Still all constants. */ diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 84ff2844df2a..902c20ef495a 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -73,7 +73,7 @@ struct tk_read_base { * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what - * we need to add to xtime (or xtime corrected for sub jiffie times) + * we need to add to xtime (or xtime corrected for sub jiffy times) * to get to monotonic time. Monotonic is pegged at zero at system * boot time, so wall_to_monotonic will be negative, however, we will * ALWAYS keep the tv_nsec part positive so we can use the usual -- cgit v1.2.3 From ca229bdbef29be207c8666f106acc3bf50736c05 Mon Sep 17 00:00:00 2001 From: Cheng Ming Lin Date: Mon, 9 Sep 2024 17:26:42 +0800 Subject: mtd: spinand: Add support for setting plane select bits Add two flags for inserting the Plane Select bit into the column address during the write_to_cache and the read_from_cache operation. Add the SPINAND_HAS_PROG_PLANE_SELECT_BIT flag for serial NAND flash that require inserting the Plane Select bit into the column address during the write_to_cache operation. Add the SPINAND_HAS_READ_PLANE_SELECT_BIT flag for serial NAND flash that require inserting the Plane Select bit into the column address during the read_from_cache operation. Signed-off-by: Cheng Ming Lin Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20240909092643.2434479-2-linchengming884@gmail.com --- include/linux/mtd/spinand.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 8dbf67ca710a..702e5fb13dae 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -312,6 +312,8 @@ struct spinand_ecc_info { #define SPINAND_HAS_QE_BIT BIT(0) #define SPINAND_HAS_CR_FEAT_BIT BIT(1) +#define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) +#define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) /** * struct spinand_ondie_ecc_conf - private SPI-NAND on-die ECC engine structure -- cgit v1.2.3 From d688d65a847ff910146eff51e70f4b7049895eb5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:49 +0200 Subject: fs: add generic_llseek_cookie() This is similar to generic_file_llseek() but allows the caller to specify a cookie that will be updated to indicate that a seek happened. Caller's requiring that information in their readdir implementations can use that. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-8-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 58c91a52cad1..3e6b3c1afb31 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3202,6 +3202,8 @@ extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof); +loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, + u64 *cookie); extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); -- cgit v1.2.3 From 88d2ae0e6eb84e1ed58f339c6a0de16c24fa2a60 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Aug 2024 11:18:21 -0400 Subject: inode: make __iget() a static inline bcachefs is switching to an rhashtable for vfs inodes instead of the standard inode.c hashtable, so we need this exported, or - a static inline makes more sense for a single atomic_inc(). Signed-off-by: Kent Overstreet --- include/linux/fs.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..8fc4bad3b6ae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3094,7 +3094,14 @@ static inline bool is_zero_ino(ino_t ino) return (u32)ino == 0; } -extern void __iget(struct inode * inode); +/* + * inode->i_lock must be held + */ +static inline void __iget(struct inode *inode) +{ + atomic_inc(&inode->i_count); +} + extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); -- cgit v1.2.3 From f6594633817374a64a5fb31007bd810cad1425ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 19:00:33 -0400 Subject: lib/generic-radix-tree.c: genradix_ptr_inlined() Provide an inlined fast path Signed-off-by: Kent Overstreet --- include/linux/generic-radix-tree.h | 75 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'include/linux') diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index f3512fddf3d7..8a3e1e886d1c 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -48,6 +48,49 @@ struct genradix_root; #define GENRADIX_NODE_SHIFT 9 #define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) +#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) +#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) + +/* depth that's needed for a genradix that can address up to ULONG_MAX: */ +#define GENRADIX_MAX_DEPTH \ + DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) + +#define GENRADIX_DEPTH_MASK \ + ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) + +static inline int genradix_depth_shift(unsigned depth) +{ + return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; +} + +/* + * Returns size (of data, in bytes) that a tree of a given depth holds: + */ +static inline size_t genradix_depth_size(unsigned depth) +{ + return 1UL << genradix_depth_shift(depth); +} + +static inline unsigned genradix_root_to_depth(struct genradix_root *r) +{ + return (unsigned long) r & GENRADIX_DEPTH_MASK; +} + +static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) +{ + return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); +} + +struct genradix_node { + union { + /* Interior node: */ + struct genradix_node *children[GENRADIX_ARY]; + + /* Leaf: */ + u8 data[GENRADIX_NODE_SIZE]; + }; +}; + struct __genradix { struct genradix_root *root; }; @@ -128,6 +171,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) +static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset) +{ + struct genradix_root *r = READ_ONCE(radix->root); + struct genradix_node *n = genradix_root_to_node(r); + unsigned level = genradix_root_to_depth(r); + unsigned shift = genradix_depth_shift(level); + + if (unlikely(ilog2(offset) >= genradix_depth_shift(level))) + return NULL; + + while (n && shift > GENRADIX_NODE_SHIFT) { + shift -= GENRADIX_ARY_SHIFT; + n = n->children[offset >> shift]; + offset &= (1UL << shift) - 1; + } + + return n ? &n->data[offset] : NULL; +} + +#define genradix_ptr_inlined(_radix, _idx) \ + (__genradix_cast(_radix) \ + __genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx))) + void *__genradix_ptr(struct __genradix *, size_t); /** @@ -144,6 +211,14 @@ void *__genradix_ptr(struct __genradix *, size_t); void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); +#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ + (__genradix_cast(_radix) \ + (__genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx)) ?: \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _gfp))) + /** * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it * if necessary -- cgit v1.2.3 From b3f9da79e7783ca4f1c1d4214af976c548629c1d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 23:14:44 -0400 Subject: lib/generic-radix-tree.c: add preallocation Signed-off-by: Kent Overstreet --- include/linux/generic-radix-tree.h | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 8a3e1e886d1c..5b51c3d582d6 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -41,6 +41,7 @@ #include #include #include +#include #include struct genradix_root; @@ -81,6 +82,10 @@ static inline struct genradix_node *genradix_root_to_node(struct genradix_root * return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } +struct __genradix { + struct genradix_root *root; +}; + struct genradix_node { union { /* Interior node: */ @@ -91,9 +96,15 @@ struct genradix_node { }; }; -struct __genradix { - struct genradix_root *root; -}; +static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) +{ + return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); +} + +static inline void genradix_free_node(struct genradix_node *node) +{ + kfree(node); +} /* * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: @@ -209,7 +220,8 @@ void *__genradix_ptr(struct __genradix *, size_t); __genradix_ptr(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx))) -void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); +void *__genradix_ptr_alloc(struct __genradix *, size_t, + struct genradix_node **, gfp_t); #define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ (__genradix_cast(_radix) \ @@ -217,7 +229,15 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); __genradix_idx_to_offset(_radix, _idx)) ?: \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ - _gfp))) + NULL, _gfp))) + +#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\ + (__genradix_cast(_radix) \ + (__genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx)) ?: \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _new_node, _gfp))) /** * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it @@ -232,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); (__genradix_cast(_radix) \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ - _gfp)) + NULL, _gfp)) + +#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\ + (__genradix_cast(_radix) \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _new_node, _gfp)) struct genradix_iter { size_t offset; -- cgit v1.2.3 From f877f1d81b2ffcac341ff62ae076d7ad5ba83f46 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 9 Sep 2024 23:00:18 +0900 Subject: firewire: core: use mutex to coordinate concurrent calls to flush completions In current implementation, test_and_set_bit_lock() is used to mediate concurrent calls of ohci_flush_iso_completions(). However, the ad-hoc usage of atomic operations is not preferable. This commit uses mutex_trylock() as the similar operations. The core function is responsible for the mediation, instead of 1394 OHCI driver. Link: https://lore.kernel.org/r/20240909140018.65289-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index f815d12deda0..19e8c5f9537c 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -512,6 +512,7 @@ union fw_iso_callback { struct fw_iso_context { struct fw_card *card; struct work_struct work; + struct mutex flushing_completions_mutex; int type; int channel; int speed; -- cgit v1.2.3 From 34c626c3004a73ee7da5072be48ddff9c2902902 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 16 Jan 2023 01:16:43 +0200 Subject: net/mlx5: Added missing mlx5_ifc definition for HW Steering Add mlx5_ifc definitions that are required for HWS support. Note that due to change in the mlx5_ifc_flow_table_context_bits structure that now includes both SWS and HWS bits in a union, this patch also includes small change in one of SWS files that was required for compilation. Reviewed-by: Hamdan Agbariya Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 189 +++++++++++++++++++++++++++++++++++------- 1 file changed, 161 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 234ad6f16e92..b6f8e3834bd3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -80,23 +80,15 @@ enum { enum { MLX5_OBJ_TYPE_SW_ICM = 0x0008, - MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT = 0x23, -}; - -enum { - MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM), - MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11), - MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13), - MLX5_GENERAL_OBJ_TYPES_CAP_HEADER_MODIFY_ARGUMENT = - (1ULL << MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT), - MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39), -}; - -enum { MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b, MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS = 0x001c, MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018, + MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT = 0x23, + MLX5_OBJ_TYPE_STC = 0x0040, + MLX5_OBJ_TYPE_RTC = 0x0041, + MLX5_OBJ_TYPE_STE = 0x0042, + MLX5_OBJ_TYPE_MODIFY_HDR_PATTERN = 0x0043, MLX5_OBJ_TYPE_PAGE_TRACK = 0x46, MLX5_OBJ_TYPE_MKEY = 0xff01, MLX5_OBJ_TYPE_QP = 0xff02, @@ -112,6 +104,16 @@ enum { MLX5_OBJ_TYPE_RQT = 0xff0e, MLX5_OBJ_TYPE_FLOW_COUNTER = 0xff0f, MLX5_OBJ_TYPE_CQ = 0xff10, + MLX5_OBJ_TYPE_FT_ALIAS = 0xff15, +}; + +enum { + MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM), + MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11), + MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13), + MLX5_GENERAL_OBJ_TYPES_CAP_HEADER_MODIFY_ARGUMENT = + (1ULL << MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT), + MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39), }; enum { @@ -313,6 +315,7 @@ enum { MLX5_CMD_OP_MODIFY_VHCA_STATE = 0xb0e, MLX5_CMD_OP_SYNC_CRYPTO = 0xb12, MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS = 0xb16, + MLX5_CMD_OP_GENERATE_WQE = 0xb17, MLX5_CMD_OP_MAX }; @@ -485,7 +488,13 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_66[0x2]; u8 reformat_add_macsec[0x1]; u8 reformat_remove_macsec[0x1]; - u8 reserved_at_6a[0xe]; + u8 reparse[0x1]; + u8 reserved_at_6b[0x1]; + u8 cross_vhca_object[0x1]; + u8 reformat_l2_to_l3_audp_tunnel[0x1]; + u8 reformat_l3_audp_tunnel_to_l2[0x1]; + u8 ignore_flow_level_rtc_valid[0x1]; + u8 reserved_at_70[0x8]; u8 log_max_ft_num[0x8]; u8 reserved_at_80[0x10]; @@ -522,7 +531,15 @@ struct mlx5_ifc_ipv6_layout_bits { u8 ipv6[16][0x8]; }; +struct mlx5_ifc_ipv6_simple_layout_bits { + u8 ipv6_127_96[0x20]; + u8 ipv6_95_64[0x20]; + u8 ipv6_63_32[0x20]; + u8 ipv6_31_0[0x20]; +}; + union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { + struct mlx5_ifc_ipv6_simple_layout_bits ipv6_simple_layout; struct mlx5_ifc_ipv6_layout_bits ipv6_layout; struct mlx5_ifc_ipv4_layout_bits ipv4_layout; u8 reserved_at_0[0x80]; @@ -911,7 +928,9 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 reserved_at_8[0x5]; u8 fdb_uplink_hairpin[0x1]; u8 fdb_multi_path_any_table_limit_regc[0x1]; - u8 reserved_at_f[0x3]; + u8 reserved_at_f[0x1]; + u8 fdb_dynamic_tunnel[0x1]; + u8 reserved_at_11[0x1]; u8 fdb_multi_path_any_table[0x1]; u8 reserved_at_13[0x2]; u8 fdb_modify_header_fwd_to_table[0x1]; @@ -950,6 +969,73 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 reserved_at_1900[0x6700]; }; +struct mlx5_ifc_wqe_based_flow_table_cap_bits { + u8 reserved_at_0[0x3]; + u8 log_max_num_ste[0x5]; + u8 reserved_at_8[0x3]; + u8 log_max_num_stc[0x5]; + u8 reserved_at_10[0x3]; + u8 log_max_num_rtc[0x5]; + u8 reserved_at_18[0x3]; + u8 log_max_num_header_modify_pattern[0x5]; + + u8 rtc_hash_split_table[0x1]; + u8 rtc_linear_lookup_table[0x1]; + u8 reserved_at_22[0x1]; + u8 stc_alloc_log_granularity[0x5]; + u8 reserved_at_28[0x3]; + u8 stc_alloc_log_max[0x5]; + u8 reserved_at_30[0x3]; + u8 ste_alloc_log_granularity[0x5]; + u8 reserved_at_38[0x3]; + u8 ste_alloc_log_max[0x5]; + + u8 reserved_at_40[0xb]; + u8 rtc_reparse_mode[0x5]; + u8 reserved_at_50[0x3]; + u8 rtc_index_mode[0x5]; + u8 reserved_at_58[0x3]; + u8 rtc_log_depth_max[0x5]; + + u8 reserved_at_60[0x10]; + u8 ste_format[0x10]; + + u8 stc_action_type[0x80]; + + u8 header_insert_type[0x10]; + u8 header_remove_type[0x10]; + + u8 trivial_match_definer[0x20]; + + u8 reserved_at_140[0x1b]; + u8 rtc_max_num_hash_definer_gen_wqe[0x5]; + + u8 reserved_at_160[0x18]; + u8 access_index_mode[0x8]; + + u8 reserved_at_180[0x10]; + u8 ste_format_gen_wqe[0x10]; + + u8 linear_match_definer_reg_c3[0x20]; + + u8 fdb_jump_to_tir_stc[0x1]; + u8 reserved_at_1c1[0x1f]; +}; + +struct mlx5_ifc_esw_cap_bits { + u8 reserved_at_0[0x1d]; + u8 merged_eswitch[0x1]; + u8 reserved_at_1e[0x2]; + + u8 reserved_at_20[0x40]; + + u8 esw_manager_vport_number_valid[0x1]; + u8 reserved_at_61[0xf]; + u8 esw_manager_vport_number[0x10]; + + u8 reserved_at_80[0x780]; +}; + enum { MLX5_COUNTER_SOURCE_ESWITCH = 0x0, MLX5_COUNTER_FLOW_ESWITCH = 0x1, @@ -1443,9 +1529,13 @@ enum { }; enum { + MLX5_FLEX_IPV4_OVER_VXLAN_ENABLED = 1 << 0, + MLX5_FLEX_IPV6_OVER_VXLAN_ENABLED = 1 << 1, + MLX5_FLEX_IPV6_OVER_IP_ENABLED = 1 << 2, MLX5_FLEX_PARSER_GENEVE_ENABLED = 1 << 3, MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED = 1 << 4, MLX5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED = 1 << 5, + MLX5_FLEX_P_BIT_VXLAN_GPE_ENABLED = 1 << 6, MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED = 1 << 7, MLX5_FLEX_PARSER_ICMP_V4_ENABLED = 1 << 8, MLX5_FLEX_PARSER_ICMP_V6_ENABLED = 1 << 9, @@ -1650,7 +1740,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 pci_sync_for_fw_update_event[0x1]; u8 reserved_at_1f2[0x6]; u8 init2_lag_tx_port_affinity[0x1]; - u8 reserved_at_1fa[0x3]; + u8 reserved_at_1fa[0x2]; + u8 wqe_based_flow_table_update_cap[0x1]; u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; @@ -1959,7 +2050,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_760[0x3]; u8 log_max_num_header_modify_argument[0x5]; - u8 reserved_at_768[0x4]; + u8 log_header_modify_argument_granularity_offset[0x4]; u8 log_header_modify_argument_granularity[0x4]; u8 reserved_at_770[0x3]; u8 log_header_modify_argument_max_alloc[0x5]; @@ -2006,7 +2097,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_140[0x60]; u8 flow_table_type_2_type[0x8]; - u8 reserved_at_1a8[0x3]; + u8 reserved_at_1a8[0x2]; + u8 format_select_dw_8_6_ext[0x1]; u8 log_min_mkey_entity_size[0x5]; u8 reserved_at_1b0[0x10]; @@ -2022,6 +2114,16 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_250[0x10]; u8 reserved_at_260[0x120]; + + u8 format_select_dw_gtpu_dw_0[0x8]; + u8 format_select_dw_gtpu_dw_1[0x8]; + u8 format_select_dw_gtpu_dw_2[0x8]; + u8 format_select_dw_gtpu_first_ext_dw_0[0x8]; + + u8 generate_wqe_type[0x20]; + + u8 reserved_at_2c0[0xc0]; + u8 reserved_at_380[0xb]; u8 min_mkey_log_entity_size_fixed_buffer[0x5]; u8 ec_vf_vport_base[0x10]; @@ -2037,9 +2139,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_400[0x1]; u8 min_mkey_log_entity_size_fixed_buffer_valid[0x1]; - u8 reserved_at_402[0x1e]; + u8 reserved_at_402[0xe]; + u8 return_reg_id[0x10]; - u8 reserved_at_420[0x20]; + u8 reserved_at_420[0x1c]; + u8 flow_table_hash_type[0x4]; u8 reserved_at_440[0x8]; u8 max_num_eqs_24b[0x18]; @@ -2086,7 +2190,7 @@ struct mlx5_ifc_extended_dest_format_bits { u8 reserved_at_60[0x20]; }; -union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits { +union mlx5_ifc_dest_format_flow_counter_list_auto_bits { struct mlx5_ifc_extended_dest_format_bits extended_dest_format; struct mlx5_ifc_flow_counter_list_bits flow_counter_list; }; @@ -2178,7 +2282,10 @@ struct mlx5_ifc_wq_bits { u8 reserved_at_139[0x4]; u8 log_wqe_stride_size[0x3]; - u8 reserved_at_140[0x80]; + u8 dbr_umem_id[0x20]; + u8 wq_umem_id[0x20]; + + u8 wq_umem_offset[0x40]; u8 headers_mkey[0x20]; @@ -3562,6 +3669,8 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_per_protocol_networking_offload_caps_bits per_protocol_networking_offload_caps; struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap; struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; + struct mlx5_ifc_wqe_based_flow_table_cap_bits wqe_based_flow_table_cap; + struct mlx5_ifc_esw_cap_bits esw_cap; struct mlx5_ifc_e_switch_cap_bits e_switch_cap; struct mlx5_ifc_port_selection_cap_bits port_selection_cap; struct mlx5_ifc_qos_cap_bits qos_cap; @@ -3678,7 +3787,7 @@ struct mlx5_ifc_flow_context_bits { u8 reserved_at_1300[0x500]; - union mlx5_ifc_dest_format_struct_flow_counter_list_auto_bits destination[]; + union mlx5_ifc_dest_format_flow_counter_list_auto_bits destination[]; }; enum { @@ -3919,7 +4028,8 @@ struct mlx5_ifc_sqc_bits { u8 reg_umr[0x1]; u8 allow_swp[0x1]; u8 hairpin[0x1]; - u8 reserved_at_f[0xb]; + u8 non_wire[0x1]; + u8 reserved_at_10[0xa]; u8 ts_format[0x2]; u8 reserved_at_1c[0x4]; @@ -4961,6 +5071,16 @@ struct mlx5_ifc_set_fte_in_bits { struct mlx5_ifc_flow_context_bits flow_context; }; +struct mlx5_ifc_dest_format_bits { + u8 destination_type[0x8]; + u8 destination_id[0x18]; + + u8 destination_eswitch_owner_vhca_id_valid[0x1]; + u8 packet_reformat[0x1]; + u8 reserved_at_22[0xe]; + u8 destination_eswitch_owner_vhca_id[0x10]; +}; + struct mlx5_ifc_rts2rts_qp_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -6127,7 +6247,8 @@ struct mlx5_ifc_flow_table_context_bits { u8 termination_table[0x1]; u8 table_miss_action[0x4]; u8 level[0x8]; - u8 reserved_at_10[0x8]; + u8 rtc_valid[0x1]; + u8 reserved_at_11[0x7]; u8 log_size[0x8]; u8 reserved_at_20[0x8]; @@ -6137,11 +6258,21 @@ struct mlx5_ifc_flow_table_context_bits { u8 lag_master_next_table_id[0x18]; u8 reserved_at_60[0x60]; + union { + struct { + u8 sw_owner_icm_root_1[0x40]; + + u8 sw_owner_icm_root_0[0x40]; + } sws; + struct { + u8 rtc_id_0[0x20]; - u8 sw_owner_icm_root_1[0x40]; + u8 rtc_id_1[0x20]; - u8 sw_owner_icm_root_0[0x40]; + u8 reserved_at_100[0x40]; + } hws; + }; }; struct mlx5_ifc_query_flow_table_out_bits { @@ -8923,7 +9054,9 @@ struct mlx5_ifc_create_qp_in_bits { struct mlx5_ifc_qpc_bits qpc; - u8 reserved_at_800[0x60]; + u8 wq_umem_offset[0x40]; + + u8 wq_umem_id[0x20]; u8 wq_umem_valid[0x1]; u8 reserved_at_861[0x1f]; -- cgit v1.2.3 From 00b9f0daefd7670356e592d418c890acf9179c94 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 17 Jan 2023 17:14:51 +0200 Subject: net/mlx5: Added missing definitions in preparation for HW Steering As part of preparation for HWS, added missing definitions in qp.h and fs_core.h: - FS_FT_FDB_RX/TX table types that are used by HWS in addition to an existing FS_FT_FDB - MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE that is used by HWS to require fence in WQE Reviewed-by: Hamdan Agbariya Signed-off-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/qp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index ad1ce650146c..fc7eeff99a8a 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -149,6 +149,7 @@ enum { MLX5_WQE_CTRL_CQ_UPDATE = 2 << 2, MLX5_WQE_CTRL_CQ_UPDATE_AND_EQE = 3 << 2, MLX5_WQE_CTRL_SOLICITED = 1 << 1, + MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE = 1 << 5, }; enum { -- cgit v1.2.3 From 452ef7f86036392005940de54228d42ca0044192 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Mon, 5 Aug 2024 10:03:20 +0300 Subject: net/mlx5: Add missing masks and QoS bit masks for scheduling elements Add the missing masks for supported element types and Transmit Scheduling Arbiter (TSAR) types in scheduling elements. Also, add the corresponding bit masks for these types in the QoS capabilities of a NIC scheduler. Fixes: 214baf22870c ("net/mlx5e: Support HTB offload") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cab228cf51c6..cfdf984a95a8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1027,7 +1027,8 @@ struct mlx5_ifc_qos_cap_bits { u8 max_tsar_bw_share[0x20]; - u8 reserved_at_100[0x20]; + u8 nic_element_type[0x10]; + u8 nic_tsar_type[0x10]; u8 reserved_at_120[0x3]; u8 log_meter_aso_granularity[0x5]; @@ -3966,6 +3967,7 @@ enum { ELEMENT_TYPE_CAP_MASK_VPORT = 1 << 1, ELEMENT_TYPE_CAP_MASK_VPORT_TC = 1 << 2, ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC = 1 << 3, + ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP = 1 << 4, }; struct mlx5_ifc_scheduling_context_bits { @@ -4675,6 +4677,12 @@ enum { TSAR_ELEMENT_TSAR_TYPE_ETS = 0x2, }; +enum { + TSAR_TYPE_CAP_MASK_DWRR = 1 << 0, + TSAR_TYPE_CAP_MASK_ROUND_ROBIN = 1 << 1, + TSAR_TYPE_CAP_MASK_ETS = 1 << 2, +}; + struct mlx5_ifc_tsar_element_bits { u8 reserved_at_0[0x8]; u8 tsar_type[0x8]; -- cgit v1.2.3 From 3dc95a3edd0a86b4a59670b3fafcc64c7d83e2e7 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 4 Sep 2024 17:47:45 +0200 Subject: netdevice: add netdev_tx_reset_subqueue() shorthand Add a shorthand similar to other net*_subqueue() helpers for resetting the queue by its index w/o obtaining &netdev_tx_queue beforehand manually. Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- include/linux/netdevice.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b47c00657bd0..44d1dbb54ffe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3568,6 +3568,17 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q) #endif } +/** + * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue + * @dev: network device + * @qid: stack index of the queue to reset + */ +static inline void netdev_tx_reset_subqueue(const struct net_device *dev, + u32 qid) +{ + netdev_tx_reset_queue(netdev_get_tx_queue(dev, qid)); +} + /** * netdev_reset_queue - reset the packets and bytes count of a network device * @dev_queue: network device @@ -3577,7 +3588,7 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q) */ static inline void netdev_reset_queue(struct net_device *dev_queue) { - netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0)); + netdev_tx_reset_subqueue(dev_queue, 0); } /** -- cgit v1.2.3 From 40a895fd9a358eea16901026360bd2cd3ca691a7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 5 Sep 2024 15:35:45 -0700 Subject: cxl: move cxl headers to new include/cxl/ directory Group all cxl related kernel headers into include/cxl/ directory. Reviewed-by: Alison Schofield Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20240905223711.1990186-2-dave.jiang@intel.com Signed-off-by: Dave Jiang --- include/linux/cxl-event.h | 175 ---------------------------------------------- include/linux/einj-cxl.h | 44 ------------ 2 files changed, 219 deletions(-) delete mode 100644 include/linux/cxl-event.h delete mode 100644 include/linux/einj-cxl.h (limited to 'include/linux') diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h deleted file mode 100644 index 0bea1afbd747..000000000000 --- a/include/linux/cxl-event.h +++ /dev/null @@ -1,175 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright(c) 2023 Intel Corporation. */ -#ifndef _LINUX_CXL_EVENT_H -#define _LINUX_CXL_EVENT_H - -#include -#include -#include - -/* - * Common Event Record Format - * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 - */ -struct cxl_event_record_hdr { - u8 length; - u8 flags[3]; - __le16 handle; - __le16 related_handle; - __le64 timestamp; - u8 maint_op_class; - u8 reserved[15]; -} __packed; - -struct cxl_event_media_hdr { - struct cxl_event_record_hdr hdr; - __le64 phys_addr; - u8 descriptor; - u8 type; - u8 transaction_type; - /* - * The meaning of Validity Flags from bit 2 is - * different across DRAM and General Media records - */ - u8 validity_flags[2]; - u8 channel; - u8 rank; -} __packed; - -#define CXL_EVENT_RECORD_DATA_LENGTH 0x50 -struct cxl_event_generic { - struct cxl_event_record_hdr hdr; - u8 data[CXL_EVENT_RECORD_DATA_LENGTH]; -} __packed; - -/* - * General Media Event Record - * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 - */ -#define CXL_EVENT_GEN_MED_COMP_ID_SIZE 0x10 -struct cxl_event_gen_media { - struct cxl_event_media_hdr media_hdr; - u8 device[3]; - u8 component_id[CXL_EVENT_GEN_MED_COMP_ID_SIZE]; - u8 reserved[46]; -} __packed; - -/* - * DRAM Event Record - DER - * CXL rev 3.0 section 8.2.9.2.1.2; Table 3-44 - */ -#define CXL_EVENT_DER_CORRECTION_MASK_SIZE 0x20 -struct cxl_event_dram { - struct cxl_event_media_hdr media_hdr; - u8 nibble_mask[3]; - u8 bank_group; - u8 bank; - u8 row[3]; - u8 column[2]; - u8 correction_mask[CXL_EVENT_DER_CORRECTION_MASK_SIZE]; - u8 reserved[0x17]; -} __packed; - -/* - * Get Health Info Record - * CXL rev 3.0 section 8.2.9.8.3.1; Table 8-100 - */ -struct cxl_get_health_info { - u8 health_status; - u8 media_status; - u8 add_status; - u8 life_used; - u8 device_temp[2]; - u8 dirty_shutdown_cnt[4]; - u8 cor_vol_err_cnt[4]; - u8 cor_per_err_cnt[4]; -} __packed; - -/* - * Memory Module Event Record - * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45 - */ -struct cxl_event_mem_module { - struct cxl_event_record_hdr hdr; - u8 event_type; - struct cxl_get_health_info info; - u8 reserved[0x3d]; -} __packed; - -union cxl_event { - struct cxl_event_generic generic; - struct cxl_event_gen_media gen_media; - struct cxl_event_dram dram; - struct cxl_event_mem_module mem_module; - /* dram & gen_media event header */ - struct cxl_event_media_hdr media_hdr; -} __packed; - -/* - * Common Event Record Format; in event logs - * CXL rev 3.0 section 8.2.9.2.1; Table 8-42 - */ -struct cxl_event_record_raw { - uuid_t id; - union cxl_event event; -} __packed; - -enum cxl_event_type { - CXL_CPER_EVENT_GENERIC, - CXL_CPER_EVENT_GEN_MEDIA, - CXL_CPER_EVENT_DRAM, - CXL_CPER_EVENT_MEM_MODULE, -}; - -#define CPER_CXL_DEVICE_ID_VALID BIT(0) -#define CPER_CXL_DEVICE_SN_VALID BIT(1) -#define CPER_CXL_COMP_EVENT_LOG_VALID BIT(2) -struct cxl_cper_event_rec { - struct { - u32 length; - u64 validation_bits; - struct cper_cxl_event_devid { - u16 vendor_id; - u16 device_id; - u8 func_num; - u8 device_num; - u8 bus_num; - u16 segment_num; - u16 slot_num; /* bits 2:0 reserved */ - u8 reserved; - } __packed device_id; - struct cper_cxl_event_sn { - u32 lower_dw; - u32 upper_dw; - } __packed dev_serial_num; - } __packed hdr; - - union cxl_event event; -} __packed; - -struct cxl_cper_work_data { - enum cxl_event_type event_type; - struct cxl_cper_event_rec rec; -}; - -#ifdef CONFIG_ACPI_APEI_GHES -int cxl_cper_register_work(struct work_struct *work); -int cxl_cper_unregister_work(struct work_struct *work); -int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd); -#else -static inline int cxl_cper_register_work(struct work_struct *work) -{ - return 0; -} - -static inline int cxl_cper_unregister_work(struct work_struct *work) -{ - return 0; -} -static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd) -{ - return 0; -} -#endif - -#endif /* _LINUX_CXL_EVENT_H */ diff --git a/include/linux/einj-cxl.h b/include/linux/einj-cxl.h deleted file mode 100644 index 624ff6ff41f9..000000000000 --- a/include/linux/einj-cxl.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * CXL protocol Error INJection support. - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Author: Ben Cheatham - */ -#ifndef EINJ_CXL_H -#define EINJ_CXL_H - -#include -#include - -struct pci_dev; -struct seq_file; - -#if IS_ENABLED(CONFIG_ACPI_APEI_EINJ_CXL) -int einj_cxl_available_error_type_show(struct seq_file *m, void *v); -int einj_cxl_inject_error(struct pci_dev *dport_dev, u64 type); -int einj_cxl_inject_rch_error(u64 rcrb, u64 type); -bool einj_cxl_is_initialized(void); -#else /* !IS_ENABLED(CONFIG_ACPI_APEI_EINJ_CXL) */ -static inline int einj_cxl_available_error_type_show(struct seq_file *m, - void *v) -{ - return -ENXIO; -} - -static inline int einj_cxl_inject_error(struct pci_dev *dport_dev, u64 type) -{ - return -ENXIO; -} - -static inline int einj_cxl_inject_rch_error(u64 rcrb, u64 type) -{ - return -ENXIO; -} - -static inline bool einj_cxl_is_initialized(void) { return false; } -#endif /* CONFIG_ACPI_APEI_EINJ_CXL */ - -#endif /* EINJ_CXL_H */ -- cgit v1.2.3 From 246d3aa3e53151fa150f10257ddd8a4facd31a6a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 8 Aug 2024 12:18:46 +0100 Subject: mm: cleanup count_mthp_stat() definition Patch series "Shmem mTHP controls and stats improvements", v3. This is a small series to tidy up the way the shmem controls and stats are exposed. These patches were previously part of the series at [2], but I decided to split them out since they can go in independently. This patch (of 2): Let's move count_mthp_stat() so that it's always defined, even when THP is disabled. Previously uses of the function in files such as shmem.c, which are compiled even when THP is disabled, required ugly THP ifdeferry. With this cleanup, we can remove those ifdefs and the function resolves to a nop when THP is disabled. I shortly plan to call count_mthp_stat() from more THP-invariant source files. Link: https://lkml.kernel.org/r/20240808111849.651867-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240808111849.651867-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Barry Song Reviewed-by: Baolin Wang Reviewed-by: Lance Yang Acked-by: David Hildenbrand Cc: Gavin Shan Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 70 ++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6370026689e0..4c32058cacfe 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -114,6 +114,41 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) +enum mthp_stat_item { + MTHP_STAT_ANON_FAULT_ALLOC, + MTHP_STAT_ANON_FAULT_FALLBACK, + MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_SWPOUT, + MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_SHMEM_ALLOC, + MTHP_STAT_SHMEM_FALLBACK, + MTHP_STAT_SHMEM_FALLBACK_CHARGE, + MTHP_STAT_SPLIT, + MTHP_STAT_SPLIT_FAILED, + MTHP_STAT_SPLIT_DEFERRED, + __MTHP_STAT_COUNT +}; + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) +struct mthp_stat { + unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; +}; + +DECLARE_PER_CPU(struct mthp_stat, mthp_stats); + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + if (order <= 0 || order > PMD_ORDER) + return; + + this_cpu_inc(mthp_stats.stats[order][item]); +} +#else +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ +} +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long transparent_hugepage_flags; @@ -269,41 +304,6 @@ struct thpsize { #define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) -enum mthp_stat_item { - MTHP_STAT_ANON_FAULT_ALLOC, - MTHP_STAT_ANON_FAULT_FALLBACK, - MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, - MTHP_STAT_SWPOUT, - MTHP_STAT_SWPOUT_FALLBACK, - MTHP_STAT_SHMEM_ALLOC, - MTHP_STAT_SHMEM_FALLBACK, - MTHP_STAT_SHMEM_FALLBACK_CHARGE, - MTHP_STAT_SPLIT, - MTHP_STAT_SPLIT_FAILED, - MTHP_STAT_SPLIT_DEFERRED, - __MTHP_STAT_COUNT -}; - -struct mthp_stat { - unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; -}; - -#ifdef CONFIG_SYSFS -DECLARE_PER_CPU(struct mthp_stat, mthp_stats); - -static inline void count_mthp_stat(int order, enum mthp_stat_item item) -{ - if (order <= 0 || order > PMD_ORDER) - return; - - this_cpu_inc(mthp_stats.stats[order][item]); -} -#else -static inline void count_mthp_stat(int order, enum mthp_stat_item item) -{ -} -#endif - #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1< Date: Sat, 24 Aug 2024 13:04:40 +1200 Subject: mm: count the number of anonymous THPs per size Patch series "mm: count the number of anonymous THPs per size", v4. Knowing the number of transparent anon THPs in the system is crucial for performance analysis. It helps in understanding the ratio and distribution of THPs versus small folios throughout the system. Additionally, partial unmapping by userspace can lead to significant waste of THPs over time and increase memory reclamation pressure. We need this information for comprehensive system tuning. This patch (of 2): Let's track for each anonymous THP size, how many of them are currently allocated. We'll track the complete lifespan of an anon THP, starting when it becomes an anon THP ("large anon folio") (->mapping gets set), until it gets freed (->mapping gets cleared). Introduce a new "nr_anon" counter per THP size and adjust the corresponding counter in the following cases: * We allocate a new THP and call folio_add_new_anon_rmap() to map it the first time and turn it into an anon THP. * We split an anon THP into multiple smaller ones. * We migrate an anon THP, when we prepare the destination. * We free an anon THP back to the buddy. Note that AnonPages in /proc/meminfo currently tracks the total number of *mapped* anonymous *pages*, and therefore has slightly different semantics. In the future, we might also want to track "nr_anon_mapped" for each THP size, which might be helpful when comparing it to the number of allocated anon THPs (long-term pinning, stuck in swapcache, memory leaks, ...). Further note that for now, we only track anon THPs after they got their ->mapping set, for example via folio_add_new_anon_rmap(). If we would allocate some in the swapcache, they will only show up in the statistics for now after they have been mapped to user space the first time, where we call folio_add_new_anon_rmap(). [akpm@linux-foundation.org: documentation fixups, per David] Link: https://lkml.kernel.org/r/3e8add35-e26b-443b-8a04-1078f4bc78f6@redhat.com Link: https://lkml.kernel.org/r/20240824010441.21308-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240824010441.21308-2-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: Chuanhua Han Cc: Kairui Song Cc: Kalesh Singh Cc: Lance Yang Cc: Ryan Roberts Cc: Shuai Yuan Cc: Usama Arif Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4c32058cacfe..2ee2971e4e10 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -126,6 +126,7 @@ enum mthp_stat_item { MTHP_STAT_SPLIT, MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, + MTHP_STAT_NR_ANON, __MTHP_STAT_COUNT }; @@ -136,14 +137,24 @@ struct mthp_stat { DECLARE_PER_CPU(struct mthp_stat, mthp_stats); -static inline void count_mthp_stat(int order, enum mthp_stat_item item) +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) { if (order <= 0 || order > PMD_ORDER) return; - this_cpu_inc(mthp_stats.stats[order][item]); + this_cpu_add(mthp_stats.stats[order][item], delta); +} + +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ + mod_mthp_stat(order, item, 1); } + #else +static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta) +{ +} + static inline void count_mthp_stat(int order, enum mthp_stat_item item) { } -- cgit v1.2.3 From 8175ebfd302abe6fbdca9037f763ecbfdb8db572 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 24 Aug 2024 13:04:41 +1200 Subject: mm: count the number of partially mapped anonymous THPs per size When a THP is added to the deferred_list due to partially mapped, its partial pages are unused, leading to wasted memory and potentially increasing memory reclamation pressure. Detailing the specifics of how unmapping occurs is quite difficult and not that useful, so we adopt a simple approach: each time a THP enters the deferred_list, we increment the count by 1; whenever it leaves for any reason, we decrement the count by 1. Link: https://lkml.kernel.org/r/20240824010441.21308-3-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Chris Li Cc: Chuanhua Han Cc: Kairui Song Cc: Kalesh Singh Cc: Lance Yang Cc: Ryan Roberts Cc: Shuai Yuan Cc: Usama Arif Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2ee2971e4e10..4902e2f7e896 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -127,6 +127,7 @@ enum mthp_stat_item { MTHP_STAT_SPLIT_FAILED, MTHP_STAT_SPLIT_DEFERRED, MTHP_STAT_NR_ANON, + MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, __MTHP_STAT_COUNT }; -- cgit v1.2.3 From b7315fbb64736acad3cd33851884b222b00dc0f4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 25 Aug 2024 21:23:20 -0700 Subject: mm/damon/core: introduce per-context region priorities histogram buffer Patch series "replace per-quota region priorities histogram buffer with per-context one". Each DAMOS quota (struct damos_quota) maintains a histogram for total regions size per its prioritization score. DAMOS calcultes minimum prioritization score of regions that are ok to apply the DAMOS action to while respecting the quota. The histogram is constructed only for the calculation of the minimum score in damos_adjust_quota() for each quota which called by kdamond_fn(). Hence, there is no real reason to have per-quota histogram. Only per-kdamond histogram is needed, since parallel kdamonds could have races otherwise. The current implementation is only wasting the memory, and can easily cause unintended stack usage[1]. So, introducing a per-kdamond histogram and replacing the per-quota one with it would be the right solution for the issue. However, supporting multiple DAMON contexts per kdamond is still an ongoing work[2] without a clear estimated time of arrival. Meanwhile, per-context histogram could be an effective and straightforward solution having no blocker. Let's fix the problem first in the way. This patch (of 4): Introduce per-context buffer for region priority scores-total size histogram. Same to the per-quota one (->histogram of struct damos_quota), the new buffer is hidden from DAMON API users by being defined as a private field of DAMON context structure. It is dynamically allocated and de-allocated at the beginning and ending of the execution of the kdamond by kdamond_fn() itself. [1] commit 0742cadf5e4c ("mm/damon/lru_sort: adjust local variable to dynamic allocation") [2] https://lore.kernel.org/20240531122320.909060-1-yorha.op@gmail.com Link: https://lkml.kernel.org/r/20240826042323.87025-1-sj@kernel.org Link: https://lkml.kernel.org/r/20240826042323.87025-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 27c546bfc6d4..60cb8eada3d6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -630,6 +630,8 @@ struct damon_ctx { unsigned long next_ops_update_sis; /* for waiting until the execution of the kdamond_fn is started */ struct completion kdamond_started; + /* for scheme quotas prioritization */ + unsigned long *regions_score_histogram; /* public: */ struct task_struct *kdamond; -- cgit v1.2.3 From e3bcb1672583f2e1cf456e4303ce562a3cc775c0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 25 Aug 2024 21:23:22 -0700 Subject: mm/damon/core: remove per-scheme region priority histogram buffer Nobody is reading from or writing to the per-scheme region priorities histogram buffer. It is only wasting memory. Remove it. Link: https://lkml.kernel.org/r/20240826042323.87025-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index 60cb8eada3d6..a67f2c4940e9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -233,7 +233,6 @@ struct damos_quota { unsigned long charge_addr_from; /* For prioritization */ - unsigned long histogram[DAMOS_MAX_SCORE + 1]; unsigned int min_score; /* For feedback loop */ -- cgit v1.2.3 From 17d75422604f0b92869aa17cb44f60958212f033 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 31 Aug 2024 08:28:22 +1200 Subject: mm: document __GFP_NOFAIL must be blockable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Non-blocking allocation with __GFP_NOFAIL is not supported and may still result in NULL pointers (if we don't return NULL, we result in busy-loop within non-sleepable contexts): static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { ... /* * Make sure that __GFP_NOFAIL request doesn't leak out and make sure * we always retry */ if (gfp_mask & __GFP_NOFAIL) { /* * All existing users of the __GFP_NOFAIL are blockable, so warn * of any new users that actually require GFP_NOWAIT */ if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask)) goto fail; ... } ... fail: warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: return page; } Highlight this in the documentation of __GFP_NOFAIL so that non-mm subsystems can reject any illegal usage of __GFP_NOFAIL with GFP_ATOMIC, GFP_NOWAIT, etc. Link: https://lkml.kernel.org/r/20240830202823.21478-3-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: Michal Hocko Reviewed-by: Christoph Hellwig Acked-by: Vlastimil Babka Acked-by: Davidlohr Bueso Acked-by: David Hildenbrand Cc: Christoph Lameter Cc: David Rientjes Cc: "Eugenio Pérez" Cc: Hailong.Liu Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Jason Wang Cc: Joonsoo Kim Cc: Kees Cook Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Maxime Coquelin Cc: "Michael S. Tsirkin" Cc: Pekka Enberg Cc: Roman Gushchin Cc: Uladzislau Rezki (Sony) Cc: Xuan Zhuo Cc: Christoph Hellwig Cc: Xie Yongji Cc: Yafang Shao Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 313be4ad79fd..4a1fa7706b0c 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -215,7 +215,8 @@ enum { * the caller still has to check for failures) while costly requests try to be * not disruptive and back off even without invoking the OOM killer. * The following three modifiers might be used to override some of these - * implicit rules. + * implicit rules. Please note that all of them must be used along with + * %__GFP_DIRECT_RECLAIM flag. * * %__GFP_NORETRY: The VM implementation will try only very lightweight * memory direct reclaim to get some memory under memory pressure (thus @@ -246,6 +247,8 @@ enum { * cannot handle allocation failures. The allocation could block * indefinitely but will never return with failure. Testing for * failure is pointless. + * It _must_ be blockable and used together with __GFP_DIRECT_RECLAIM. + * It should _never_ be used in non-sleepable contexts. * New users should be evaluated carefully (and the flag should be * used only when there is no reasonable failure policy) but it is * definitely preferable to use the flag rather than opencode endless -- cgit v1.2.3 From 903edea6c53f097f5f0c847fdbbfab0c6c44f241 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 31 Aug 2024 08:28:23 +1200 Subject: mm: warn about illegal __GFP_NOFAIL usage in a more appropriate location and manner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three points for this change: 1. We should consolidate all warnings in one place. Currently, the order > 1 warning is in the hotpath, while others are in less likely scenarios. Moving all warnings to the slowpath will reduce the overhead for order > 1 and increase the visibility of other warnings. 2. We currently have two warnings for order: one for order > 1 in the hotpath and another for order > costly_order in the laziest path. I suggest standardizing on order > 1 since it's been in use for a long time. 3. We don't need to check for __GFP_NOWARN in this case. __GFP_NOWARN is meant to suppress allocation failure reports, but here we're dealing with bug detection, not allocation failures. So replace WARN_ON_ONCE_GFP by WARN_ON_ONCE. [v-songbaohua@oppo.com: also update the doc for __GFP_NOFAIL with order > 1] Link: https://lkml.kernel.org/r/20240903223935.1697-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240830202823.21478-4-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Christoph Lameter Cc: Davidlohr Bueso Cc: David Rientjes Cc: "Eugenio Pérez" Cc: Hailong.Liu Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Jason Wang Cc: Joonsoo Kim Cc: Kees Cook Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Maxime Coquelin Cc: "Michael S. Tsirkin" Cc: Pekka Enberg Cc: Roman Gushchin Cc: Uladzislau Rezki (Sony) Cc: Xie Yongji Cc: Xuan Zhuo Cc: Yafang Shao Signed-off-by: Andrew Morton --- include/linux/gfp_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 4a1fa7706b0c..65db9349f905 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -253,7 +253,8 @@ enum { * used only when there is no reasonable failure policy) but it is * definitely preferable to use the flag rather than opencode endless * loop around allocator. - * Using this flag for costly allocations is _highly_ discouraged. + * Allocating pages from the buddy with __GFP_NOFAIL and order > 1 is + * not supported. Please consider using kvmalloc() instead. */ #define __GFP_IO ((__force gfp_t)___GFP_IO) #define __GFP_FS ((__force gfp_t)___GFP_FS) -- cgit v1.2.3 From b1f202060afeb7fcb98473929d26fd3d2093b067 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Fri, 30 Aug 2024 11:03:36 +0100 Subject: mm: remap unused subpages to shared zeropage when splitting isolated thp Patch series "mm: split underused THPs", v5. The current upstream default policy for THP is always. However, Meta uses madvise in production as the current THP=always policy vastly overprovisions THPs in sparsely accessed memory areas, resulting in excessive memory pressure and premature OOM killing. Using madvise + relying on khugepaged has certain drawbacks over THP=always. Using madvise hints mean THPs aren't "transparent" and require userspace changes. Waiting for khugepaged to scan memory and collapse pages into THP can be slow and unpredictable in terms of performance (i.e. you dont know when the collapse will happen), while production environments require predictable performance. If there is enough memory available, its better for both performance and predictability to have a THP from fault time, i.e. THP=always rather than wait for khugepaged to collapse it, and deal with sparsely populated THPs when the system is running out of memory. This patch series is an attempt to mitigate the issue of running out of memory when THP is always enabled. During runtime whenever a THP is being faulted in or collapsed by khugepaged, the THP is added to a list. Whenever memory reclaim happens, the kernel runs the deferred_split shrinker which goes through the list and checks if the THP was underused, i.e. how many of the base 4K pages of the entire THP were zero-filled. If this number goes above a certain threshold, the shrinker will attempt to split that THP. Then at remap time, the pages that were zero-filled are mapped to the shared zeropage, hence saving memory. This method avoids the downside of wasting memory in areas where THP is sparsely filled when THP is always enabled, while still providing the upside THPs like reduced TLB misses without having to use madvise. Meta production workloads that were CPU bound (>99% CPU utilzation) were tested with THP shrinker. The results after 2 hours are as follows: | THP=madvise | THP=always | THP=always | | | + shrinker series | | | + max_ptes_none=409 ----------------------------------------------------------------------------- Performance improvement | - | +1.8% | +1.7% (over THP=madvise) | | | ----------------------------------------------------------------------------- Memory usage | 54.6G | 58.8G (+7.7%) | 55.9G (+2.4%) ----------------------------------------------------------------------------- max_ptes_none=409 means that any THP that has more than 409 out of 512 (80%) zero filled filled pages will be split. To test out the patches, the below commands without the shrinker will invoke OOM killer immediately and kill stress, but will not fail with the shrinker: echo 450 > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none mkdir /sys/fs/cgroup/test echo $$ > /sys/fs/cgroup/test/cgroup.procs echo 20M > /sys/fs/cgroup/test/memory.max echo 0 > /sys/fs/cgroup/test/memory.swap.max # allocate twice memory.max for each stress worker and touch 40/512 of # each THP, i.e. vm-stride 50K. # With the shrinker, max_ptes_none of 470 and below won't invoke OOM # killer. # Without the shrinker, OOM killer is invoked immediately irrespective # of max_ptes_none value and kills stress. stress --vm 1 --vm-bytes 40M --vm-stride 50K This patch (of 5): Here being unused means containing only zeros and inaccessible to userspace. When splitting an isolated thp under reclaim or migration, the unused subpages can be mapped to the shared zeropage, hence saving memory. This is particularly helpful when the internal fragmentation of a thp is high, i.e. it has many untouched subpages. This is also a prerequisite for THP low utilization shrinker which will be introduced in later patches, where underutilized THPs are split, and the zero-filled pages are freed saving memory. Link: https://lkml.kernel.org/r/20240830100438.3623486-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20240830100438.3623486-3-usamaarif642@gmail.com Signed-off-by: Yu Zhao Signed-off-by: Usama Arif Tested-by: Shuang Zhai Cc: Alexander Zhu Cc: Barry Song Cc: David Hildenbrand Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kairui Song Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nico Pache Cc: Rik van Riel Cc: Roman Gushchin Cc: Ryan Roberts Cc: Shakeel Butt Cc: Shuang Zhai Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 91b5935e8485..d5e93e44322e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *); int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma); -void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked); +enum rmp_flags { + RMP_LOCKED = 1 << 0, + RMP_USE_SHARED_ZEROPAGE = 1 << 1, +}; + +void remove_migration_ptes(struct folio *src, struct folio *dst, int flags); /* * rmap_walk_control: To control rmap traversing for specific needs -- cgit v1.2.3 From 8422acdc97ed5839692b45f800dbfb78abe65a94 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 30 Aug 2024 11:03:38 +0100 Subject: mm: introduce a pageflag for partially mapped folios Currently folio->_deferred_list is used to keep track of partially_mapped folios that are going to be split under memory pressure. In the next patch, all THPs that are faulted in and collapsed by khugepaged are also going to be tracked using _deferred_list. This patch introduces a pageflag to be able to distinguish between partially mapped folios and others in the deferred_list at split time in deferred_split_scan. Its needed as __folio_remove_rmap decrements _mapcount, _large_mapcount and _entire_mapcount, hence it won't be possible to distinguish between partially mapped folios and others in deferred_split_scan. Eventhough it introduces an extra flag to track if the folio is partially mapped, there is no functional change intended with this patch and the flag is not useful in this patch itself, it will become useful in the next patch when _deferred_list has non partially mapped folios. Link: https://lkml.kernel.org/r/20240830100438.3623486-5-usamaarif642@gmail.com Signed-off-by: Usama Arif Cc: Alexander Zhu Cc: Barry Song Cc: David Hildenbrand Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kairui Song Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nico Pache Cc: Rik van Riel Cc: Roman Gushchin Cc: Ryan Roberts Cc: Shakeel Butt Cc: Shuang Zhai Cc: Yu Zhao Cc: Shuang Zhai Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- include/linux/page-flags.h | 13 ++++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4902e2f7e896..e3896ebf0f94 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -333,7 +333,7 @@ static inline int split_huge_page(struct page *page) { return split_huge_page_to_list_to_order(page, NULL, 0); } -void deferred_split_folio(struct folio *folio); +void deferred_split_folio(struct folio *folio, bool partially_mapped); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct folio *folio); @@ -507,7 +507,7 @@ static inline int split_huge_page(struct page *page) { return 0; } -static inline void deferred_split_folio(struct folio *folio) {} +static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 2175ebceb41c..1b3a76710487 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -186,6 +186,7 @@ enum pageflags { /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ + PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -859,8 +860,18 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) +FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +/* + * PG_partially_mapped is protected by deferred_split split_queue_lock, + * so its safe to use non-atomic set/clear. + */ +__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) +FOLIO_TEST_FLAG_FALSE(partially_mapped) +__FOLIO_SET_FLAG_NOOP(partially_mapped) +__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) @@ -1171,7 +1182,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ - 1UL << PG_large_rmappable) + 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) -- cgit v1.2.3 From dafff3f4c850c98e15501bc6ee20581f9a013dcc Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 30 Aug 2024 11:03:39 +0100 Subject: mm: split underused THPs This is an attempt to mitigate the issue of running out of memory when THP is always enabled. During runtime whenever a THP is being faulted in (__do_huge_pmd_anonymous_page) or collapsed by khugepaged (collapse_huge_page), the THP is added to _deferred_list. Whenever memory reclaim happens in linux, the kernel runs the deferred_split shrinker which goes through the _deferred_list. If the folio was partially mapped, the shrinker attempts to split it. If the folio is not partially mapped, the shrinker checks if the THP was underused, i.e. how many of the base 4K pages of the entire THP were zero-filled. If this number goes above a certain threshold (decided by /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none), the shrinker will attempt to split that THP. Then at remap time, the pages that were zero-filled are mapped to the shared zeropage, hence saving memory. Link: https://lkml.kernel.org/r/20240830100438.3623486-6-usamaarif642@gmail.com Signed-off-by: Usama Arif Suggested-by: Rik van Riel Co-authored-by: Johannes Weiner Cc: Alexander Zhu Cc: Barry Song Cc: David Hildenbrand Cc: Domenico Cerasuolo Cc: Jonathan Corbet Cc: Kairui Song Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Shakeel Butt Cc: Shuang Zhai Cc: Yu Zhao Cc: Shuang Zhai Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 1 + include/linux/vm_event_item.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index f68865e19b0b..30baae91b225 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -4,6 +4,7 @@ #include /* MMF_VM_HUGEPAGE */ +extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index aae5c7c5cfb4..aed952d04132 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -105,6 +105,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, + THP_UNDERUSED_SPLIT_PAGE, THP_SPLIT_PMD, THP_SCAN_EXCEED_NONE_PTE, THP_SCAN_EXCEED_SWAP_PTE, -- cgit v1.2.3 From 0a6fff20d36bc81156a49649f622b152330fed3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Sep 2024 16:24:59 -0400 Subject: mm: fix folio_alloc_noprof() folio_alloc_noprof) wasn't calling the _noprof version, causing allocations to be accounted here instead of to the caller Link: https://lkml.kernel.org/r/20240901202459.4867-1-kent.overstreet@linux.dev Signed-off-by: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 03ba9563c6db..a951de920e20 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -319,7 +319,7 @@ static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order } static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { - return __folio_alloc_node(gfp, order, numa_node_id()); + return __folio_alloc_node_noprof(gfp, order, numa_node_id()); } static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid) -- cgit v1.2.3 From 6050df6d706f58b2240bfabece9f406170104d57 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 9 Aug 2024 02:01:15 +0000 Subject: maple_tree: fix comment typo on ma_flag of allocation tree The maple tree flag of allocation tree is MT_FLAGS_ALLOC_RANGE. Just correct it. Link: https://lkml.kernel.org/r/20240809020115.31575-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 8e1504a81cd2..c2c11004085e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -52,9 +52,9 @@ * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * - * Once the type is decided, the decision of an allocation range type or a range - * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE - * flag. + * Once the type is decided, the decision of an allocation range type or a + * range type is done by examining the immutable tree flag for the + * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0x??1 = Root -- cgit v1.2.3 From 4fc4187984e5dbd7ad63c3acf0b228fa9bf298d3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 2 Sep 2024 19:55:49 +0900 Subject: lib: zstd: export API needed for dictionary support Patch series "zram: introduce custom comp backends API", v7. This series introduces support for run-time compression algorithms tuning, so users, for instance, can adjust compression/acceleration levels and provide pre-trained compression/decompression dictionaries which certain algorithms support. At this point we stop supporting (old/deprecated) comp API. We may add new acomp API support in the future, but before that zram needs to undergo some major rework (we are not ready for async compression). Some benchmarks for reference (look at column #2) *** init zstd /sys/block/zram0/mm_stat 1750659072 504622188 514355200 0 514355200 1 0 34204 34204 *** init zstd dict=/home/ss/zstd-dict-amd64 /sys/block/zram0/mm_stat 1750650880 465908890 475398144 0 475398144 1 0 34185 34185 *** init zstd level=8 dict=/home/ss/zstd-dict-amd64 /sys/block/zram0/mm_stat 1750654976 430803319 439873536 0 439873536 1 0 34185 34185 *** init lz4 /sys/block/zram0/mm_stat 1750646784 664266564 677060608 0 677060608 1 0 34288 34288 *** init lz4 dict=/home/ss/lz4-dict-amd64 /sys/block/zram0/mm_stat 1750650880 619990300 632102912 0 632102912 1 0 34278 34278 *** init lz4hc /sys/block/zram0/mm_stat 1750630400 609023822 621232128 0 621232128 1 0 34288 34288 *** init lz4hc dict=/home/ss/lz4-dict-amd64 /sys/block/zram0/mm_stat 1750659072 505133172 515231744 0 515231744 1 0 34278 34278 Recompress init zram zstd (prio=0), zstd level=5 (prio 1), zstd with dict (prio 2) *** zstd /sys/block/zram0/mm_stat 1750982656 504630584 514269184 0 514269184 1 0 34204 34204 *** idle recompress priority=1 (zstd level=5) /sys/block/zram0/mm_stat 1750982656 488645601 525438976 0 514269184 1 0 34204 34204 *** idle recompress priority=2 (zstd dict) /sys/block/zram0/mm_stat 1750982656 460869640 517914624 0 514269184 1 0 34185 34204 This patch (of 24): We need to export a number of API functions that enable advanced zstd usage - C/D dictionaries, dictionaries sharing between contexts, etc. Link: https://lkml.kernel.org/r/20240902105656.1383858-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20240902105656.1383858-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Nick Terrell Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- include/linux/zstd.h | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) (limited to 'include/linux') diff --git a/include/linux/zstd.h b/include/linux/zstd.h index 113408eef6ec..b2c7cf310c8f 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -77,6 +77,30 @@ int zstd_min_clevel(void); */ int zstd_max_clevel(void); +/** + * zstd_default_clevel() - default compression level + * + * Return: Default compression level. + */ +int zstd_default_clevel(void); + +/** + * struct zstd_custom_mem - custom memory allocation + */ +typedef ZSTD_customMem zstd_custom_mem; + +/** + * struct zstd_dict_load_method - Dictionary load method. + * See zstd_lib.h. + */ +typedef ZSTD_dictLoadMethod_e zstd_dict_load_method; + +/** + * struct zstd_dict_content_type - Dictionary context type. + * See zstd_lib.h. + */ +typedef ZSTD_dictContentType_e zstd_dict_content_type; + /* ====== Parameter Selection ====== */ /** @@ -136,6 +160,19 @@ typedef ZSTD_parameters zstd_parameters; zstd_parameters zstd_get_params(int level, unsigned long long estimated_src_size); + +/** + * zstd_get_cparams() - returns zstd_compression_parameters for selected level + * @level: The compression level + * @estimated_src_size: The estimated source size to compress or 0 + * if unknown. + * @dict_size: Dictionary size. + * + * Return: The selected zstd_compression_parameters. + */ +zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size); + /* ====== Single-pass Compression ====== */ typedef ZSTD_CCtx zstd_cctx; @@ -180,6 +217,71 @@ zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size); size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, const void *src, size_t src_size, const zstd_parameters *parameters); +/** + * zstd_create_cctx_advanced() - Create compression context + * @custom_mem: Custom allocator. + * + * Return: NULL on error, pointer to compression context otherwise. + */ +zstd_cctx *zstd_create_cctx_advanced(zstd_custom_mem custom_mem); + +/** + * zstd_free_cctx() - Free compression context + * @cdict: Pointer to compression context. + * + * Return: Always 0. + */ +size_t zstd_free_cctx(zstd_cctx* cctx); + +/** + * struct zstd_cdict - Compression dictionary. + * See zstd_lib.h. + */ +typedef ZSTD_CDict zstd_cdict; + +/** + * zstd_create_cdict_byreference() - Create compression dictionary + * @dict: Pointer to dictionary buffer. + * @dict_size: Size of the dictionary buffer. + * @dict_load_method: Dictionary load method. + * @dict_content_type: Dictionary content type. + * @custom_mem: Memory allocator. + * + * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be + * free before zstd_cdict is destroyed. + * + * Return: NULL on error, pointer to compression dictionary + * otherwise. + */ +zstd_cdict *zstd_create_cdict_byreference(const void *dict, size_t dict_size, + zstd_compression_parameters cparams, + zstd_custom_mem custom_mem); + +/** + * zstd_free_cdict() - Free compression dictionary + * @cdict: Pointer to compression dictionary. + * + * Return: Always 0. + */ +size_t zstd_free_cdict(zstd_cdict* cdict); + +/** + * zstd_compress_using_cdict() - compress src into dst using a dictionary + * @cctx: The context. Must have been initialized with zstd_init_cctx(). + * @dst: The buffer to compress src into. + * @dst_capacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @src_size: The size of the data to compress. + * @cdict: The dictionary to be used. + * + * Return: The compressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_compress_using_cdict(zstd_cctx *cctx, void *dst, + size_t dst_capacity, const void *src, size_t src_size, + const zstd_cdict *cdict); + /* ====== Single-pass Decompression ====== */ typedef ZSTD_DCtx zstd_dctx; @@ -220,6 +322,71 @@ zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size); size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, const void *src, size_t src_size); +/** + * struct zstd_ddict - Decompression dictionary. + * See zstd_lib.h. + */ +typedef ZSTD_DDict zstd_ddict; + +/** + * zstd_create_ddict_byreference() - Create decompression dictionary + * @dict: Pointer to dictionary buffer. + * @dict_size: Size of the dictionary buffer. + * @dict_load_method: Dictionary load method. + * @dict_content_type: Dictionary content type. + * @custom_mem: Memory allocator. + * + * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be + * free before zstd_ddict is destroyed. + * + * Return: NULL on error, pointer to decompression dictionary + * otherwise. + */ +zstd_ddict *zstd_create_ddict_byreference(const void *dict, size_t dict_size, + zstd_custom_mem custom_mem); +/** + * zstd_free_ddict() - Free decompression dictionary + * @dict: Pointer to the dictionary. + * + * Return: Always 0. + */ +size_t zstd_free_ddict(zstd_ddict *ddict); + +/** + * zstd_create_dctx_advanced() - Create decompression context + * @custom_mem: Custom allocator. + * + * Return: NULL on error, pointer to decompression context otherwise. + */ +zstd_dctx *zstd_create_dctx_advanced(zstd_custom_mem custom_mem); + +/** + * zstd_free_dctx() -- Free decompression context + * @dctx: Pointer to decompression context. + * Return: Always 0. + */ +size_t zstd_free_dctx(zstd_dctx *dctx); + +/** + * zstd_decompress_using_ddict() - decompress src into dst using a dictionary + * @dctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dst_capacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @src_size: The exact size of the data to decompress. + * @ddict: The dictionary to be used. + * + * Return: The decompressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_decompress_using_ddict(zstd_dctx *dctx, + void *dst, size_t dst_capacity, const void *src, size_t src_size, + const zstd_ddict *ddict); + + /* ====== Streaming Buffers ====== */ /** -- cgit v1.2.3 From e1e4cfd01a6e75dd4c810aeac115340805cf63ff Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 3 Sep 2024 11:19:28 -0400 Subject: mm,tmpfs: consider end of file write in shmem_is_huge Take the end of a file write into consideration when deciding whether or not to use huge pages for tmpfs files when the tmpfs filesystem is mounted with huge=within_size This allows large writes that append to the end of a file to automatically use large pages. Doing 4MB sequential writes without fallocate to a 16GB tmpfs file with fio. The numbers without THP or with huge=always stay the same, but the performance with huge=within_size now matches that of huge=always. huge before after 4kB pages 1560 MB/s 1560 MB/s within_size 1560 MB/s 4720 MB/s always: 4720 MB/s 4720 MB/s [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20240903111928.7171e60c@imladris.surriel.com Signed-off-by: Rik van Riel Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Darrick J. Wong Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 1564d7d3ca61..515a9a6a3c6f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,11 +113,11 @@ int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool shmem_huge_force); + loff_t write_end, bool shmem_huge_force); #else static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, - bool shmem_huge_force) + loff_t write_end, bool shmem_huge_force) { return 0; } @@ -143,8 +143,8 @@ enum sgp_type { SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, - enum sgp_type sgp); +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end, + struct folio **foliop, enum sgp_type sgp); struct folio *shmem_read_folio_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp); -- cgit v1.2.3 From 25d4054cc97484f2555709ac233f955f674e026a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 4 Sep 2024 17:57:59 +0100 Subject: mm: make arch_get_unmapped_area() take vm_flags by default Patch series "mm: Care about shadow stack guard gap when getting an unmapped area", v2. As covered in the commit log for c44357c2e76b ("x86/mm: care about shadow stack guard gap during placement") our current mmap() implementation does not take care to ensure that a new mapping isn't placed with existing mappings inside it's own guard gaps. This is particularly important for shadow stacks since if two shadow stacks end up getting placed adjacent to each other then they can overflow into each other which weakens the protection offered by the feature. On x86 there is a custom arch_get_unmapped_area() which was updated by the above commit to cover this case by specifying a start_gap for allocations with VM_SHADOW_STACK. Both arm64 and RISC-V have equivalent features and use the generic implementation of arch_get_unmapped_area() so let's make the equivalent change there so they also don't get shadow stack pages placed without guard pages. The arm64 and RISC-V shadow stack implementations are currently on the list: https://lore.kernel.org/r/20240829-arm64-gcs-v12-0-42fec94743 https://lore.kernel.org/lkml/20240403234054.2020347-1-debug@rivosinc.com/ Given the addition of the use of vm_flags in the generic implementation we also simplify the set of possibilities that have to be dealt with in the core code by making arch_get_unmapped_area() take vm_flags as standard. This is a bit invasive since the prototype change touches quite a few architectures but since the parameter is ignored the change is straightforward, the simplification for the generic code seems worth it. This patch (of 3): When we introduced arch_get_unmapped_area_vmflags() in 961148704acd ("mm: introduce arch_get_unmapped_area_vmflags()") we did so as part of properly supporting guard pages for shadow stacks on x86_64, which uses a custom arch_get_unmapped_area(). Equivalent features are also present on both arm64 and RISC-V, both of which use the generic implementation of arch_get_unmapped_area() and will require equivalent modification there. Rather than continue to deal with having two versions of the functions let's bite the bullet and have all implementations of arch_get_unmapped_area() take vm_flags as a parameter. The new parameter is currently ignored by all implementations other than x86. The only caller that doesn't have a vm_flags available is mm_get_unmapped_area(), as for the x86 implementation and the wrapper used on other architectures this is modified to supply no flags. No functional changes. Link: https://lkml.kernel.org/r/20240904-mm-generic-shadow-stack-guard-v2-0-a46b8b6dc0ed@kernel.org Link: https://lkml.kernel.org/r/20240904-mm-generic-shadow-stack-guard-v2-1-a46b8b6dc0ed@kernel.org Signed-off-by: Mark Brown Acked-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: Helge Deller [parisc] Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: "Edgecombe, Rick P" Cc: Gerald Schaefer Cc: Guo Ren Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: John Paul Adrian Glaubitz Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 91546493c43d..c4d34abc45d4 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -179,27 +179,20 @@ static inline void mm_update_next_owner(struct mm_struct *mm) extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); -extern unsigned long -arch_get_unmapped_area(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); -extern unsigned long + +unsigned long +arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t vm_flags); +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long len, unsigned long pgoff, + unsigned long flags, vm_flags_t); unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -unsigned long -arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags, vm_flags_t vm_flags); -unsigned long -arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags, vm_flags_t); - unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp, unsigned long addr, -- cgit v1.2.3 From 540e00a729dfbefe0aa0d64b6dac1d1b620a1787 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 4 Sep 2024 17:58:00 +0100 Subject: mm: pass vm_flags to generic_get_unmapped_area() In preparation for using vm_flags to ensure guard pages for shadow stacks supply them as an argument to generic_get_unmapped_area(). The only user outside of the core code is the PowerPC book3s64 implementation which is trivially wrapping the generic implementation in the radix_enabled() case. No functional changes. Link: https://lkml.kernel.org/r/20240904-mm-generic-shadow-stack-guard-v2-2-a46b8b6dc0ed@kernel.org Signed-off-by: Mark Brown Acked-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: Michael Ellerman Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: "Edgecombe, Rick P" Cc: Gerald Schaefer Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: John Paul Adrian Glaubitz Cc: Matt Turner Cc: Max Filippov Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Vlastimil Babka Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index c4d34abc45d4..07bb8d4181d7 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -204,11 +204,11 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long flags, vm_flags_t vm_flags); unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, - unsigned long flags); + unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) {} -- cgit v1.2.3 From 08e28de1160a712724268fd33d77b32f1bc84d1c Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 3 Sep 2024 09:36:28 +0200 Subject: uprobes: use vm_special_mapping close() functionality The following KASAN splat was shown: [ 44.505448] ================================================================== 20:37:27 [3421/145075] [ 44.505455] BUG: KASAN: slab-use-after-free in special_mapping_close+0x9c/0xc8 [ 44.505471] Read of size 8 at addr 00000000868dac48 by task sh/1384 [ 44.505479] [ 44.505486] CPU: 51 UID: 0 PID: 1384 Comm: sh Not tainted 6.11.0-rc6-next-20240902-dirty #1496 [ 44.505503] Hardware name: IBM 3931 A01 704 (z/VM 7.3.0) [ 44.505508] Call Trace: [ 44.505511] [<000b0324d2f78080>] dump_stack_lvl+0xd0/0x108 [ 44.505521] [<000b0324d2f5435c>] print_address_description.constprop.0+0x34/0x2e0 [ 44.505529] [<000b0324d2f5464c>] print_report+0x44/0x138 [ 44.505536] [<000b0324d1383192>] kasan_report+0xc2/0x140 [ 44.505543] [<000b0324d2f52904>] special_mapping_close+0x9c/0xc8 [ 44.505550] [<000b0324d12c7978>] remove_vma+0x78/0x120 [ 44.505557] [<000b0324d128a2c6>] exit_mmap+0x326/0x750 [ 44.505563] [<000b0324d0ba655a>] __mmput+0x9a/0x370 [ 44.505570] [<000b0324d0bbfbe0>] exit_mm+0x240/0x340 [ 44.505575] [<000b0324d0bc0228>] do_exit+0x548/0xd70 [ 44.505580] [<000b0324d0bc1102>] do_group_exit+0x132/0x390 [ 44.505586] [<000b0324d0bc13b6>] __s390x_sys_exit_group+0x56/0x60 [ 44.505592] [<000b0324d0adcbd6>] do_syscall+0x2f6/0x430 [ 44.505599] [<000b0324d2f78434>] __do_syscall+0xa4/0x170 [ 44.505606] [<000b0324d2f9454c>] system_call+0x74/0x98 [ 44.505614] [ 44.505616] Allocated by task 1384: [ 44.505621] kasan_save_stack+0x40/0x70 [ 44.505630] kasan_save_track+0x28/0x40 [ 44.505636] __kasan_kmalloc+0xa0/0xc0 [ 44.505642] __create_xol_area+0xfa/0x410 [ 44.505648] get_xol_area+0xb0/0xf0 [ 44.505652] uprobe_notify_resume+0x27a/0x470 [ 44.505657] irqentry_exit_to_user_mode+0x15e/0x1d0 [ 44.505664] pgm_check_handler+0x122/0x170 [ 44.505670] [ 44.505672] Freed by task 1384: [ 44.505676] kasan_save_stack+0x40/0x70 [ 44.505682] kasan_save_track+0x28/0x40 [ 44.505687] kasan_save_free_info+0x4a/0x70 [ 44.505693] __kasan_slab_free+0x5a/0x70 [ 44.505698] kfree+0xe8/0x3f0 [ 44.505704] __mmput+0x20/0x370 [ 44.505709] exit_mm+0x240/0x340 [ 44.505713] do_exit+0x548/0xd70 [ 44.505718] do_group_exit+0x132/0x390 [ 44.505722] __s390x_sys_exit_group+0x56/0x60 [ 44.505727] do_syscall+0x2f6/0x430 [ 44.505732] __do_syscall+0xa4/0x170 [ 44.505738] system_call+0x74/0x98 The problem is that uprobe_clear_state() kfree's struct xol_area, which contains struct vm_special_mapping *xol_mapping. This one is passed to _install_special_mapping() in xol_add_vma(). __mput reads: static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); ... } So uprobe_clear_state() in the beginning free's the memory area containing the vm_special_mapping data, but exit_mmap() uses this address later via vma->vm_private_data (which was set in _install_special_mapping(). Fix this by moving uprobe_clear_state() to uprobes.c and use it as close() callback. [usama.anjum@collabora.com: remove unneeded condition] Link: https://lkml.kernel.org/r/20240906101825.177490-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20240903073629.2442754-1-svens@linux.ibm.com Fixes: 223febc6e557 ("mm: add optional close() to struct vm_special_mapping") Signed-off-by: Sven Schnelle Suggested-by: Linus Torvalds Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Michael Ellerman Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/uprobes.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b503fafb7fb3..493dc95d912c 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -126,7 +126,6 @@ extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); extern bool uprobe_deny_signal(void); extern bool arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs); -extern void uprobe_clear_state(struct mm_struct *mm); extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); -- cgit v1.2.3 From 0e40cf2a8b2c847950e025d5aa594bd545118d26 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Thu, 5 Sep 2024 00:30:50 +0000 Subject: cgroup: clarify css sibling linkage is protected by cgroup_mutex or RCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Improve mem_cgroup_iter()", v4. Incremental cgroup iteration is being used again [1]. This patchset improves the reliability of mem_cgroup_iter(). It also improves simplicity and code readability. [1] https://lore.kernel.org/20240514202641.2821494-1-hannes@cmpxchg.org/ This patch (of 5): Explicitly document that css sibling/descendant linkage is protected by cgroup_mutex or RCU. Also, document in css_next_descendant_pre() and similar functions that it isn't necessary to hold a ref on @pos. The following changes in this patchset rely on this clarification for simplification in memcg iteration code. Link: https://lkml.kernel.org/r/20240905003058.1859929-1-kinseyho@google.com Link: https://lkml.kernel.org/r/20240905003058.1859929-2-kinseyho@google.com Suggested-by: Yosry Ahmed Reviewed-by: Michal Koutný Signed-off-by: Kinsey Ho Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Cc: Zefan Li Cc: Hugh Dickins Cc: T.J. Mercier Signed-off-by: Andrew Morton --- include/linux/cgroup-defs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7fc2d0195f56..ca7e912b8355 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -172,7 +172,11 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; - /* siblings list anchored at the parent's ->children */ + /* + * siblings list anchored at the parent's ->children + * + * linkage is protected by cgroup_mutex or RCU + */ struct list_head sibling; struct list_head children; -- cgit v1.2.3 From ec0db74b4b1f249ffca4df450f54c17573114045 Mon Sep 17 00:00:00 2001 From: Kinsey Ho Date: Thu, 5 Sep 2024 00:30:53 +0000 Subject: mm: restart if multiple traversals raced MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, if multiple reclaimers raced on the same position, the reclaimers which detect the race will still reclaim from the same memcg. Instead, the reclaimers which detect the race should move on to the next memcg in the hierarchy. So, in the case where multiple traversals race, jump back to the start of the mem_cgroup_iter() function to find the next memcg in the hierarchy to reclaim from. Link: https://lkml.kernel.org/r/20240905003058.1859929-5-kinseyho@google.com Reported-by: syzbot+e099d407346c45275ce9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/000000000000817cf10620e20d33@google.com/ Signed-off-by: Kinsey Ho Reviewed-by: T.J. Mercier Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tejun Heo Cc: Yosry Ahmed Cc: Zefan Li Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fe05fdb92779..2ef94c74847d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -57,7 +57,7 @@ enum memcg_memory_event { struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; - unsigned int generation; + int generation; }; #ifdef CONFIG_MEMCG @@ -78,7 +78,7 @@ struct lruvec_stats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; /* scan generation, increased every round-trip */ - unsigned int generation; + atomic_t generation; }; /* -- cgit v1.2.3 From eebc0f48468e68b93393d11f75ad17385a9acca8 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 5 Sep 2024 22:21:06 -0600 Subject: mm/codetag: fix a typo Link: https://lkml.kernel.org/r/20240906042108.1150526-1-yuzhao@google.com Fixes: 22d407b164ff ("lib: add allocation tagging support for memory allocation profiling") Signed-off-by: Yu Zhao Acked-by: Suren Baghdasaryan Acked-by: Muchun Song Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 8c61ccd161ba..896491d9ebe8 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -70,7 +70,7 @@ static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) /* * When percpu variables are required to be defined as weak, static percpu * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION). - * Instead we will accound all module allocations to a single counter. + * Instead we will account all module allocations to a single counter. */ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); -- cgit v1.2.3 From 95599ef684d01136a8b77c16a7c853496786e173 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 5 Sep 2024 22:21:07 -0600 Subject: mm/codetag: fix pgalloc_tag_split() The current assumption is that a large folio can only be split into order-0 folios. That is not the case for hugeTLB demotion, nor for THP split: see commit c010d47f107f ("mm: thp: split huge page to any lower order pages"). When a large folio is split into ones of a lower non-zero order, only the new head pages should be tagged. Tagging tail pages can cause imbalanced "calls" counters, since only head pages are untagged by pgalloc_tag_sub() and the "calls" counts on tail pages are leaked, e.g., # echo 2048kB >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size # echo 700 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages # time echo 700 >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote # echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # grep alloc_gigantic_folio /proc/allocinfo Before this patch: 0 549427200 mm/hugetlb.c:1549 func:alloc_gigantic_folio real 0m2.057s user 0m0.000s sys 0m2.051s After this patch: 0 0 mm/hugetlb.c:1549 func:alloc_gigantic_folio real 0m1.711s user 0m0.000s sys 0m1.704s Not tagging tail pages also improves the splitting time, e.g., by about 15% when demoting 1GB hugeTLB folios to 2MB ones, as shown above. Link: https://lkml.kernel.org/r/20240906042108.1150526-2-yuzhao@google.com Fixes: be25d1d4e822 ("mm: create new codetag references during page splitting") Signed-off-by: Yu Zhao Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 30 ++++++++++++++++++++++++++++++ include/linux/pgalloc_tag.h | 31 ------------------------------- 2 files changed, 30 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b0ff06d18c71..6bb778cbaabf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4084,4 +4084,34 @@ void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); +#ifdef CONFIG_MEM_ALLOC_PROFILING +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ + int i; + struct alloc_tag *tag; + unsigned int nr_pages = 1 << new_order; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = pgalloc_tag_get(&folio->page); + if (!tag) + return; + + for (i = nr_pages; i < (1 << old_order); i += nr_pages) { + union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i)); + + if (ref) { + /* Set new reference to point to the original tag */ + alloc_tag_ref_set(ref, tag); + put_page_tag_ref(ref); + } + } +} +#else /* !CONFIG_MEM_ALLOC_PROFILING */ +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ +} +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + #endif /* _LINUX_MM_H */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 207f0c83c8e9..59a3deb792a8 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -80,36 +80,6 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) } } -static inline void pgalloc_tag_split(struct page *page, unsigned int nr) -{ - int i; - struct page_ext *first_page_ext; - struct page_ext *page_ext; - union codetag_ref *ref; - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - first_page_ext = page_ext = page_ext_get(page); - if (unlikely(!page_ext)) - return; - - ref = codetag_ref_from_page_ext(page_ext); - if (!ref->ct) - goto out; - - tag = ct_to_alloc_tag(ref->ct); - page_ext = page_ext_next(page_ext); - for (i = 1; i < nr; i++) { - /* Set new reference to point to the original tag */ - alloc_tag_ref_set(codetag_ref_from_page_ext(page_ext), tag); - page_ext = page_ext_next(page_ext); - } -out: - page_ext_put(first_page_ext); -} - static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { struct alloc_tag *tag = NULL; @@ -142,7 +112,6 @@ static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} -- cgit v1.2.3 From e0a955bf7f61cb034d228736d81c1ab3a47a3dca Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 5 Sep 2024 22:21:08 -0600 Subject: mm/codetag: add pgalloc_tag_copy() Add pgalloc_tag_copy() to transfer the codetag from the old folio to the new one during migration. This makes original allocation sites persist cross migration rather than lump into the get_new_folio callbacks passed into migrate_pages(), e.g., compaction_alloc(): # echo 1 >/proc/sys/vm/compact_memory # grep compaction_alloc /proc/allocinfo Before this patch: 132968448 32463 mm/compaction.c:1880 func:compaction_alloc After this patch: 0 0 mm/compaction.c:1880 func:compaction_alloc Link: https://lkml.kernel.org/r/20240906042108.1150526-3-yuzhao@google.com Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging") Signed-off-by: Yu Zhao Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 24 ++++++++++-------------- include/linux/mm.h | 27 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 896491d9ebe8..1f0a9ff23a2c 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -137,7 +137,16 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {} /* Caller should verify both ref and tag to be valid */ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { + alloc_tag_add_check(ref, tag); + if (!ref || !tag) + return; + ref->ct = &tag->ct; +} + +static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +{ + __alloc_tag_ref_set(ref, tag); /* * We need in increment the call counter every time we have a new * allocation or when we split a large allocation into smaller ones. @@ -147,22 +156,9 @@ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag this_cpu_inc(tag->counters->calls); } -static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) -{ - alloc_tag_add_check(ref, tag); - if (!ref || !tag) - return; - - __alloc_tag_ref_set(ref, tag); -} - static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) { - alloc_tag_add_check(ref, tag); - if (!ref || !tag) - return; - - __alloc_tag_ref_set(ref, tag); + alloc_tag_ref_set(ref, tag); this_cpu_add(tag->counters->bytes, bytes); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 6bb778cbaabf..39f6faace590 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4108,10 +4108,37 @@ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new } } } + +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ + struct alloc_tag *tag; + union codetag_ref *ref; + + tag = pgalloc_tag_get(&old->page); + if (!tag) + return; + + ref = get_page_tag_ref(&new->page); + if (!ref) + return; + + /* Clear the old ref to the original allocation tag. */ + clear_page_tag_ref(&old->page); + /* Decrement the counters of the tag on get_new_folio. */ + alloc_tag_sub(ref, folio_nr_pages(new)); + + __alloc_tag_ref_set(ref, tag); + + put_page_tag_ref(ref); +} #else /* !CONFIG_MEM_ALLOC_PROFILING */ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) { } + +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ +} #endif /* CONFIG_MEM_ALLOC_PROFILING */ #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From 6462dd53a26088a433f90a5c15822196f201037c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Sep 2024 13:42:47 -1000 Subject: sched_ext: Compact struct bpf_iter_scx_dsq_kern struct scx_iter_scx_dsq is defined as 6 u64's and scx_dsq_iter_kern was using 5 of them. We want to add two more u64 fields but it's better if we do so while staying within scx_iter_scx_dsq to maintain binary compatibility. The way scx_iter_scx_dsq_kern is laid out is rather inefficient - the node field takes up three u64's but only one bit of the last u64 is used. Turn the bool into u32 flags and only use the lower 16 bits freeing up 48 bits - 16 bits for flags, 32 bits for a u32 - for use by struct bpf_iter_scx_dsq_kern. This allows moving the dsq_seq and flags fields of bpf_iter_scx_dsq_kern into the cursor field reducing the struct size by a full u64. No behavior changes intended. Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index dc646cca4fcf..1ddbde64a31b 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -119,9 +119,17 @@ enum scx_kf_mask { __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, }; +enum scx_dsq_lnode_flags { + SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0, + + /* high 16 bits can be for iter cursor flags */ + __SCX_DSQ_LNODE_PRIV_SHIFT = 16, +}; + struct scx_dsq_list_node { struct list_head node; - bool is_bpf_iter_cursor; + u32 flags; + u32 priv; /* can be used by iter cursor */ }; /* -- cgit v1.2.3 From 17245a195df4c86b1fd38718d8cdc532c040c08e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 6 Sep 2024 09:10:59 -0700 Subject: net: remove dev_pick_tx_cpu_id() dev_pick_tx_cpu_id() has been introduced with two users by commit a4ea8a3dacc3 ("net: Add generic ndo_select_queue functions"). The use in AF_PACKET has been removed in 2019 by commit b71b5837f871 ("packet: rework packet_pick_tx_queue() to use common code selection") The other user was a Netlogic XLP driver, removed in 2021 by commit 47ac6f567c28 ("staging: Remove Netlogic XLP network driver"). It's relatively unlikely that any modern driver will need an .ndo_select_queue implementation which picks purely based on CPU ID and skips XPS, delete dev_pick_tx_cpu_id() Found by code inspection. Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240906161059.715546-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b47c00657bd0..2e40a137dc12 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3085,8 +3085,6 @@ void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); -u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev); int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); -- cgit v1.2.3 From 2f87e9cf0c9e21ab9be1fb2ba8520a1525359497 Mon Sep 17 00:00:00 2001 From: Cindy Lu Date: Wed, 31 Jul 2024 11:16:01 +0800 Subject: vdpa: support set mac address from vdpa tool Add new UAPI to support the mac address from vdpa tool Function vdpa_nl_cmd_dev_attr_set_doit() will get the new MAC address from the vdpa tool and then set it to the device. The usage is: vdpa dev set name vdpa_name mac **:**:**:**:**:** Here is example: root@L1# vdpa -jp dev config show vdpa0 { "config": { "vdpa0": { "mac": "82:4d:e9:5d:d7:e6", "link ": "up", "link_announce ": false, "mtu": 1500 } } } root@L1# vdpa dev set name vdpa0 mac 00:11:22:33:44:55 root@L1# vdpa -jp dev config show vdpa0 { "config": { "vdpa0": { "mac": "00:11:22:33:44:55", "link ": "up", "link_announce ": false, "mtu": 1500 } } } Signed-off-by: Cindy Lu Message-Id: <20240731031653.1047692-2-lulu@redhat.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- include/linux/vdpa.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 7977ca03ac7a..2e7a30fe6b92 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -582,11 +582,20 @@ void vdpa_set_status(struct vdpa_device *vdev, u8 status); * @dev: vdpa device to remove * Driver need to remove the specified device by calling * _vdpa_unregister_device(). + * @dev_set_attr: change a vdpa device's attr after it was create + * @mdev: parent device to use for device + * @dev: vdpa device structure + * @config:Attributes to be set for the device. + * The driver needs to check the mask of the structure and then set + * the related information to the vdpa device. The driver must return 0 + * if set successfully. */ struct vdpa_mgmtdev_ops { int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name, const struct vdpa_dev_set_config *config); void (*dev_del)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev); + int (*dev_set_attr)(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev, + const struct vdpa_dev_set_config *config); }; /** -- cgit v1.2.3 From 11596dc3dfae572cc267dda2dc4dab9ae34668f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Sep 2024 07:39:05 +0300 Subject: iomap: pass flags to iomap_file_buffered_write_punch_delalloc To fix short write error handling, We'll need to figure out what operation iomap_file_buffered_write_punch_delalloc is called for. Pass the flags argument on to it, and reorder the argument list to match that of ->iomap_end so that the compiler only has to add the new punch argument to the end of it instead of reshuffling the registers. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240910043949.3481298-4-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index f792b37f7627..52626906ff35 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -258,10 +258,6 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, void *private); -int iomap_file_buffered_write_punch_delalloc(struct inode *inode, - struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, - int (*punch)(struct inode *inode, loff_t pos, loff_t length)); - int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); @@ -277,6 +273,12 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, const struct iomap_ops *ops); vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); + +typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); +int iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos, + loff_t length, ssize_t written, unsigned flag, + struct iomap *iomap, iomap_punch_t punch); + int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, -- cgit v1.2.3 From 492f53758fad4fde3e9a98696780f8b53f87cdae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Sep 2024 07:39:06 +0300 Subject: iomap: pass the iomap to the punch callback XFS will need to look at the flags in the iomap structure, so pass it down all the way to the callback. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240910043949.3481298-5-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 52626906ff35..ec2eab94f00a 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -274,7 +274,8 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); -typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); +typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, + struct iomap *iomap); int iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned flag, struct iomap *iomap, iomap_punch_t punch); -- cgit v1.2.3 From 4bceb9ba05aca23652567fb5f899cc7fcb12c8d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Sep 2024 07:39:07 +0300 Subject: iomap: remove the iomap_file_buffered_write_punch_delalloc return value iomap_file_buffered_write_punch_delalloc can only return errors if either the ->punch callback returned an error, or if someone changed the API of mapping_seek_hole_data to return a negative error code that is not -ENXIO. As the only instance of ->punch never returns an error, an such an error would be fatal anyway remove the entire error propagation and don't return an error code from iomap_file_buffered_write_punch_delalloc. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240910043949.3481298-6-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index ec2eab94f00a..4ad12a3c8bae 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -274,9 +274,9 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); -typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, +typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); -int iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos, +void iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned flag, struct iomap *iomap, iomap_punch_t punch); -- cgit v1.2.3 From 9028cdeb38e1f37d63cb3154799dd259b67e879e Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 5 Sep 2024 10:34:22 -0700 Subject: memcg: add charging of already allocated slab objects At the moment, the slab objects are charged to the memcg at the allocation time. However there are cases where slab objects are allocated at the time where the right target memcg to charge it to is not known. One such case is the network sockets for the incoming connection which are allocated in the softirq context. Couple hundred thousand connections are very normal on large loaded server and almost all of those sockets underlying those connections get allocated in the softirq context and thus not charged to any memcg. However later at the accept() time we know the right target memcg to charge. Let's add new API to charge already allocated objects, so we can have better accounting of the memory usage. To measure the performance impact of this change, tcp_crr is used from the neper [1] performance suite. Basically it is a network ping pong test with new connection for each ping pong. The server and the client are run inside 3 level of cgroup hierarchy using the following commands: Server: $ tcp_crr -6 Client: $ tcp_crr -6 -c -H ${server_ip} If the client and server run on different machines with 50 GBPS NIC, there is no visible impact of the change. For the same machine experiment with v6.11-rc5 as base. base (throughput) with-patch tcp_crr 14545 (+- 80) 14463 (+- 56) It seems like the performance impact is within the noise. Link: https://github.com/google/neper [1] Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Reviewed-by: Yosry Ahmed Acked-by: Paolo Abeni # net Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index eb2bf4629157..3be2a5ed4936 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -547,6 +547,35 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) __assume_slab_alignment __malloc; #define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__)) +/** + * kmem_cache_charge - memcg charge an already allocated slab memory + * @objp: address of the slab object to memcg charge + * @gfpflags: describe the allocation context + * + * kmem_cache_charge allows charging a slab object to the current memcg, + * primarily in cases where charging at allocation time might not be possible + * because the target memcg is not known (i.e. softirq context) + * + * The objp should be pointer returned by the slab allocator functions like + * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge + * behavior can be controlled through gfpflags parameter, which affects how the + * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes + * that overcharging is requested instead of failure, but is not applied for the + * internal metadata allocation. + * + * There are several cases where it will return true even if the charging was + * not done: + * More specifically: + * + * 1. For !CONFIG_MEMCG or cgroup_disable=memory systems. + * 2. Already charged slab objects. + * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc() + * without __GFP_ACCOUNT + * 4. Allocating internal metadata has failed + * + * Return: true if charge was successful otherwise false. + */ +bool kmem_cache_charge(void *objp, gfp_t gfpflags); void kmem_cache_free(struct kmem_cache *s, void *objp); kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, -- cgit v1.2.3 From 879fb3c274c12cb0892718e1e54f85fc406e3c7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:45 +0200 Subject: slab: add struct kmem_cache_args Currently we have multiple kmem_cache_create*() variants that take up to seven separate parameters with one of the functions having to grow an eigth parameter in the future to handle both usercopy and a custom freelist pointer. Add a struct kmem_cache_args structure and move less common parameters into it. Core parameters such as name, object size, and flags continue to be passed separately. Add a new function __kmem_cache_create_args() that takes a struct kmem_cache_args pointer and port do_kmem_cache_create_usercopy() over to it. In follow-up patches we will port the other kmem_cache_create*() variants over to it as well. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 5b2da2cf31a8..2b8eeca7fd2c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -240,6 +240,28 @@ struct mem_cgroup; */ bool slab_is_available(void); +/** + * struct kmem_cache_args - Less common arguments for kmem_cache_create() + * @align: The required alignment for the objects. + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @freeptr_offset: Custom offset for the free pointer in RCU caches + * @use_freeptr_offset: Whether a @freeptr_offset is used + * @ctor: A constructor for the objects. + */ +struct kmem_cache_args { + unsigned int align; + unsigned int useroffset; + unsigned int usersize; + unsigned int freeptr_offset; + bool use_freeptr_offset; + void (*ctor)(void *); +}; + +struct kmem_cache *__kmem_cache_create_args(const char *name, + unsigned int object_size, + struct kmem_cache_args *args, + slab_flags_t flags); struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -- cgit v1.2.3 From 052d67b46bcd91b6785e8e6e047241814f6142a3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:53 +0200 Subject: slab: port KMEM_CACHE() to struct kmem_cache_args Make KMEM_CACHE() use struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 2b8eeca7fd2c..1f38d94387cc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -284,9 +284,11 @@ int kmem_cache_shrink(struct kmem_cache *s); * f.e. add ____cacheline_aligned_in_smp to the struct declaration * then the objects will be properly aligned in SMP configurations. */ -#define KMEM_CACHE(__struct, __flags) \ - kmem_cache_create(#__struct, sizeof(struct __struct), \ - __alignof__(struct __struct), (__flags), NULL) +#define KMEM_CACHE(__struct, __flags) \ + __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ + &(struct kmem_cache_args) { \ + .align = __alignof__(struct __struct), \ + }, (__flags)) /* * To whitelist a single field for copying to/from usercopy, use this -- cgit v1.2.3 From 199cd13a745eb44fb4828bca373155293cdcfa5c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:54 +0200 Subject: slab: port KMEM_CACHE_USERCOPY() to struct kmem_cache_args Make KMEM_CACHE_USERCOPY() use struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 1f38d94387cc..7e15a3a3edb1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -294,12 +294,13 @@ int kmem_cache_shrink(struct kmem_cache *s); * To whitelist a single field for copying to/from usercopy, use this * macro instead for KMEM_CACHE() above. */ -#define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ - kmem_cache_create_usercopy(#__struct, \ - sizeof(struct __struct), \ - __alignof__(struct __struct), (__flags), \ - offsetof(struct __struct, __field), \ - sizeof_field(struct __struct, __field), NULL) +#define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ + __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ + &(struct kmem_cache_args) { \ + .align = __alignof__(struct __struct), \ + .useroffset = offsetof(struct __struct, __field), \ + .usersize = sizeof_field(struct __struct, __field), \ + }, (__flags)) /* * Common kmalloc functions provided by all allocators -- cgit v1.2.3 From b2e7456b5c25c41eda7a8a15f7ccaa4e7579949f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:55 +0200 Subject: slab: create kmem_cache_create() compatibility layer Use _Generic() to create a compatibility layer that type switches on the third argument to either call __kmem_cache_create() or __kmem_cache_create_args(). If NULL is passed for the struct kmem_cache_args argument use default args making porting for callers that don't care about additional arguments easy. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 7e15a3a3edb1..15eb0a0defd6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -262,9 +262,10 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, unsigned int object_size, struct kmem_cache_args *args, slab_flags_t flags); -struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - void (*ctor)(void *)); + +struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + void (*ctor)(void *)); struct kmem_cache *kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, @@ -273,6 +274,28 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, unsigned int freeptr_offset, slab_flags_t flags); + +/* If NULL is passed for @args, use this variant with default arguments. */ +static inline struct kmem_cache * +__kmem_cache_default_args(const char *name, unsigned int size, + struct kmem_cache_args *args, + slab_flags_t flags) +{ + struct kmem_cache_args kmem_default_args = {}; + + /* Make sure we don't get passed garbage. */ + if (WARN_ON_ONCE(args)) + return ERR_PTR(-EINVAL); + + return __kmem_cache_create_args(name, size, &kmem_default_args, flags); +} + +#define kmem_cache_create(__name, __object_size, __args, ...) \ + _Generic((__args), \ + struct kmem_cache_args *: __kmem_cache_create_args, \ + void *: __kmem_cache_default_args, \ + default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__) + void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); -- cgit v1.2.3 From 3d453e60f1a9c377d670d5fe306d3b9eed477bc0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:57 +0200 Subject: slab: remove kmem_cache_create_rcu() Now that we have ported all users of kmem_cache_create_rcu() to struct kmem_cache_args the function is unused and can be removed. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 15eb0a0defd6..cb82a6414a28 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -271,9 +271,6 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); -struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, - unsigned int freeptr_offset, - slab_flags_t flags); /* If NULL is passed for @args, use this variant with default arguments. */ static inline struct kmem_cache * -- cgit v1.2.3 From 0c9050b09cfb1ddb3fe4b2d401dca1fb674c825a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:58 +0200 Subject: slab: make kmem_cache_create_usercopy() static inline Make kmem_cache_create_usercopy() a static inline function. Signed-off-by: Christian Brauner Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 49 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index cb82a6414a28..634717c9f73b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -266,11 +266,50 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -struct kmem_cache *kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int align, - slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)); + +/** + * kmem_cache_create_usercopy - Create a cache with a region suitable + * for copying to userspace + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @ctor: A constructor for the objects. + * + * Cannot be called within a interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +static inline struct kmem_cache * +kmem_cache_create_usercopy(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) +{ + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + .useroffset = useroffset, + .usersize = usersize, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); +} /* If NULL is passed for @args, use this variant with default arguments. */ static inline struct kmem_cache * -- cgit v1.2.3 From 781aee755638dbb6dbfe022bdd2f732621ec9534 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:59 +0200 Subject: slab: make __kmem_cache_create() static inline Make __kmem_cache_create() a static inline function. Signed-off-by: Christian Brauner Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 634717c9f73b..331412a9f4f2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -262,10 +262,17 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, unsigned int object_size, struct kmem_cache_args *args, slab_flags_t flags); +static inline struct kmem_cache * +__kmem_cache_create(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + }; -struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - void (*ctor)(void *)); + return __kmem_cache_create_args(name, size, &kmem_args, flags); +} /** * kmem_cache_create_usercopy - Create a cache with a region suitable -- cgit v1.2.3 From 4ba4f1afb6a9fed8ef896c2363076e36572f71da Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:37 -0700 Subject: perf: Generic hotplug support for a PMU with a scope The perf subsystem assumes that the counters of a PMU are per-CPU. So the user space tool reads a counter from each CPU in the system wide mode. However, many PMUs don't have a per-CPU counter. The counter is effective for a scope, e.g., a die or a socket. To address this, a cpumask is exposed by the kernel driver to restrict to one CPU to stand for a specific scope. In case the given CPU is removed, the hotplug support has to be implemented for each such driver. The codes to support the cpumask and hotplug are very similar. - Expose a cpumask into sysfs - Pickup another CPU in the same scope if the given CPU is removed. - Invoke the perf_pmu_migrate_context() to migrate to a new CPU. - In event init, always set the CPU in the cpumask to event->cpu Similar duplicated codes are implemented for each such PMU driver. It would be good to introduce a generic infrastructure to avoid such duplication. 5 popular scopes are implemented here, core, die, cluster, pkg, and the system-wide. The scope can be set when a PMU is registered. If so, a "cpumask" is automatically exposed for the PMU. The "cpumask" is from the perf_online__mask, which is to track the active CPU for each scope. They are set when the first CPU of the scope is online via the generic perf hotplug support. When a corresponding CPU is removed, the perf_online__mask is updated accordingly and the PMU will be moved to a new CPU from the same scope if possible. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240802151643.1691631-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 701549967c18..a3cbcd727366 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -295,6 +295,19 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 +/** + * pmu::scope + */ +enum perf_pmu_scope { + PERF_PMU_SCOPE_NONE = 0, + PERF_PMU_SCOPE_CORE, + PERF_PMU_SCOPE_DIE, + PERF_PMU_SCOPE_CLUSTER, + PERF_PMU_SCOPE_PKG, + PERF_PMU_SCOPE_SYS_WIDE, + PERF_PMU_MAX_SCOPE, +}; + struct perf_output_handle; #define PMU_NULL_DEV ((void *)(~0UL)) @@ -318,6 +331,11 @@ struct pmu { */ int capabilities; + /* + * PMU scope + */ + unsigned int scope; + int __percpu *pmu_disable_count; struct perf_cpu_pmu_context __percpu *cpu_pmu_context; atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ -- cgit v1.2.3 From a48a36b316ae5d3ab83f9b545dba15998e96d59c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:38 -0700 Subject: perf: Add PERF_EV_CAP_READ_SCOPE Usually, an event can be read from any CPU of the scope. It doesn't need to be read from the advertised CPU. Add a new event cap, PERF_EV_CAP_READ_SCOPE. An event of a PMU with scope can be read from any active CPU in the scope. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240802151643.1691631-3-kan.liang@linux.intel.com --- include/linux/perf_event.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a3cbcd727366..794f66057878 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -636,10 +636,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *, * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and * cannot be a group leader. If an event with this flag is detached from the * group it is scheduled out and moved into an unrecoverable ERROR state. + * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the + * PMU scope where it is active. */ #define PERF_EV_CAP_SOFTWARE BIT(0) #define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1) #define PERF_EV_CAP_SIBLING BIT(2) +#define PERF_EV_CAP_READ_SCOPE BIT(3) #define SWEVENT_HLIST_BITS 8 #define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS) -- cgit v1.2.3 From 08155c7f2a2cc6ff99886ea5743da274be24ebe4 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Aug 2024 08:16:39 -0700 Subject: perf/x86/intel/cstate: Clean up cpumask and hotplug There are three cstate PMUs with different scopes, core, die and module. The scopes are supported by the generic perf_event subsystem now. Set the scope for each PMU and remove all the cpumask and hotplug codes. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240802151643.1691631-4-kan.liang@linux.intel.com --- include/linux/cpuhotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 9316c39260e0..2101ae2ecfca 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -152,7 +152,6 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, - CPUHP_AP_PERF_X86_CSTATE_STARTING, CPUHP_AP_PERF_XTENSA_STARTING, CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, @@ -209,7 +208,6 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, CPUHP_AP_PERF_X86_RAPL_ONLINE, - CPUHP_AP_PERF_X86_CSTATE_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, -- cgit v1.2.3 From 740c1c84bfa3d8c63bd3b01fb570e7452f51fbd8 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 10 Sep 2024 10:26:17 +0800 Subject: spi: remove spi_controller_is_slave() and spi_slave_abort() spi_controller_is_slave() and spi_slave_abort() are all replaced, so they can be removed. No functional changed. Signed-off-by: Yang Yingliang Link: https://patch.msgid.link/20240910022618.1397-8-yangyingliang@huaweicloud.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index d47d5f14ff99..4b95663163e0 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -498,7 +498,6 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch * controller has native support for memory like operations. * @mem_caps: controller capabilities for the handling of memory operations. * @unprepare_message: undo any work done by prepare_message(). - * @slave_abort: abort the ongoing transfer request on an SPI slave controller * @target_abort: abort the ongoing transfer request on an SPI target controller * @cs_gpiods: Array of GPIO descriptors to use as chip select lines; one per CS * number. Any individual value may be NULL for CS lines that @@ -725,10 +724,7 @@ struct spi_controller { struct spi_message *message); int (*unprepare_message)(struct spi_controller *ctlr, struct spi_message *message); - union { - int (*slave_abort)(struct spi_controller *ctlr); - int (*target_abort)(struct spi_controller *ctlr); - }; + int (*target_abort)(struct spi_controller *ctlr); /* * These hooks are for drivers that use a generic implementation @@ -802,11 +798,6 @@ static inline void spi_controller_put(struct spi_controller *ctlr) put_device(&ctlr->dev); } -static inline bool spi_controller_is_slave(struct spi_controller *ctlr) -{ - return IS_ENABLED(CONFIG_SPI_SLAVE) && ctlr->slave; -} - static inline bool spi_controller_is_target(struct spi_controller *ctlr) { return IS_ENABLED(CONFIG_SPI_SLAVE) && ctlr->target; @@ -1296,7 +1287,6 @@ extern int devm_spi_optimize_message(struct device *dev, struct spi_device *spi, extern int spi_setup(struct spi_device *spi); extern int spi_async(struct spi_device *spi, struct spi_message *message); -extern int spi_slave_abort(struct spi_device *spi); extern int spi_target_abort(struct spi_device *spi); static inline size_t -- cgit v1.2.3 From d6de45e3c6f3713d3825d3e2860c11d24e0f941f Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Tue, 10 Sep 2024 00:35:03 +0200 Subject: platform/x86: asus-wmi: Disable OOBE experience on Zenbook S 16 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OOBE experience fades the keyboard backlight in & out continuously, and make the backlight uncontrollable using its device. Workaround taken from https://wiki.archlinux.org/index.php?title=ASUS_Zenbook_UM5606&diff=next&oldid=815547 Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Luke D. Jones Link: https://lore.kernel.org/r/20240909223503.1445779-1-bas@basnieuwenhuizen.nl Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 0aeeae1c1943..ae9bf7479e7b 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -62,6 +62,7 @@ #define ASUS_WMI_DEVID_KBD_BACKLIGHT 0x00050021 #define ASUS_WMI_DEVID_LIGHT_SENSOR 0x00050022 /* ?? */ #define ASUS_WMI_DEVID_LIGHTBAR 0x00050025 +#define ASUS_WMI_DEVID_OOBE 0x0005002F /* This can only be used to disable the screen, not re-enable */ #define ASUS_WMI_DEVID_SCREENPAD_POWER 0x00050031 /* Writing a brightness re-enables the screen if disabled */ -- cgit v1.2.3 From 070a5e6295e88a2b75cbfef04ff8b4ec05775e6d Mon Sep 17 00:00:00 2001 From: Furong Xu <0x1207@gmail.com> Date: Fri, 6 Sep 2024 22:30:06 +0800 Subject: net: stmmac: move stmmac_fpe_cfg to stmmac_priv data By moving the fpe_cfg field to the stmmac_priv data, stmmac_fpe_cfg becomes platform-data eventually, instead of a run-time config. Suggested-by: Serge Semin Signed-off-by: Furong Xu <0x1207@gmail.com> Reviewed-by: Vladimir Oltean Reviewed-by: Serge Semin Link: https://patch.msgid.link/d9b3d7ecb308c5e39778a4c8ae9df288a2754379.1725631883.git.0x1207@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 338991c08f00..d79ff252cfdc 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -138,33 +138,6 @@ struct stmmac_txq_cfg { int tbs_en; }; -/* FPE link state */ -enum stmmac_fpe_state { - FPE_STATE_OFF = 0, - FPE_STATE_CAPABLE = 1, - FPE_STATE_ENTERING_ON = 2, - FPE_STATE_ON = 3, -}; - -/* FPE link-partner hand-shaking mPacket type */ -enum stmmac_mpacket_type { - MPACKET_VERIFY = 0, - MPACKET_RESPONSE = 1, -}; - -enum stmmac_fpe_task_state_t { - __FPE_REMOVING, - __FPE_TASK_SCHED, -}; - -struct stmmac_fpe_cfg { - bool enable; /* FPE enable */ - bool hs_enable; /* FPE handshake enable */ - enum stmmac_fpe_state lp_fpe_state; /* Link Partner FPE state */ - enum stmmac_fpe_state lo_fpe_state; /* Local station FPE state */ - u32 fpe_csr; /* MAC_FPE_CTRL_STS reg cache */ -}; - struct stmmac_safety_feature_cfg { u32 tsoee; u32 mrxpee; @@ -232,7 +205,6 @@ struct plat_stmmacenet_data { struct fwnode_handle *port_node; struct device_node *mdio_node; struct stmmac_dma_cfg *dma_cfg; - struct stmmac_fpe_cfg *fpe_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; int clk_csr; int has_gmac; -- cgit v1.2.3 From d591f6804e7e1310881c9224d72247a2b65039af Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Aug 2024 18:48:46 -0500 Subject: PCI: Wait for device readiness with Configuration RRS After a device reset, delays are required before the device can successfully complete config accesses. PCIe r6.0, sec 6.6, specifies some delays required before software can perform config accesses. Devices that require more time after those delays may respond to config accesses with Configuration Request Retry Status (RRS) completions. Callers of pci_dev_wait() are responsible for delays until the device can respond to config accesses. pci_dev_wait() waits any additional time until the device can successfully complete config accesses. Reading config space of devices that are not present or not ready typically returns ~0 (PCI_ERROR_RESPONSE). Previously we polled the Command register until we got a value other than ~0. This is sometimes a problem because Root Complex handling of RRS completions may include several retries and implementation-specific behavior that is invisible to software (see sec 2.3.2), so the exponential backoff in pci_dev_wait() may not work as intended. Linux enables Configuration RRS Software Visibility on all Root Ports that support it. If it is enabled, read the Vendor ID instead of the Command register. RRS completions cause immediate return of the 0x0001 reserved Vendor ID value, so the pci_dev_wait() backoff works correctly. When a read of Vendor ID eventually completes successfully by returning a non-0x0001 value (the Vendor ID or 0xffff for VFs), the device should be initialized and ready to respond to config requests. For conventional PCI devices or devices below Root Ports that don't support Configuration RRS Software Visibility, poll the Command register as before. This was developed independently, but is very similar to Stanislav Spassov's previous work at https://lore.kernel.org/linux-pci/20200223122057.6504-1-stanspas@amazon.com Link: https://lore.kernel.org/r/20240827234848.4429-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Tested-by: Duc Dang --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4cf89a4b4cbc..121d8d94d6d0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -371,6 +371,7 @@ struct pci_dev { can be generated */ unsigned int pme_poll:1; /* Poll device's PME status bit */ unsigned int pinned:1; /* Whether this dev is pinned */ + unsigned int config_crs_sv:1; /* Config CRS software visibility */ unsigned int imm_ready:1; /* Supports Immediate Readiness */ unsigned int d1_support:1; /* Low power state D1 is supported */ unsigned int d2_support:1; /* Low power state D2 is supported */ -- cgit v1.2.3 From 87f10faf166a9114aa0d4132298cad379de16fdd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 27 Aug 2024 18:48:48 -0500 Subject: PCI: Rename CRS Completion Status to RRS PCIe r6.0 changed the abbreviation for "Configuration Request Retry Status" Completion Status from "CRS" to "RRS" and uses the terminology of "Configuration RRS Software Visibility" instead of "CRS Software Visibility". Align the Linux usage with the r6.0 spec language. No functional change intended. It's confusing to make this change, but I think "RRS" *is* a better abbreviation because it was easy to interpret "CRS" as "Completion Retry Status", which really didn't make any sense. Link: https://lore.kernel.org/r/20240827234848.4429-4-helgaas@kernel.org Signed-off-by: Bjorn Helgaas --- include/linux/bcma/bcma_driver_pci.h | 2 +- include/linux/pci.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 68da8dba5162..dba41b65ae0d 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -203,7 +203,7 @@ struct pci_dev; #define BCMA_CORE_PCI_MDIO_RXCTRL0 0x840 /* PCIE Root Capability Register bits (Host mode only) */ -#define BCMA_CORE_PCI_RC_CRS_VISIBILITY 0x0001 +#define BCMA_CORE_PCI_RC_RRS_VISIBILITY 0x0001 struct bcma_drv_pci; struct bcma_bus; diff --git a/include/linux/pci.h b/include/linux/pci.h index 121d8d94d6d0..27d8ce977674 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -371,7 +371,7 @@ struct pci_dev { can be generated */ unsigned int pme_poll:1; /* Poll device's PME status bit */ unsigned int pinned:1; /* Whether this dev is pinned */ - unsigned int config_crs_sv:1; /* Config CRS software visibility */ + unsigned int config_rrs_sv:1; /* Config RRS software visibility */ unsigned int imm_ready:1; /* Supports Immediate Readiness */ unsigned int d1_support:1; /* Low power state D1 is supported */ unsigned int d2_support:1; /* Low power state D2 is supported */ -- cgit v1.2.3 From 884ee6dc85b959bc152f15bca80c30f06069e6c4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 6 Sep 2024 14:27:24 +0800 Subject: f2fs: get rid of online repaire on corrupted directory syzbot reports a f2fs bug as below: kernel BUG at fs/f2fs/inode.c:896! RIP: 0010:f2fs_evict_inode+0x1598/0x15c0 fs/f2fs/inode.c:896 Call Trace: evict+0x532/0x950 fs/inode.c:704 dispose_list fs/inode.c:747 [inline] evict_inodes+0x5f9/0x690 fs/inode.c:797 generic_shutdown_super+0x9d/0x2d0 fs/super.c:627 kill_block_super+0x44/0x90 fs/super.c:1696 kill_f2fs_super+0x344/0x690 fs/f2fs/super.c:4898 deactivate_locked_super+0xc4/0x130 fs/super.c:473 cleanup_mnt+0x41f/0x4b0 fs/namespace.c:1373 task_work_run+0x24f/0x310 kernel/task_work.c:228 ptrace_notify+0x2d2/0x380 kernel/signal.c:2402 ptrace_report_syscall include/linux/ptrace.h:415 [inline] ptrace_report_syscall_exit include/linux/ptrace.h:477 [inline] syscall_exit_work+0xc6/0x190 kernel/entry/common.c:173 syscall_exit_to_user_mode_prepare kernel/entry/common.c:200 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:205 [inline] syscall_exit_to_user_mode+0x279/0x370 kernel/entry/common.c:218 do_syscall_64+0x100/0x230 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0010:f2fs_evict_inode+0x1598/0x15c0 fs/f2fs/inode.c:896 Online repaire on corrupted directory in f2fs_lookup() can generate dirty data/meta while racing w/ readonly remount, it may leave dirty inode after filesystem becomes readonly, however, checkpoint() will skips flushing dirty inode in a state of readonly mode, result in above panic. Let's get rid of online repaire in f2fs_lookup(), and leave the work to fsck.f2fs. Fixes: 510022a85839 ("f2fs: add F2FS_INLINE_DOTS to recover missing dot dentries") Reported-by: syzbot+ebea2790904673d7c618@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/000000000000a7b20f061ff2d56a@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index f17f89a259e2..b0b821edfd97 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -278,7 +278,7 @@ struct node_footer { #define F2FS_INLINE_DATA 0x02 /* file inline data flag */ #define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */ #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ -#define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ +#define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries (obsolete) */ #define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ #define F2FS_PIN_FILE 0x40 /* file should not be gced */ #define F2FS_COMPRESS_RELEASED 0x80 /* file released compressed blocks */ -- cgit v1.2.3 From 540c830212edc908361c39d40df21772a8bda308 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 9 Sep 2024 20:30:18 +0000 Subject: firmware: imx: remove duplicate scmi_imx_misc_ctrl_get() These two functions have a stub definition when CONFIG_IMX_SCMI_MISC_EXT is not set, which conflict with the global definition: In file included from drivers/firmware/imx/sm-misc.c:6: include/linux/firmware/imx/sm.h:30:1: error: expected identifier or '(' before '{' token 30 | { | ^ drivers/firmware/imx/sm-misc.c:26:5: error: redefinition of 'scmi_imx_misc_ctrl_get' 26 | int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val) | ^~~~~~~~~~~~~~~~~~~~~~ include/linux/firmware/imx/sm.h:24:19: note: previous definition of 'scmi_imx_misc_ctrl_get' with type 'int(u32, u32 *, u32 *)' {aka 'int(unsigned int, unsigned int *, unsigned int *)'} 24 | static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val) | ^~~~~~~~~~~~~~~~~~~~~~ There is no real need for the #ifdef, and removing this avoids the build failure. Fixes: 0b4f8a68b292 ("firmware: imx: Add i.MX95 MISC driver") Link: https://lore.kernel.org/r/20240909203023.1275232-1-arnd@kernel.org Signed-off-by: Arnd Bergmann --- include/linux/firmware/imx/sm.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firmware/imx/sm.h b/include/linux/firmware/imx/sm.h index 62a2690e2abd..9b85a3f028d1 100644 --- a/include/linux/firmware/imx/sm.h +++ b/include/linux/firmware/imx/sm.h @@ -17,18 +17,7 @@ #define SCMI_IMX_CTRL_SAI4_MCLK 4 /* WAKE SAI4 MCLK */ #define SCMI_IMX_CTRL_SAI5_MCLK 5 /* WAKE SAI5 MCLK */ -#if IS_ENABLED(CONFIG_IMX_SCMI_MISC_EXT) int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val); int scmi_imx_misc_ctrl_set(u32 id, u32 val); -#else -static inline int scmi_imx_misc_ctrl_get(u32 id, u32 *num, u32 *val) -{ - return -EOPNOTSUPP; -} -static inline int scmi_imx_misc_ctrl_set(u32 id, u32 val); -{ - return -EOPNOTSUPP; -} -#endif #endif -- cgit v1.2.3 From 4b3fc475c61fa8733a9acc9d743f6cc9ca4915f5 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 6 Sep 2024 16:05:07 +0530 Subject: net: phylink: Add phylink_set_fixed_link() to configure fixed link state in phylink The function allows for the configuration of a fixed link state for a given phylink instance. This addition is particularly useful for network devices that operate with a fixed link configuration, where the link parameters do not change dynamically. By using `phylink_set_fixed_link()`, drivers can easily set up the fixed link state during initialization or configuration changes. Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: Raju Lakkaraju Signed-off-by: David S. Miller --- include/linux/phylink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 2381e07429a2..5c01048860c4 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -598,6 +598,8 @@ int phylink_fwnode_phy_connect(struct phylink *pl, const struct fwnode_handle *fwnode, u32 flags); void phylink_disconnect_phy(struct phylink *); +int phylink_set_fixed_link(struct phylink *, + const struct phylink_link_state *); void phylink_mac_change(struct phylink *, bool up); void phylink_pcs_change(struct phylink_pcs *, bool up); -- cgit v1.2.3 From cef7dde8836ab09a3bfe96ada4f18ef2496eacc9 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:04:57 +0300 Subject: net/mlx5: Expand mkey page size to support 6 bits Protect the usage of the 6th bit with the relevant capability to ensure we are using the new page sizes with FW that supports the bit extension. Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-2-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 970c9d8473ef..ec1117d4e441 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1988,7 +1988,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 migratable[0x1]; u8 reserved_at_81[0x11]; u8 query_vuid[0x1]; - u8 reserved_at_93[0xd]; + u8 reserved_at_93[0x5]; + u8 umr_log_entity_size_5[0x1]; + u8 reserved_at_99[0x7]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; @@ -4212,8 +4214,7 @@ struct mlx5_ifc_mkc_bits { u8 reserved_at_1c0[0x19]; u8 relaxed_ordering_read[0x1]; - u8 reserved_at_1d9[0x1]; - u8 log_page_size[0x5]; + u8 log_page_size[0x6]; u8 reserved_at_1e0[0x20]; }; -- cgit v1.2.3 From 6cd9171d04cff79abe78c166927ab8563bf95fe5 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:04:58 +0300 Subject: net/mlx5: Expose HW bits for Memory scheme ODP Expose IFC bits to support the new memory scheme on demand paging. Change the macro reading odp capabilities to be able to read from the new IFC layout and align the code in upper layers to be compiled. Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-3-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 4 +++ include/linux/mlx5/mlx5_ifc.h | 57 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ba875a619b97..bd081f276654 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1369,6 +1369,10 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap) +#define MLX5_CAP_ODP_SCHEME(mdev, cap) \ + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ + transport_page_fault_scheme_cap.cap) + #define MLX5_CAP_ODP_MAX(mdev, cap)\ MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->max, cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ec1117d4e441..3e3336bb9191 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1326,11 +1326,13 @@ struct mlx5_ifc_atomic_caps_bits { u8 reserved_at_e0[0x720]; }; -struct mlx5_ifc_odp_cap_bits { +struct mlx5_ifc_odp_scheme_cap_bits { u8 reserved_at_0[0x40]; u8 sig[0x1]; - u8 reserved_at_41[0x1f]; + u8 reserved_at_41[0x4]; + u8 page_prefetch[0x1]; + u8 reserved_at_46[0x1a]; u8 reserved_at_60[0x20]; @@ -1344,7 +1346,20 @@ struct mlx5_ifc_odp_cap_bits { struct mlx5_ifc_odp_per_transport_service_cap_bits dc_odp_caps; - u8 reserved_at_120[0x6E0]; + u8 reserved_at_120[0xe0]; +}; + +struct mlx5_ifc_odp_cap_bits { + struct mlx5_ifc_odp_scheme_cap_bits transport_page_fault_scheme_cap; + + struct mlx5_ifc_odp_scheme_cap_bits memory_page_fault_scheme_cap; + + u8 reserved_at_400[0x200]; + + u8 mem_page_fault[0x1]; + u8 reserved_at_601[0x1f]; + + u8 reserved_at_620[0x1e0]; }; struct mlx5_ifc_tls_cap_bits { @@ -2034,7 +2049,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 min_mkey_log_entity_size_fixed_buffer[0x5]; u8 ec_vf_vport_base[0x10]; - u8 reserved_at_3a0[0x10]; + u8 reserved_at_3a0[0xa]; + u8 max_mkey_log_entity_size_mtt[0x6]; u8 max_rqt_vhca_id[0x10]; u8 reserved_at_3c0[0x20]; @@ -7258,6 +7274,30 @@ struct mlx5_ifc_qp_2err_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_trans_page_fault_info_bits { + u8 error[0x1]; + u8 reserved_at_1[0x4]; + u8 page_fault_type[0x3]; + u8 wq_number[0x18]; + + u8 reserved_at_20[0x8]; + u8 fault_token[0x18]; +}; + +struct mlx5_ifc_mem_page_fault_info_bits { + u8 error[0x1]; + u8 reserved_at_1[0xf]; + u8 fault_token_47_32[0x10]; + + u8 fault_token_31_0[0x20]; +}; + +union mlx5_ifc_page_fault_resume_in_page_fault_info_auto_bits { + struct mlx5_ifc_trans_page_fault_info_bits trans_page_fault_info; + struct mlx5_ifc_mem_page_fault_info_bits mem_page_fault_info; + u8 reserved_at_0[0x40]; +}; + struct mlx5_ifc_page_fault_resume_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -7274,13 +7314,8 @@ struct mlx5_ifc_page_fault_resume_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 error[0x1]; - u8 reserved_at_41[0x4]; - u8 page_fault_type[0x3]; - u8 wq_number[0x18]; - - u8 reserved_at_60[0x8]; - u8 token[0x18]; + union mlx5_ifc_page_fault_resume_in_page_fault_info_auto_bits + page_fault_info; }; struct mlx5_ifc_nop_out_bits { -- cgit v1.2.3 From 64c68385a39bb676c76da36164ab696e8da78842 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:04:59 +0300 Subject: RDMA/mlx5: Add new ODP memory scheme eqe format Add new fields to support the new memory scheme page fault and extend the token field to u64 as in the new scheme the token is 48 bit. Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-4-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index bd081f276654..154095256d0d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -211,6 +211,7 @@ enum { enum { MLX5_PFAULT_SUBTYPE_WQE = 0, MLX5_PFAULT_SUBTYPE_RDMA = 1, + MLX5_PFAULT_SUBTYPE_MEMORY = 2, }; enum wqe_page_fault_type { @@ -646,10 +647,11 @@ struct mlx5_eqe_page_req { __be32 rsvd1[5]; }; +#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096 struct mlx5_eqe_page_fault { - __be32 bytes_committed; union { struct { + __be32 bytes_committed; u16 reserved1; __be16 wqe_index; u16 reserved2; @@ -659,6 +661,7 @@ struct mlx5_eqe_page_fault { __be32 pftype_wq; } __packed wqe; struct { + __be32 bytes_committed; __be32 r_key; u16 reserved1; __be16 packet_length; @@ -666,6 +669,23 @@ struct mlx5_eqe_page_fault { __be64 rdma_va; __be32 pftype_token; } __packed rdma; + struct { + u8 flags; + u8 reserved1; + __be16 post_demand_fault_pages; + __be16 pre_demand_fault_pages; + __be16 token47_32; + __be32 token31_0; + /* + * FW changed from specifying the fault size in byte + * count to 4k pages granularity. The size specified + * in pages uses bits 31:12, to keep backward + * compatibility. + */ + __be32 demand_fault_pages; + __be32 mkey; + __be64 va; + } __packed memory; } __packed; } __packed; -- cgit v1.2.3 From c912ac66b3fc92acb0079fff7d030e9be0756c36 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 9 Sep 2024 15:41:05 +0300 Subject: platform/x86: intel_scu_ipc: Move intel_scu_ipc.h out of arch/x86/include/asm This is a platform/x86 library that is mostly being used by other drivers not directly under arch/x86 anyway (with the exception of the Intel MID setup code) so it makes sense that it lives under the platform_data/x86/ directory instead. No functional changes intended. Suggested-by: Andy Shevchenko Signed-off-by: Mika Westerberg Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240909124952.1152017-3-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/x86/intel_scu_ipc.h | 68 +++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 include/linux/platform_data/x86/intel_scu_ipc.h (limited to 'include/linux') diff --git a/include/linux/platform_data/x86/intel_scu_ipc.h b/include/linux/platform_data/x86/intel_scu_ipc.h new file mode 100644 index 000000000000..0ca9962e97f2 --- /dev/null +++ b/include/linux/platform_data/x86/intel_scu_ipc.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PLATFORM_X86_INTEL_SCU_IPC_H_ +#define __PLATFORM_X86_INTEL_SCU_IPC_H_ + +#include + +struct device; +struct intel_scu_ipc_dev; + +/** + * struct intel_scu_ipc_data - Data used to configure SCU IPC + * @mem: Base address of SCU IPC MMIO registers + * @irq: The IRQ number used for SCU (optional) + */ +struct intel_scu_ipc_data { + struct resource mem; + int irq; +}; + +struct intel_scu_ipc_dev * +__intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data, + struct module *owner); + +#define intel_scu_ipc_register(parent, scu_data) \ + __intel_scu_ipc_register(parent, scu_data, THIS_MODULE) + +void intel_scu_ipc_unregister(struct intel_scu_ipc_dev *scu); + +struct intel_scu_ipc_dev * +__devm_intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data, + struct module *owner); + +#define devm_intel_scu_ipc_register(parent, scu_data) \ + __devm_intel_scu_ipc_register(parent, scu_data, THIS_MODULE) + +struct intel_scu_ipc_dev *intel_scu_ipc_dev_get(void); +void intel_scu_ipc_dev_put(struct intel_scu_ipc_dev *scu); +struct intel_scu_ipc_dev *devm_intel_scu_ipc_dev_get(struct device *dev); + +int intel_scu_ipc_dev_ioread8(struct intel_scu_ipc_dev *scu, u16 addr, + u8 *data); +int intel_scu_ipc_dev_iowrite8(struct intel_scu_ipc_dev *scu, u16 addr, + u8 data); +int intel_scu_ipc_dev_readv(struct intel_scu_ipc_dev *scu, u16 *addr, + u8 *data, size_t len); +int intel_scu_ipc_dev_writev(struct intel_scu_ipc_dev *scu, u16 *addr, + u8 *data, size_t len); + +int intel_scu_ipc_dev_update(struct intel_scu_ipc_dev *scu, u16 addr, + u8 data, u8 mask); + +int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd, + int sub); +int intel_scu_ipc_dev_command_with_size(struct intel_scu_ipc_dev *scu, int cmd, + int sub, const void *in, size_t inlen, + size_t size, void *out, size_t outlen); + +static inline int intel_scu_ipc_dev_command(struct intel_scu_ipc_dev *scu, int cmd, + int sub, const void *in, size_t inlen, + void *out, size_t outlen) +{ + return intel_scu_ipc_dev_command_with_size(scu, cmd, sub, in, inlen, + inlen, out, outlen); +} + +#endif -- cgit v1.2.3 From 5f1cda51107fd5e53cd012bd1f6f39aede96bdbe Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 9 Sep 2024 15:41:06 +0300 Subject: platform/x86: intel_scu_wdt: Move intel_scu_wdt.h to x86 subfolder This is a platform/x86 library that can only be used on x86 devices. so it makes sense that it lives under the platform_data/x86/ directory instead. No functional changes intended. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240909124952.1152017-4-andriy.shevchenko@linux.intel.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- include/linux/platform_data/intel-mid_wdt.h | 19 ------------------- include/linux/platform_data/x86/intel-mid_wdt.h | 19 +++++++++++++++++++ 2 files changed, 19 insertions(+), 19 deletions(-) delete mode 100644 include/linux/platform_data/intel-mid_wdt.h create mode 100644 include/linux/platform_data/x86/intel-mid_wdt.h (limited to 'include/linux') diff --git a/include/linux/platform_data/intel-mid_wdt.h b/include/linux/platform_data/intel-mid_wdt.h deleted file mode 100644 index 8dba70b4b020..000000000000 --- a/include/linux/platform_data/intel-mid_wdt.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * intel-mid_wdt: generic Intel MID SCU watchdog driver - * - * Copyright (C) 2014 Intel Corporation. All rights reserved. - * Contact: David Cohen - */ - -#ifndef __INTEL_MID_WDT_H__ -#define __INTEL_MID_WDT_H__ - -#include - -struct intel_mid_wdt_pdata { - int irq; - int (*probe)(struct platform_device *pdev); -}; - -#endif /*__INTEL_MID_WDT_H__*/ diff --git a/include/linux/platform_data/x86/intel-mid_wdt.h b/include/linux/platform_data/x86/intel-mid_wdt.h new file mode 100644 index 000000000000..e5c0210d0fec --- /dev/null +++ b/include/linux/platform_data/x86/intel-mid_wdt.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * intel-mid_wdt: generic Intel MID SCU watchdog driver + * + * Copyright (C) 2014 Intel Corporation. All rights reserved. + * Contact: David Cohen + */ + +#ifndef __PLATFORM_X86_INTEL_MID_WDT_H_ +#define __PLATFORM_X86_INTEL_MID_WDT_H_ + +#include + +struct intel_mid_wdt_pdata { + int irq; + int (*probe)(struct platform_device *pdev); +}; + +#endif /* __PLATFORM_X86_INTEL_MID_WDT_H_ */ -- cgit v1.2.3 From d3bfbfb1248498656cd25c51e41c1e31219bd0dd Mon Sep 17 00:00:00 2001 From: Kundan Kumar Date: Wed, 11 Sep 2024 12:19:34 +0530 Subject: mm: release number of pages of a folio Add a new function unpin_user_folio() to put the refs of a folio by npages count. The check for BIO_PAGE_PINNED flag is removed as it is already checked in bio_release_pages(). Signed-off-by: Kundan Kumar Tested-by: Luis Chamberlain Reviewed-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240911064935.5630-4-kundan.kumar@samsung.com Signed-off-by: Jens Axboe --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c4b238a20b76..2fd88cd5997d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1597,6 +1597,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); +void unpin_user_folio(struct folio *folio, unsigned long npages); void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) -- cgit v1.2.3 From b03ffc76b83c1a7d058454efbcf1bf0e345ef1c2 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 6 Sep 2024 15:13:31 +0200 Subject: soc: qcom: geni-se: add GP_LENGTH/IRQ_EN_SET/IRQ_EN_CLEAR registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For UART devices the M_GP_LENGTH is the TX word count. For other devices this is the transaction word count. For UART devices the S_GP_LENGTH is the RX word count. The IRQ_EN set/clear registers allow you to set or clear bits in the IRQ_EN register without needing a read-modify-write. Acked-by: Bjorn Andersson Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20240610152420.v4.1.Ife7ced506aef1be3158712aa3ff34a006b973559@changeid Tested-by: Nícolas F. R. A. Prado Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20240906131336.23625-4-johan+linaro@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/soc/qcom/geni-se.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/geni-se.h b/include/linux/soc/qcom/geni-se.h index 0f038a1a0330..c3bca9c0bf2c 100644 --- a/include/linux/soc/qcom/geni-se.h +++ b/include/linux/soc/qcom/geni-se.h @@ -88,11 +88,15 @@ struct geni_se { #define SE_GENI_M_IRQ_STATUS 0x610 #define SE_GENI_M_IRQ_EN 0x614 #define SE_GENI_M_IRQ_CLEAR 0x618 +#define SE_GENI_M_IRQ_EN_SET 0x61c +#define SE_GENI_M_IRQ_EN_CLEAR 0x620 #define SE_GENI_S_CMD0 0x630 #define SE_GENI_S_CMD_CTRL_REG 0x634 #define SE_GENI_S_IRQ_STATUS 0x640 #define SE_GENI_S_IRQ_EN 0x644 #define SE_GENI_S_IRQ_CLEAR 0x648 +#define SE_GENI_S_IRQ_EN_SET 0x64c +#define SE_GENI_S_IRQ_EN_CLEAR 0x650 #define SE_GENI_TX_FIFOn 0x700 #define SE_GENI_RX_FIFOn 0x780 #define SE_GENI_TX_FIFO_STATUS 0x800 @@ -101,6 +105,8 @@ struct geni_se { #define SE_GENI_RX_WATERMARK_REG 0x810 #define SE_GENI_RX_RFR_WATERMARK_REG 0x814 #define SE_GENI_IOS 0x908 +#define SE_GENI_M_GP_LENGTH 0x910 +#define SE_GENI_S_GP_LENGTH 0x914 #define SE_DMA_TX_IRQ_STAT 0xc40 #define SE_DMA_TX_IRQ_CLR 0xc44 #define SE_DMA_TX_FSM_RST 0xc58 @@ -234,6 +240,9 @@ struct geni_se { #define IO2_DATA_IN BIT(1) #define RX_DATA_IN BIT(0) +/* SE_GENI_M_GP_LENGTH and SE_GENI_S_GP_LENGTH fields */ +#define GP_LENGTH GENMASK(31, 0) + /* SE_DMA_TX_IRQ_STAT Register fields */ #define TX_DMA_DONE BIT(0) #define TX_EOT BIT(1) -- cgit v1.2.3 From 4c59c59ef3ab9275f2a8f84dcffbd16c285ce8ea Mon Sep 17 00:00:00 2001 From: Nick Chan Date: Wed, 11 Sep 2024 13:02:11 +0800 Subject: tty: serial: samsung: Use bit manipulation macros for APPLE_S5L_* New entries using BIT() will be added soon, so change the existing ones to use bit manipulation macros including BIT() and GENMASK() for consistency. Suggested-by: Krzysztof Kozlowski Tested-by: Janne Grunau Reviewed-by: Neal Gompa Signed-off-by: Nick Chan Reviewed-by: Andi Shyti Link: https://lore.kernel.org/r/20240911050741.14477-2-towinchenmi@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_s3c.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h index 1672cf0810ef..2a934e20ca4b 100644 --- a/include/linux/serial_s3c.h +++ b/include/linux/serial_s3c.h @@ -249,9 +249,9 @@ #define APPLE_S5L_UCON_RXTO_ENA 9 #define APPLE_S5L_UCON_RXTHRESH_ENA 12 #define APPLE_S5L_UCON_TXTHRESH_ENA 13 -#define APPLE_S5L_UCON_RXTO_ENA_MSK (1 << APPLE_S5L_UCON_RXTO_ENA) -#define APPLE_S5L_UCON_RXTHRESH_ENA_MSK (1 << APPLE_S5L_UCON_RXTHRESH_ENA) -#define APPLE_S5L_UCON_TXTHRESH_ENA_MSK (1 << APPLE_S5L_UCON_TXTHRESH_ENA) +#define APPLE_S5L_UCON_RXTO_ENA_MSK BIT(APPLE_S5L_UCON_RXTO_ENA) +#define APPLE_S5L_UCON_RXTHRESH_ENA_MSK BIT(APPLE_S5L_UCON_RXTHRESH_ENA) +#define APPLE_S5L_UCON_TXTHRESH_ENA_MSK BIT(APPLE_S5L_UCON_TXTHRESH_ENA) #define APPLE_S5L_UCON_DEFAULT (S3C2410_UCON_TXIRQMODE | \ S3C2410_UCON_RXIRQMODE | \ @@ -260,10 +260,10 @@ APPLE_S5L_UCON_RXTHRESH_ENA_MSK | \ APPLE_S5L_UCON_TXTHRESH_ENA_MSK) -#define APPLE_S5L_UTRSTAT_RXTHRESH (1<<4) -#define APPLE_S5L_UTRSTAT_TXTHRESH (1<<5) -#define APPLE_S5L_UTRSTAT_RXTO (1<<9) -#define APPLE_S5L_UTRSTAT_ALL_FLAGS (0x3f0) +#define APPLE_S5L_UTRSTAT_RXTHRESH BIT(4) +#define APPLE_S5L_UTRSTAT_TXTHRESH BIT(5) +#define APPLE_S5L_UTRSTAT_RXTO BIT(9) +#define APPLE_S5L_UTRSTAT_ALL_FLAGS GENMASK(9, 4) #ifndef __ASSEMBLY__ -- cgit v1.2.3 From 5ed771f174726ae879945d4f148a9005ac909cb7 Mon Sep 17 00:00:00 2001 From: Nick Chan Date: Wed, 11 Sep 2024 13:02:13 +0800 Subject: tty: serial: samsung: Fix serial rx on Apple A7-A9 Apple's older A7-A9 SoCs seems to use bit 3 in UTRSTAT as RXTO, which is enabled by bit 11 in UCON. Access these bits in addition to the original RXTO and RXTO enable bits, to allow serial rx to function on A7-A9 SoCs. This change does not appear to affect the A10 SoC and up. Tested-by: Janne Grunau Reviewed-by: Neal Gompa Signed-off-by: Nick Chan Reviewed-by: Andi Shyti Link: https://lore.kernel.org/r/20240911050741.14477-4-towinchenmi@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_s3c.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_s3c.h b/include/linux/serial_s3c.h index 2a934e20ca4b..102aa33d956c 100644 --- a/include/linux/serial_s3c.h +++ b/include/linux/serial_s3c.h @@ -246,24 +246,28 @@ S5PV210_UFCON_TXTRIG4 | \ S5PV210_UFCON_RXTRIG4) -#define APPLE_S5L_UCON_RXTO_ENA 9 -#define APPLE_S5L_UCON_RXTHRESH_ENA 12 -#define APPLE_S5L_UCON_TXTHRESH_ENA 13 -#define APPLE_S5L_UCON_RXTO_ENA_MSK BIT(APPLE_S5L_UCON_RXTO_ENA) -#define APPLE_S5L_UCON_RXTHRESH_ENA_MSK BIT(APPLE_S5L_UCON_RXTHRESH_ENA) -#define APPLE_S5L_UCON_TXTHRESH_ENA_MSK BIT(APPLE_S5L_UCON_TXTHRESH_ENA) +#define APPLE_S5L_UCON_RXTO_ENA 9 +#define APPLE_S5L_UCON_RXTO_LEGACY_ENA 11 +#define APPLE_S5L_UCON_RXTHRESH_ENA 12 +#define APPLE_S5L_UCON_TXTHRESH_ENA 13 +#define APPLE_S5L_UCON_RXTO_ENA_MSK BIT(APPLE_S5L_UCON_RXTO_ENA) +#define APPLE_S5L_UCON_RXTO_LEGACY_ENA_MSK BIT(APPLE_S5L_UCON_RXTO_LEGACY_ENA) +#define APPLE_S5L_UCON_RXTHRESH_ENA_MSK BIT(APPLE_S5L_UCON_RXTHRESH_ENA) +#define APPLE_S5L_UCON_TXTHRESH_ENA_MSK BIT(APPLE_S5L_UCON_TXTHRESH_ENA) #define APPLE_S5L_UCON_DEFAULT (S3C2410_UCON_TXIRQMODE | \ S3C2410_UCON_RXIRQMODE | \ S3C2410_UCON_RXFIFO_TOI) #define APPLE_S5L_UCON_MASK (APPLE_S5L_UCON_RXTO_ENA_MSK | \ + APPLE_S5L_UCON_RXTO_LEGACY_ENA_MSK | \ APPLE_S5L_UCON_RXTHRESH_ENA_MSK | \ APPLE_S5L_UCON_TXTHRESH_ENA_MSK) +#define APPLE_S5L_UTRSTAT_RXTO_LEGACY BIT(3) #define APPLE_S5L_UTRSTAT_RXTHRESH BIT(4) #define APPLE_S5L_UTRSTAT_TXTHRESH BIT(5) #define APPLE_S5L_UTRSTAT_RXTO BIT(9) -#define APPLE_S5L_UTRSTAT_ALL_FLAGS GENMASK(9, 4) +#define APPLE_S5L_UTRSTAT_ALL_FLAGS GENMASK(9, 3) #ifndef __ASSEMBLY__ -- cgit v1.2.3 From 6746ee4c3a189f8b60694f01e7e29bc5ff7972e0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Sep 2024 17:34:37 +0100 Subject: io_uring/cmd: expose iowq to cmds When an io_uring request needs blocking context we offload it to the io_uring's thread pool called io-wq. We can get there off ->uring_cmd by returning -EAGAIN, but there is no straightforward way of doing that from an asynchronous callback. Add a helper that would transfer a command to a blocking context. Note, we do an extra hop via task_work before io_queue_iowq(), that's a limitation of io_uring infra we have that can likely be lifted later if that would ever become a problem. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f735f807d7c8ba50c9452c69dfe5d3e9e535037b.1726072086.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 447fbfd32215..86ceb3383e49 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -48,6 +48,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags); +/* Execute the request from a blocking context */ +void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); + #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) @@ -67,6 +70,9 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { } +static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) +{ +} #endif /* -- cgit v1.2.3 From a6ccb48e13662bcb98282e051512b9686b02d353 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Sep 2024 17:34:38 +0100 Subject: io_uring/cmd: give inline space in request to cmds Some io_uring commands can use some inline space in io_kiocb. We have 32 bytes in struct io_uring_cmd, expose it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7ca779a61ee5e166e535d70df9c7f07b15d8a0ce.1726072086.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 86ceb3383e49..c189d36ad55e 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -23,6 +23,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) return sqe->cmd; } +static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) +{ + BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu)); +} +#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \ + io_uring_cmd_private_sz_check(sizeof(pdu_type)), \ + ((pdu_type *)&(cmd)->pdu) \ +) + #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); -- cgit v1.2.3 From a12c883a0a6a005cfb3ad01feaf783e2248bfc3e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 11 Sep 2024 17:34:39 +0100 Subject: filemap: introduce filemap_invalidate_pages kiocb_invalidate_pages() is useful for the write path, however not everything is backed by kiocb and we want to reuse the function for bio based discard implementation. Extract and and reuse a new helper called filemap_invalidate_pages(), which takes a argument indicating whether it should be non-blocking and might return -EAGAIN. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f81374b52c92d0dce0f01a279d1eed42b54056aa.1726072086.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d9c7edb6422b..e39c3a7ce33c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -32,6 +32,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); +int filemap_invalidate_pages(struct address_space *mapping, + loff_t pos, loff_t end, bool nowait); int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); -- cgit v1.2.3 From 45b8fc3096542a53bfd245a9ad8ef870384b4897 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:27 -0700 Subject: lib/buildid: rename build_id_parse() into build_id_parse_nofault() Make it clear that build_id_parse() assumes that it can take no page fault by renaming it and current few users to build_id_parse_nofault(). Also add build_id_parse() stub which for now falls back to non-sleepable implementation, but will be changed in subsequent patches to take advantage of sleepable context. PROCMAP_QUERY ioctl() on /proc//maps file is using build_id_parse() and will automatically take advantage of more reliable sleepable context implementation. Reviewed-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/buildid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 20aa3c2d89f7..014a88c41073 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -7,8 +7,8 @@ #define BUILD_ID_SIZE_MAX 20 struct vm_area_struct; -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, - __u32 *size); +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); +int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) -- cgit v1.2.3 From d4dd9775ec242425576af93daadb80a34083a53c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:31 -0700 Subject: bpf: wire up sleepable bpf_get_stack() and bpf_get_task_stack() helpers Add sleepable implementations of bpf_get_stack() and bpf_get_task_stack() helpers and allow them to be used from sleepable BPF program (e.g., sleepable uprobes). Note, the stack trace IPs capturing itself is not sleepable (that would need to be a separate project), only build ID fetching is sleepable and thus more reliable, as it will wait for data to be paged in, if necessary. For that we make use of sleepable build_id_parse() implementation. Now that build ID related internals in kernel/bpf/stackmap.c can be used both in sleepable and non-sleepable contexts, we need to add additional rcu_read_lock()/rcu_read_unlock() protection around fetching perf_callchain_entry, but with the refactoring in previous commit it's now pretty straightforward. We make sure to do rcu_read_unlock (in sleepable mode only) right before stack_map_get_build_id_offset() call which can sleep. By that time we don't have any more use of perf_callchain_entry. Note, bpf_get_task_stack() will fail for user mode if task != current. And for kernel mode build ID are irrelevant. So in that sense adding sleepable bpf_get_task_stack() implementation is a no-op. It feel right to wire this up for symmetry and completeness, but I'm open to just dropping it until we support `user && crosstask` condition. Reviewed-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6f87fb014fba..7f3e8477d5e7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3200,7 +3200,9 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; +extern const struct bpf_func_proto bpf_get_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; +extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_stackid_proto_pe; extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; -- cgit v1.2.3 From 6513eb3d3191574b58859ef2d6dc26c0277c6f81 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 10 Sep 2024 17:35:35 -0400 Subject: net: tighten bad gso csum offset check in virtio_net_hdr The referenced commit drops bad input, but has false positives. Tighten the check to avoid these. The check detects illegal checksum offload requests, which produce csum_start/csum_off beyond end of packet after segmentation. But it is based on two incorrect assumptions: 1. virtio_net_hdr_to_skb with VIRTIO_NET_HDR_GSO_TCP[46] implies GSO. True in callers that inject into the tx path, such as tap. But false in callers that inject into rx, like virtio-net. Here, the flags indicate GRO, and CHECKSUM_UNNECESSARY or CHECKSUM_NONE without VIRTIO_NET_HDR_F_NEEDS_CSUM is normal. 2. TSO requires checksum offload, i.e., ip_summed == CHECKSUM_PARTIAL. False, as tcp[46]_gso_segment will fix up csum_start and offset for all other ip_summed by calling __tcp_v4_send_check. Because of 2, we can limit the scope of the fix to virtio_net_hdr that do try to set these fields, with a bogus value. Link: https://lore.kernel.org/netdev/20240909094527.GA3048202@port70.net/ Fixes: 89add40066f9 ("net: drop bad gso csum_start and offset in virtio_net_hdr") Signed-off-by: Willem de Bruijn Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20240910213553.839926-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 6c395a2600e8..276ca543ef44 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -173,7 +173,8 @@ retry: break; case SKB_GSO_TCPV4: case SKB_GSO_TCPV6: - if (skb->csum_offset != offsetof(struct tcphdr, check)) + if (skb->ip_summed == CHECKSUM_PARTIAL && + skb->csum_offset != offsetof(struct tcphdr, check)) return -EINVAL; break; } -- cgit v1.2.3 From 170aafe35cb98e0f3fbacb446ea86389fbce22ea Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 10 Sep 2024 17:14:47 +0000 Subject: netdev: support binding dma-buf to netdevice Add a netdev_dmabuf_binding struct which represents the dma-buf-to-netdevice binding. The netlink API will bind the dma-buf to rx queues on the netdevice. On the binding, the dma_buf_attach & dma_buf_map_attachment will occur. The entries in the sg_table from mapping will be inserted into a genpool to make it ready for allocation. The chunks in the genpool are owned by a dmabuf_chunk_owner struct which holds the dma-buf offset of the base of the chunk and the dma_addr of the chunk. Both are needed to use allocations that come from this chunk. We create a new type that represents an allocation from the genpool: net_iov. We setup the net_iov allocation size in the genpool to PAGE_SIZE for simplicity: to match the PAGE_SIZE normally allocated by the page pool and given to the drivers. The user can unbind the dmabuf from the netdevice by closing the netlink socket that established the binding. We do this so that the binding is automatically unbound even if the userspace process crashes. The binding and unbinding leaves an indicator in struct netdev_rx_queue that the given queue is bound, and the binding is actuated by resetting the rx queue using the queue API. The netdev_dmabuf_binding struct is refcounted, and releases its resources only when all the refs are released. Signed-off-by: Willem de Bruijn Signed-off-by: Kaiyuan Zhang Signed-off-by: Mina Almasry Reviewed-by: Pavel Begunkov # excluding netlink Acked-by: Daniel Vetter Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20240910171458.219195-4-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2465bdb6037f..e87b5e488325 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3953,6 +3953,8 @@ u8 dev_xdp_prog_count(struct net_device *dev); int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); +u32 dev_get_min_mp_channel_count(const struct net_device *dev); + int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb); -- cgit v1.2.3 From 9f6b619edf2e85746f261b42ae8f818a59d126f7 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 10 Sep 2024 17:14:51 +0000 Subject: net: support non paged skb frags Make skb_frag_page() fail in the case where the frag is not backed by a page, and fix its relevant callers to handle this case. Signed-off-by: Mina Almasry Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20240910171458.219195-8-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 42 +++++++++++++++++++++++++++++++++++++++++- include/linux/skbuff_ref.h | 9 ++++----- 2 files changed, 45 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5803eb8a157d..1019f14583c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3524,21 +3524,58 @@ static inline void skb_frag_off_copy(skb_frag_t *fragto, fragto->offset = fragfrom->offset; } +/* Return: true if the skb_frag contains a net_iov. */ +static inline bool skb_frag_is_net_iov(const skb_frag_t *frag) +{ + return netmem_is_net_iov(frag->netmem); +} + +/** + * skb_frag_net_iov - retrieve the net_iov referred to by fragment + * @frag: the fragment + * + * Return: the &struct net_iov associated with @frag. Returns NULL if this + * frag has no associated net_iov. + */ +static inline struct net_iov *skb_frag_net_iov(const skb_frag_t *frag) +{ + if (!skb_frag_is_net_iov(frag)) + return NULL; + + return netmem_to_net_iov(frag->netmem); +} + /** * skb_frag_page - retrieve the page referred to by a paged fragment * @frag: the paged fragment * - * Returns the &struct page associated with @frag. + * Return: the &struct page associated with @frag. Returns NULL if this frag + * has no associated page. */ static inline struct page *skb_frag_page(const skb_frag_t *frag) { + if (skb_frag_is_net_iov(frag)) + return NULL; + return netmem_to_page(frag->netmem); } +/** + * skb_frag_netmem - retrieve the netmem referred to by a fragment + * @frag: the fragment + * + * Return: the &netmem_ref associated with @frag. + */ +static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag) +{ + return frag->netmem; +} + int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, struct bpf_prog *prog); + /** * skb_frag_address - gets the address of the data contained in a paged fragment * @frag: the paged fragment buffer @@ -3548,6 +3585,9 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, */ static inline void *skb_frag_address(const skb_frag_t *frag) { + if (!skb_frag_page(frag)) + return NULL; + return page_address(skb_frag_page(frag)) + skb_frag_off(frag); } diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h index 16c241a23472..0f3c58007488 100644 --- a/include/linux/skbuff_ref.h +++ b/include/linux/skbuff_ref.h @@ -34,14 +34,13 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) bool napi_pp_put_page(netmem_ref netmem); -static inline void -skb_page_unref(struct page *page, bool recycle) +static inline void skb_page_unref(netmem_ref netmem, bool recycle) { #ifdef CONFIG_PAGE_POOL - if (recycle && napi_pp_put_page(page_to_netmem(page))) + if (recycle && napi_pp_put_page(netmem)) return; #endif - put_page(page); + put_page(netmem_to_page(netmem)); } /** @@ -54,7 +53,7 @@ skb_page_unref(struct page *page, bool recycle) */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - skb_page_unref(skb_frag_page(frag), recycle); + skb_page_unref(skb_frag_netmem(frag), recycle); } /** -- cgit v1.2.3 From 65249feb6b3df9e17bab5911ee56fa7b0971e231 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 10 Sep 2024 17:14:52 +0000 Subject: net: add support for skbs with unreadable frags For device memory TCP, we expect the skb headers to be available in host memory for access, and we expect the skb frags to be in device memory and unaccessible to the host. We expect there to be no mixing and matching of device memory frags (unaccessible) with host memory frags (accessible) in the same skb. Add a skb->devmem flag which indicates whether the frags in this skb are device memory frags or not. __skb_fill_netmem_desc() now checks frags added to skbs for net_iov, and marks the skb as skb->devmem accordingly. Add checks through the network stack to avoid accessing the frags of devmem skbs and avoid coalescing devmem skbs with non devmem skbs. Signed-off-by: Willem de Bruijn Signed-off-by: Kaiyuan Zhang Signed-off-by: Mina Almasry Reviewed-by: Eric Dumazet Reviewed-by: Jakub Kicinski Link: https://patch.msgid.link/20240910171458.219195-9-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1019f14583c9..39f1d16f3628 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -827,6 +827,8 @@ enum skb_tstamp_type { * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) + * @unreadable: indicates that at least 1 of the fragments in this skb is + * unreadable. * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required @@ -1008,7 +1010,7 @@ struct sk_buff { #if IS_ENABLED(CONFIG_IP_SCTP) __u8 csum_not_inet:1; #endif - + __u8 unreadable:1; #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif @@ -1824,6 +1826,12 @@ static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb) __skb_zcopy_downgrade_managed(skb); } +/* Return true if frags in this skb are readable by the host. */ +static inline bool skb_frags_readable(const struct sk_buff *skb) +{ + return !skb->unreadable; +} + static inline void skb_mark_not_on_list(struct sk_buff *skb) { skb->next = NULL; @@ -2540,10 +2548,17 @@ static inline void skb_len_add(struct sk_buff *skb, int delta) static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i, netmem_ref netmem, int off, int size) { - struct page *page = netmem_to_page(netmem); + struct page *page; __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size); + if (netmem_is_net_iov(netmem)) { + skb->unreadable = true; + return; + } + + page = netmem_to_page(netmem); + /* Propagate page pfmemalloc to the skb if we can. The problem is * that not all callers have unique ownership of the page but rely * on page_is_pfmemalloc doing the right thing(tm). -- cgit v1.2.3 From 8f0b3cc9a4c102c24808c87f1bc943659d7a7f9f Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 10 Sep 2024 17:14:53 +0000 Subject: tcp: RX path for devmem TCP In tcp_recvmsg_locked(), detect if the skb being received by the user is a devmem skb. In this case - if the user provided the MSG_SOCK_DEVMEM flag - pass it to tcp_recvmsg_devmem() for custom handling. tcp_recvmsg_devmem() copies any data in the skb header to the linear buffer, and returns a cmsg to the user indicating the number of bytes returned in the linear buffer. tcp_recvmsg_devmem() then loops over the unaccessible devmem skb frags, and returns to the user a cmsg_devmem indicating the location of the data in the dmabuf device memory. cmsg_devmem contains this information: 1. the offset into the dmabuf where the payload starts. 'frag_offset'. 2. the size of the frag. 'frag_size'. 3. an opaque token 'frag_token' to return to the kernel when the buffer is to be released. The pages awaiting freeing are stored in the newly added sk->sk_user_frags, and each page passed to userspace is get_page()'d. This reference is dropped once the userspace indicates that it is done reading this page. All pages are released when the socket is destroyed. Signed-off-by: Willem de Bruijn Signed-off-by: Kaiyuan Zhang Signed-off-by: Mina Almasry Reviewed-by: Pavel Begunkov Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20240910171458.219195-10-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index df9cdb8bbfb8..d18cc47e89bd 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -327,6 +327,7 @@ struct ucred { * plain text and require encryption */ +#define MSG_SOCK_DEVMEM 0x2000000 /* Receive devmem skbs as cmsg */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ -- cgit v1.2.3 From aa58bec064ab16224645776ce8e0af2fee46136a Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 9 Sep 2024 13:55:02 +0530 Subject: net: ethernet: oa_tc6: implement register write operation Implement register write operation according to the control communication specified in the OPEN Alliance 10BASE-T1x MACPHY Serial Interface document. Control write commands are used by the SPI host to write registers within the MAC-PHY. Each control write commands are composed of a 32 bits control command header followed by register write data. The MAC-PHY ignores the final 32 bits of data from the SPI host at the end of the control write command. The write command and data is also echoed from the MAC-PHY back to the SPI host to enable the SPI host to identify which register write failed in the case of any bus errors. Control write commands can write either a single register or multiple consecutive registers. When multiple consecutive registers are written, the address is automatically post-incremented by the MAC-PHY. Writing to any unimplemented or undefined registers shall be ignored and yield no effect. Reviewed-by: Andrew Lunn Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20240909082514.262942-3-Parthiban.Veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- include/linux/oa_tc6.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/oa_tc6.h (limited to 'include/linux') diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h new file mode 100644 index 000000000000..99c490f1c8a8 --- /dev/null +++ b/include/linux/oa_tc6.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * OPEN Alliance 10BASE‑T1x MAC‑PHY Serial Interface framework + * + * Link: https://opensig.org/download/document/OPEN_Alliance_10BASET1x_MAC-PHY_Serial_Interface_V1.1.pdf + * + * Author: Parthiban Veerasooran + */ + +#include + +struct oa_tc6; + +struct oa_tc6 *oa_tc6_init(struct spi_device *spi); +int oa_tc6_write_register(struct oa_tc6 *tc6, u32 address, u32 value); +int oa_tc6_write_registers(struct oa_tc6 *tc6, u32 address, u32 value[], + u8 length); -- cgit v1.2.3 From 375d1e0278cca70ce801d52c6b95c7a3c00f249c Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 9 Sep 2024 13:55:03 +0530 Subject: net: ethernet: oa_tc6: implement register read operation Implement register read operation according to the control communication specified in the OPEN Alliance 10BASE-T1x MACPHY Serial Interface document. Control read commands are used by the SPI host to read registers within the MAC-PHY. Each control read commands are composed of a 32 bits control command header. The MAC-PHY ignores all data from the SPI host following the control header for the remainder of the control read command. Control read commands can read either a single register or multiple consecutive registers. When multiple consecutive registers are read, the address is automatically post-incremented by the MAC-PHY. Reading any unimplemented or undefined registers shall return zero. Reviewed-by: Andrew Lunn Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20240909082514.262942-4-Parthiban.Veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- include/linux/oa_tc6.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h index 99c490f1c8a8..85aeecf87306 100644 --- a/include/linux/oa_tc6.h +++ b/include/linux/oa_tc6.h @@ -15,3 +15,6 @@ struct oa_tc6 *oa_tc6_init(struct spi_device *spi); int oa_tc6_write_register(struct oa_tc6 *tc6, u32 address, u32 value); int oa_tc6_write_registers(struct oa_tc6 *tc6, u32 address, u32 value[], u8 length); +int oa_tc6_read_register(struct oa_tc6 *tc6, u32 address, u32 *value); +int oa_tc6_read_registers(struct oa_tc6 *tc6, u32 address, u32 value[], + u8 length); -- cgit v1.2.3 From 8f9bf857e43b3f75a098e3af3a6fec2d03203a1e Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 9 Sep 2024 13:55:06 +0530 Subject: net: ethernet: oa_tc6: implement internal PHY initialization Internal PHY is initialized as per the PHY register capability supported by the MAC-PHY. Direct PHY Register Access Capability indicates if PHY registers are directly accessible within the SPI register memory space. Indirect PHY Register Access Capability indicates if PHY registers are indirectly accessible through the MDIO/MDC registers MDIOACCn defined in OPEN Alliance specification. Currently the direct register access is only supported. Reviewed-by: Andrew Lunn Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20240909082514.262942-7-Parthiban.Veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- include/linux/oa_tc6.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h index 85aeecf87306..606ba9f1e663 100644 --- a/include/linux/oa_tc6.h +++ b/include/linux/oa_tc6.h @@ -7,11 +7,13 @@ * Author: Parthiban Veerasooran */ +#include #include struct oa_tc6; -struct oa_tc6 *oa_tc6_init(struct spi_device *spi); +struct oa_tc6 *oa_tc6_init(struct spi_device *spi, struct net_device *netdev); +void oa_tc6_exit(struct oa_tc6 *tc6); int oa_tc6_write_register(struct oa_tc6 *tc6, u32 address, u32 value); int oa_tc6_write_registers(struct oa_tc6 *tc6, u32 address, u32 value[], u8 length); -- cgit v1.2.3 From 53fbde8ab21e8c2c6187159cc17fc10cbf20900a Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 9 Sep 2024 13:55:09 +0530 Subject: net: ethernet: oa_tc6: implement transmit path to transfer tx ethernet frames The transmit ethernet frame will be converted into multiple transmit data chunks. Each transmit data chunk consists of a 4 bytes header followed by a 64 bytes transmit data chunk payload. The 4 bytes data header occurs at the beginning of each transmit data chunk on MOSI. The data header contains the information needed to determine the validity and location of the transmit frame data within the data chunk payload. The number of transmit data chunks transmitted to mac-phy is limited to the number transmit credits available in the mac-phy. Initially the transmit credits will be updated from the buffer status register and then it will be updated from the footer received on each spi data transfer. The received footer will be examined for the transmit errors if any. Reviewed-by: Andrew Lunn Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20240909082514.262942-10-Parthiban.Veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- include/linux/oa_tc6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h index 606ba9f1e663..5c7811ac9cbe 100644 --- a/include/linux/oa_tc6.h +++ b/include/linux/oa_tc6.h @@ -20,3 +20,4 @@ int oa_tc6_write_registers(struct oa_tc6 *tc6, u32 address, u32 value[], int oa_tc6_read_register(struct oa_tc6 *tc6, u32 address, u32 *value); int oa_tc6_read_registers(struct oa_tc6 *tc6, u32 address, u32 value[], u8 length); +netdev_tx_t oa_tc6_start_xmit(struct oa_tc6 *tc6, struct sk_buff *skb); -- cgit v1.2.3 From afd42170c8a6e68edd3f3a7f2aacd2bfbedb58b2 Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 9 Sep 2024 13:55:12 +0530 Subject: net: ethernet: oa_tc6: add helper function to enable zero align rx frame Zero align receive frame feature can be enabled to align all receive ethernet frames data to start at the beginning of any receive data chunk payload with a start word offset (SWO) of zero. Receive frames may begin anywhere within the receive data chunk payload when this feature is not enabled. Reviewed-by: Andrew Lunn Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20240909082514.262942-13-Parthiban.Veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- include/linux/oa_tc6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/oa_tc6.h b/include/linux/oa_tc6.h index 5c7811ac9cbe..15f58e3c56c7 100644 --- a/include/linux/oa_tc6.h +++ b/include/linux/oa_tc6.h @@ -21,3 +21,4 @@ int oa_tc6_read_register(struct oa_tc6 *tc6, u32 address, u32 *value); int oa_tc6_read_registers(struct oa_tc6 *tc6, u32 address, u32 value[], u8 length); netdev_tx_t oa_tc6_start_xmit(struct oa_tc6 *tc6, struct sk_buff *skb); +int oa_tc6_zero_align_receive_frame_enable(struct oa_tc6 *tc6); -- cgit v1.2.3 From 5e9b50dea970ae6d3e1309d4254157099734a2af Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:59 +0200 Subject: fs: add f_pipe Only regular files with FMODE_ATOMIC_POS and directories need f_pos_lock. Place a new f_pipe member in a union with f_pos_lock that they can use and make them stop abusing f_version in follow-up patches. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-18-6d3e4816aa7b@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e6b3c1afb31..ca4925008244 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1001,6 +1001,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_cred: stashed credentials of creator/opener * @f_path: path of the file * @f_pos_lock: lock protecting file position + * @f_pipe: specific to pipes * @f_pos: file position * @f_version: file version * @f_security: LSM security context of this file @@ -1026,7 +1027,12 @@ struct file { const struct cred *f_cred; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; - struct mutex f_pos_lock; + union { + /* regular files (with FMODE_ATOMIC_POS) and directories */ + struct mutex f_pos_lock; + /* pipes */ + u64 f_pipe; + }; loff_t f_pos; u64 f_version; /* --- cacheline 2 boundary (128 bytes) --- */ -- cgit v1.2.3 From 11068e0b64cbb540b96e577fcca0926242ecaf58 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:05:01 +0200 Subject: fs: remove f_version Now that detecting concurrent seeks is done by the filesystems that require it we can remove f_version and free up 8 bytes for future extensions. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-20-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ca4925008244..7e11ce172140 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1003,7 +1003,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position - * @f_version: file version * @f_security: LSM security context of this file * @f_owner: file owner * @f_wb_err: writeback error @@ -1034,11 +1033,10 @@ struct file { u64 f_pipe; }; loff_t f_pos; - u64 f_version; - /* --- cacheline 2 boundary (128 bytes) --- */ #ifdef CONFIG_SECURITY void *f_security; #endif + /* --- cacheline 2 boundary (128 bytes) --- */ struct fown_struct *f_owner; errseq_t f_wb_err; errseq_t f_sb_err; -- cgit v1.2.3 From 2077006d4725c82c6e9612cec3a6c140921b067f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 10 Sep 2024 10:16:39 +0200 Subject: uidgid: make sure we fit into one cacheline When I expanded uidgid mappings I intended for a struct uid_gid_map to fit into a single cacheline on x86 as they tend to be pretty performance sensitive (idmapped mounts etc). But a 4 byte hole was added that brought it over 64 bytes. Fix that by adding the static extent array and the extent counter into a substruct. C's type punning for unions guarantees that we can access ->nr_extents even if the last written to member wasn't within the same object. This is also what we rely on in struct_group() and friends. This of course relies on non-strict aliasing which we don't do. 99) If the member used to read the contents of a union object is not the same as the member last used to store a value in the object, the appropriate part of the object representation of the value is reinterpreted as an object representation in the new type as described in 6.2.6 (a process sometimes called "type punning"). Link: https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2310.pdf Link: https://lore.kernel.org/r/20240910-work-uid_gid_map-v1-1-e6bc761363ed@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/user_namespace.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 6030a8235617..3625096d5f85 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -21,9 +21,11 @@ struct uid_gid_extent { }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ - u32 nr_extents; union { - struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; + struct { + struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; + u32 nr_extents; + }; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; -- cgit v1.2.3 From db0aa2e9566fda2d23dc8f6c102856ead95578a4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 19 Jun 2024 00:20:42 +0100 Subject: mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios Define a data structure, struct folio_queue, to represent a sequence of folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a list of folio_queue structures to be used to provide a buffer to iov_iter-taking functions, such as sendmsg and recvmsg. The folio_queue structure looks like: struct folio_queue { struct folio_batch vec; u8 orders[PAGEVEC_SIZE]; struct folio_queue *next; struct folio_queue *prev; unsigned long marks; unsigned long marks2; }; It does not use a list_head so that next and/or prev can be set to NULL at the ends of the list, allowing iov_iter-handling routines to determine that they *are* the ends without needing to store a head pointer in the iov_iter struct. A folio_batch struct is used to hold the folio pointers which allows the batch to be passed to batch handling functions. Two mark bits are available per slot. The intention is to use at least one of them to mark folios that need putting, but that might not be ultimately necessary. Accessor functions are used to access the slots to do the masking and an additional accessor function is used to indicate the size of the array. The order of each folio is also stored in the structure to avoid the need for iov_iter_advance() and iov_iter_revert() to have to query each folio to find its size. With careful barriering, this can be used as an extending buffer with new folios inserted and new folio_queue structs added without the need for a lock. Further, provided we always keep at least one struct in the buffer, we can also remove consumed folios and consumed structs from the head end as we without the need for locks. [Questions/thoughts] (1) To manage this, I need a head pointer, a tail pointer, a tail slot number (assuming insertion happens at the tail end and the next pointers point from head to tail). Should I put these into a struct of their own, say "folio_queue_head" or "rolling_buffer"? I will end up with two of these in netfs_io_request eventually, one keeping track of the pagecache I'm dealing with for buffered I/O and the other to hold a bounce buffer when we need one. (2) Should I make the slots {folio,off,len} or bio_vec? (3) This is intended to replace ITER_XARRAY eventually. Using an xarray in I/O iteration requires the taking of the RCU read lock, doing copying under the RCU read lock, walking the xarray (which may change under us), handling retries and dealing with special values. The advantage of ITER_XARRAY is that when we're dealing with the pagecache directly, we don't need any allocation - but if we're doing encrypted comms, there's a good chance we'd be using a bounce buffer anyway. This will require afs, erofs, cifs, orangefs and fscache to be converted to not use this. afs still uses it for dirs and symlinks; some of erofs usages should be easy to change, but there's one which won't be so easy; ceph's use via fscache can be fixed by porting ceph to netfslib; cifs is using xarray as a bounce buffer - that can be moved to use sheaves instead; and orangefs has a similar problem to erofs - maybe orangefs could use netfslib? Signed-off-by: David Howells cc: Matthew Wilcox cc: Jeff Layton cc: Steve French cc: Ilya Dryomov cc: Gao Xiang cc: Mike Marshall cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: ceph-devel@vger.kernel.org cc: linux-erofs@lists.ozlabs.org cc: devel@lists.orangefs.org Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/folio_queue.h | 138 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/iov_iter.h | 57 ++++++++++++++++++ include/linux/uio.h | 12 ++++ 3 files changed, 207 insertions(+) create mode 100644 include/linux/folio_queue.h (limited to 'include/linux') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h new file mode 100644 index 000000000000..52773613bf23 --- /dev/null +++ b/include/linux/folio_queue.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Queue of folios definitions + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef _LINUX_FOLIO_QUEUE_H +#define _LINUX_FOLIO_QUEUE_H + +#include + +/* + * Segment in a queue of running buffers. Each segment can hold a number of + * folios and a portion of the queue can be referenced with the ITER_FOLIOQ + * iterator. The possibility exists of inserting non-folio elements into the + * queue (such as gaps). + * + * Explicit prev and next pointers are used instead of a list_head to make it + * easier to add segments to tail and remove them from the head without the + * need for a lock. + */ +struct folio_queue { + struct folio_batch vec; /* Folios in the queue segment */ + u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ + struct folio_queue *next; /* Next queue segment or NULL */ + struct folio_queue *prev; /* Previous queue segment of NULL */ + unsigned long marks; /* 1-bit mark per folio */ + unsigned long marks2; /* Second 1-bit mark per folio */ +#if PAGEVEC_SIZE > BITS_PER_LONG +#error marks is not big enough +#endif +}; + +static inline void folioq_init(struct folio_queue *folioq) +{ + folio_batch_init(&folioq->vec); + folioq->next = NULL; + folioq->prev = NULL; + folioq->marks = 0; + folioq->marks2 = 0; +} + +static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) +{ + return PAGEVEC_SIZE; +} + +static inline unsigned int folioq_count(struct folio_queue *folioq) +{ + return folio_batch_count(&folioq->vec); +} + +static inline bool folioq_full(struct folio_queue *folioq) +{ + //return !folio_batch_space(&folioq->vec); + return folioq_count(folioq) >= folioq_nr_slots(folioq); +} + +static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks); +} + +static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks); +} + +static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks); +} + +static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks2); +} + +static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks2); +} + +static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks2); +} + +static inline unsigned int __folio_order(struct folio *folio) +{ + if (!folio_test_large(folio)) + return 0; + return folio->_flags_1 & 0xff; +} + +static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio) +{ + unsigned int slot = folioq->vec.nr++; + + folioq->vec.folios[slot] = folio; + folioq->orders[slot] = __folio_order(folio); + return slot; +} + +static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio) +{ + unsigned int slot = folioq->vec.nr++; + + folioq->vec.folios[slot] = folio; + folioq->orders[slot] = __folio_order(folio); + folioq_mark(folioq, slot); + return slot; +} + +static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot) +{ + return folioq->vec.folios[slot]; +} + +static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot) +{ + return folioq->orders[slot]; +} + +static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot) +{ + return PAGE_SIZE << folioq_folio_order(folioq, slot); +} + +static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) +{ + folioq->vec.folios[slot] = NULL; + folioq_unmark(folioq, slot); + folioq_unmark2(folioq, slot); +} + +#endif /* _LINUX_FOLIO_QUEUE_H */ diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h index 270454a6703d..a223370a59a7 100644 --- a/include/linux/iov_iter.h +++ b/include/linux/iov_iter.h @@ -10,6 +10,7 @@ #include #include +#include typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len, void *priv, void *priv2); @@ -140,6 +141,60 @@ size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, return progress; } +/* + * Handle ITER_FOLIOQ. + */ +static __always_inline +size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, + iov_step_f step) +{ + const struct folio_queue *folioq = iter->folioq; + unsigned int slot = iter->folioq_slot; + size_t progress = 0, skip = iter->iov_offset; + + if (slot == folioq_nr_slots(folioq)) { + /* The iterator may have been extended. */ + folioq = folioq->next; + slot = 0; + } + + do { + struct folio *folio = folioq_folio(folioq, slot); + size_t part, remain, consumed; + size_t fsize; + void *base; + + if (!folio) + break; + + fsize = folioq_folio_size(folioq, slot); + base = kmap_local_folio(folio, skip); + part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); + remain = step(base, progress, part, priv, priv2); + kunmap_local(base); + consumed = part - remain; + len -= consumed; + progress += consumed; + skip += consumed; + if (skip >= fsize) { + skip = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } + if (remain) + break; + } while (len); + + iter->folioq_slot = slot; + iter->folioq = folioq; + iter->iov_offset = skip; + iter->count -= progress; + return progress; +} + /* * Handle ITER_XARRAY. */ @@ -249,6 +304,8 @@ size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); + if (iov_iter_is_folioq(iter)) + return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); diff --git a/include/linux/uio.h b/include/linux/uio.h index 7020adedfa08..845d110acadc 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -11,6 +11,7 @@ #include struct page; +struct folio_queue; typedef unsigned int __bitwise iov_iter_extraction_t; @@ -25,6 +26,7 @@ enum iter_type { ITER_IOVEC, ITER_BVEC, ITER_KVEC, + ITER_FOLIOQ, ITER_XARRAY, ITER_DISCARD, }; @@ -66,6 +68,7 @@ struct iov_iter { const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; + const struct folio_queue *folioq; struct xarray *xarray; void __user *ubuf; }; @@ -74,6 +77,7 @@ struct iov_iter { }; union { unsigned long nr_segs; + u8 folioq_slot; loff_t xarray_start; }; }; @@ -126,6 +130,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i) return iov_iter_type(i) == ITER_DISCARD; } +static inline bool iov_iter_is_folioq(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_FOLIOQ; +} + static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; @@ -273,6 +282,9 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); +void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, + const struct folio_queue *folioq, + unsigned int first_slot, unsigned int offset, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, -- cgit v1.2.3 From 197a3de607d92b3d72e69edf5470e0a8fae548cc Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 14 Aug 2024 16:14:21 +0100 Subject: iov_iter: Provide copy_folio_from_iter() Provide a copy_folio_from_iter() wrapper. Signed-off-by: David Howells cc: Alexander Viro cc: Christian Brauner cc: Matthew Wilcox cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20240814203850.2240469-14-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/uio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 845d110acadc..853f9de5aa05 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -189,6 +189,12 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, return copy_page_to_iter(&folio->page, offset, bytes, i); } +static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, + size_t bytes, struct iov_iter *i) +{ + return copy_page_from_iter(&folio->page, offset, bytes, i); +} + static inline size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { -- cgit v1.2.3 From cd0277ed0c188dd40e7744e89299af7b78831ca4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 May 2024 21:47:07 +0100 Subject: netfs: Use new folio_queue data type and iterator instead of xarray iter Make the netfs write-side routines use the new folio_queue struct to hold a rolling buffer of folios, with the issuer adding folios at the tail and the collector removing them from the head as they're processed instead of using an xarray. This will allow a subsequent patch to simplify the write collector. The primary mark (as tested by folioq_is_marked()) is used to note if the corresponding folio needs putting. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-16-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 63ab2c775a21..f7b444f4f251 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -38,10 +38,6 @@ static inline void folio_start_private_2(struct folio *folio) folio_set_private_2(folio); } -/* Marks used on xarray-based buffers */ -#define NETFS_BUF_PUT_MARK XA_MARK_0 /* - Page needs putting */ -#define NETFS_BUF_PAGECACHE_MARK XA_MARK_1 /* - Page needs wb/dirty flag wrangling */ - enum netfs_io_source { NETFS_SOURCE_UNKNOWN, NETFS_FILL_WITH_ZEROES, @@ -233,6 +229,8 @@ struct netfs_io_request { struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ #define NR_IO_STREAMS 2 //wreq->nr_io_streams struct netfs_group *group; /* Writeback group being written back */ + struct folio_queue *buffer; /* Head of I/O buffer */ + struct folio_queue *buffer_tail; /* Tail of I/O buffer */ struct iov_iter iter; /* Unencrypted-side iterator */ struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ void *netfs_priv; /* Private data for the netfs */ @@ -254,6 +252,8 @@ struct netfs_io_request { short error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ + u8 buffer_head_slot; /* First slot in ->buffer */ + u8 buffer_tail_slot; /* Next slot in ->buffer_tail */ unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ -- cgit v1.2.3 From 983cdcf8fe141b0ce16bc71959a5dc55bcb0764d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 6 Jun 2024 07:48:55 +0100 Subject: netfs: Simplify the writeback code Use the new folio_queue structures to simplify the writeback code. The problem with referring to the i_pages xarray directly is that we may have gaps in the sequence of folios we're writing from that we need to skip when we're removing the writeback mark from the folios we're writing back from. At the moment the code tries to deal with this by carefully tracking the gaps in each writeback stream (eg. write to server and write to cache) and divining when there's a gap that spans folios (something that's not helped by folios not being a consistent size). Instead, the folio_queue buffer contains pointers only the folios we're dealing with, has them in ascending order and indicates a gap by placing non-consequitive folios next to each other. This makes it possible to track where we need to clean up to by just keeping track of where we've processed to on each stream and taking the minimum. Note that the I/O iterator is always rounded up to the end of the folio, even if that is beyond the EOF position, so that the cache can do DIO from the page. The excess space is cleared, though mmapped writes clobber it. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-18-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index f7b444f4f251..bd0e3d147822 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -257,7 +257,6 @@ struct netfs_io_request { unsigned long long i_size; /* Size of the file */ unsigned long long start; /* Start position */ atomic64_t issued_to; /* Write issuer folio cursor */ - unsigned long long contiguity; /* Tracking for gaps in the writeback sequence */ unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ -- cgit v1.2.3 From ee4cdf7ba857a894ad1650d6ab77669cbbfa329e Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Jul 2024 00:40:22 +0100 Subject: netfs: Speed up buffered reading Improve the efficiency of buffered reads in a number of ways: (1) Overhaul the algorithm in general so that it's a lot more compact and split the read submission code between buffered and unbuffered versions. The unbuffered version can be vastly simplified. (2) Read-result collection is handed off to a work queue rather than being done in the I/O thread. Multiple subrequests can be processes simultaneously. (3) When a subrequest is collected, any folios it fully spans are collected and "spare" data on either side is donated to either the previous or the next subrequest in the sequence. Notes: (*) Readahead expansion is massively slows down fio, presumably because it causes a load of extra allocations, both folio and xarray, up front before RPC requests can be transmitted. (*) RDMA with cifs does appear to work, both with SIW and RXE. (*) PG_private_2-based reading and copy-to-cache is split out into its own file and altered to use folio_queue. Note that the copy to the cache now creates a new write transaction against the cache and adds the folios to be copied into it. This allows it to use part of the writeback I/O code. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-20-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/folio_queue.h | 18 ++++++++++++++++++ include/linux/netfs.h | 26 ++++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h index 52773613bf23..955680c3bb5f 100644 --- a/include/linux/folio_queue.h +++ b/include/linux/folio_queue.h @@ -27,6 +27,7 @@ struct folio_queue { struct folio_queue *prev; /* Previous queue segment of NULL */ unsigned long marks; /* 1-bit mark per folio */ unsigned long marks2; /* Second 1-bit mark per folio */ + unsigned long marks3; /* Third 1-bit mark per folio */ #if PAGEVEC_SIZE > BITS_PER_LONG #error marks is not big enough #endif @@ -39,6 +40,7 @@ static inline void folioq_init(struct folio_queue *folioq) folioq->prev = NULL; folioq->marks = 0; folioq->marks2 = 0; + folioq->marks3 = 0; } static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) @@ -87,6 +89,21 @@ static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) clear_bit(slot, &folioq->marks2); } +static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks3); +} + +static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks3); +} + +static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks3); +} + static inline unsigned int __folio_order(struct folio *folio) { if (!folio_test_large(folio)) @@ -133,6 +150,7 @@ static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) folioq->vec.folios[slot] = NULL; folioq_unmark(folioq, slot); folioq_unmark2(folioq, slot); + folioq_unmark3(folioq, slot); } #endif /* _LINUX_FOLIO_QUEUE_H */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index bd0e3d147822..c0f0c9c87d86 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -178,36 +178,43 @@ struct netfs_io_subrequest { unsigned long long start; /* Where to start the I/O */ size_t len; /* Size of the I/O */ size_t transferred; /* Amount of data transferred */ + size_t consumed; /* Amount of read data consumed */ + size_t prev_donated; /* Amount of data donated from previous subreq */ + size_t next_donated; /* Amount of data donated from next subreq */ refcount_t ref; short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ + unsigned char curr_folioq_slot; /* Folio currently being read */ + unsigned char curr_folio_order; /* Order of folio */ + struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */ unsigned long flags; #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ -#define NETFS_SREQ_SHORT_IO 2 /* Set if the I/O was short */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ #define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ +#define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ #define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ #define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ -#define NETFS_SREQ_HIT_EOF 12 /* Set if we hit the EOF */ }; enum netfs_io_origin { NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READPAGE, /* This read is a synchronous read */ + NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_DIO_WRITE, /* This is a direct I/O write */ + NETFS_PGPRIV2_COPY_TO_CACHE, /* [DEPRECATED] This is writing read data to the cache */ nr__netfs_io_origin } __mode(byte); @@ -224,6 +231,7 @@ struct netfs_io_request { struct address_space *mapping; /* The mapping being accessed */ struct kiocb *iocb; /* AIO completion vector */ struct netfs_cache_resources cache_resources; + struct readahead_control *ractl; /* Readahead descriptor */ struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head subrequests; /* Contributory I/O operations */ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ @@ -244,12 +252,10 @@ struct netfs_io_request { unsigned int nr_group_rel; /* Number of refs to release on ->group */ spinlock_t lock; /* Lock for queuing subreqs */ atomic_t nr_outstanding; /* Number of ops in progress */ - atomic_t nr_copy_ops; /* Number of copy-to-cache ops in progress */ - size_t upper_len; /* Length can be extended to here */ unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long len; /* Length of the request */ size_t transferred; /* Amount to be indicated as transferred */ - short error; /* 0 or error that occurred */ + long error; /* 0 or error that occurred */ enum netfs_io_origin origin; /* Origin of the request */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ u8 buffer_head_slot; /* First slot in ->buffer */ @@ -260,9 +266,9 @@ struct netfs_io_request { unsigned long long collected_to; /* Point we've collected to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ + size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_INCOMPLETE_IO 0 /* Some ioreqs terminated short or with error */ #define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ @@ -274,6 +280,7 @@ struct netfs_io_request { #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ +#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; @@ -292,7 +299,7 @@ struct netfs_request_ops { /* Read request handling */ void (*expand_readahead)(struct netfs_io_request *rreq); - bool (*clamp_length)(struct netfs_io_subrequest *subreq); + int (*prepare_read)(struct netfs_io_subrequest *subreq); void (*issue_read)(struct netfs_io_subrequest *subreq); bool (*is_still_valid)(struct netfs_io_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, @@ -422,7 +429,10 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); /* (Sub)request management API. */ -void netfs_subreq_terminated(struct netfs_io_subrequest *, ssize_t, bool); +void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, + bool was_async); +void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, + int error, bool was_async); void netfs_get_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what); void netfs_put_subrequest(struct netfs_io_subrequest *subreq, -- cgit v1.2.3 From c4f1450ecccc5311db87f806998eda1c824c4e35 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 12 Jul 2024 12:44:30 +0100 Subject: cachefiles, netfs: Fix write to partial block at EOF Because it uses DIO writes, cachefiles is unable to make a write to the backing file if that write is not aligned to and sized according to the backing file's DIO block alignment. This makes it tricky to handle a write to the cache where the EOF on the network file is not correctly aligned. To get around this, netfslib attempts to tell the driver it is calling how much more data there is available beyond the EOF that it can use to pad the write (netfslib preclears the part of the folio above the EOF). However, it tries to tell the cache what the maximum length is, but doesn't calculate this correctly; and, in any case, cachefiles actually ignores the value and just skips the block. Fix this by: (1) Change the value passed to indicate the amount of extra data that can be added to the operation (now ->submit_extendable_to). This is much simpler to calculate as it's just the end of the folio minus the top of the data within the folio - rather than having to account for data spread over multiple folios. (2) Make cachefiles add some of this data if the subrequest it is given ends at the network file's i_size if the extra data is sufficient to pad out to a whole block. Signed-off-by: David Howells cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-22-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index c0f0c9c87d86..5eaceef41e6c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -135,7 +135,7 @@ struct netfs_io_stream { unsigned int sreq_max_segs; /* 0 or max number of segments in an iterator */ unsigned int submit_off; /* Folio offset we're submitting from */ unsigned int submit_len; /* Amount of data left to submit */ - unsigned int submit_max_len; /* Amount I/O can be rounded up to */ + unsigned int submit_extendable_to; /* Amount I/O can be rounded up to */ void (*prepare_write)(struct netfs_io_subrequest *subreq); void (*issue_write)(struct netfs_io_subrequest *subreq); /* Collection tracking */ -- cgit v1.2.3 From 2982c8c19bab020e38da9d503aa21a3b389c53ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 Jul 2024 20:03:07 +0100 Subject: cifs: Use iterate_and_advance*() routines directly for hashing Replace the bespoke cifs iterators of ITER_BVEC and ITER_KVEC to do hashing with iterate_and_advance_kernel() - a variant on iterate_and_advance() that only supports kernel-internal ITER_* types and not UBUF/IOVEC types. The bespoke ITER_XARRAY is left because we don't really want to be calling crypto_shash_update() under the RCU read lock for large amounts of data; besides, ITER_XARRAY is going to be phased out. Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Tom Talpey cc: Enzo Matsumiya cc: linux-cifs@vger.kernel.org Link: https://lore.kernel.org/r/20240814203850.2240469-24-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/iov_iter.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h index a223370a59a7..c4aa58032faf 100644 --- a/include/linux/iov_iter.h +++ b/include/linux/iov_iter.h @@ -328,4 +328,51 @@ size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv, return iterate_and_advance2(iter, len, priv, NULL, ustep, step); } +/** + * iterate_and_advance_kernel - Iterate over a kernel-internal iterator + * @iter: The iterator to iterate over. + * @len: The amount to iterate over. + * @priv: Data for the step functions. + * @priv2: More data for the step functions. + * @step: Function for other iterators; given kernel addresses. + * + * Iterate over the next part of an iterator, up to the specified length. The + * buffer is presented in segments, which for kernel iteration are broken up by + * physical pages and mapped, with the mapped address being presented. + * + * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type + * iterators; it will not handle UBUF or IOVEC-type iterators. + * + * A step functions, @step, must be provided, one for handling mapped kernel + * addresses and the other is given user addresses which have the potential to + * fault since no pinning is performed. + * + * The step functions are passed the address and length of the segment, @priv, + * @priv2 and the amount of data so far iterated over (which can, for example, + * be added to @priv to point to the right part of a second buffer). The step + * functions should return the amount of the segment they didn't process (ie. 0 + * indicates complete processsing). + * + * This function returns the amount of data processed (ie. 0 means nothing was + * processed and the value of @len means processes to completion). + */ +static __always_inline +size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv, + void *priv2, iov_step_f step) +{ + if (unlikely(iter->count < len)) + len = iter->count; + if (unlikely(!len)) + return 0; + if (iov_iter_is_bvec(iter)) + return iterate_bvec(iter, len, priv, priv2, step); + if (iov_iter_is_kvec(iter)) + return iterate_kvec(iter, len, priv, priv2, step); + if (iov_iter_is_folioq(iter)) + return iterate_folioq(iter, len, priv, priv2, step); + if (iov_iter_is_xarray(iter)) + return iterate_xarray(iter, len, priv, priv2, step); + return iterate_discard(iter, len, priv, priv2, step); +} + #endif /* _LINUX_IOV_ITER_H */ -- cgit v1.2.3 From c45b9a07b6392fa224ca76b89f24dae1046eef09 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 12 Sep 2024 22:30:34 +0900 Subject: Revert "firewire: core: use mutex to coordinate concurrent calls to flush completions" This reverts commit d9605d67562505e27dcc0f71af418118d3db91e5, since this commit is on the following reverted changes. Link: https://lore.kernel.org/r/20240912133038.238786-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 19e8c5f9537c..f815d12deda0 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -512,7 +512,6 @@ union fw_iso_callback { struct fw_iso_context { struct fw_card *card; struct work_struct work; - struct mutex flushing_completions_mutex; int type; int channel; int speed; -- cgit v1.2.3 From 4010cb1efda08ec6fd02ec5db9da909322ef352e Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 12 Sep 2024 22:30:37 +0900 Subject: firewire: core: update documentation of kernel APIs for flushing completions There is a slight difference between fw_iso_context_flush_completions() and fw_iso_context_schedule_flush_completions(). This commit updates the documentations for them. Link: https://lore.kernel.org/r/20240912133038.238786-5-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index f815d12deda0..b632eec3ab52 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -537,9 +537,11 @@ int fw_iso_context_flush_completions(struct fw_iso_context *ctx); * @ctx: the isochronous context * * Schedule a work item on workqueue to process the isochronous context. The registered callback - * function is called in the worker if some packets have been already transferred since the last - * time. If it is required to process the context in the current context, - * fw_iso_context_flush_completions() is available instead. + * function is called by the worker when a queued packet buffer with the interrupt flag is + * completed, either after transmission in the IT context or after being filled in the IR context. + * The callback function is also called when the header buffer in the context becomes full, If it + * is required to process the context in the current context, fw_iso_context_flush_completions() is + * available instead. * * Context: Any context. */ -- cgit v1.2.3 From ede5bbe488d162bcd572880e58f9044c9df84050 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 9 Sep 2024 11:10:27 +0300 Subject: ARM: ep93xx: add regmap aux_dev The following driver's should be instantiated by ep93xx syscon driver: - reboot - pinctrl - clock They all require access to DEVCFG register with a shared lock held, to avoid conflict writing to swlocked parts of DEVCFG. Provide common resources such as base, regmap and spinlock via auxiliary bus framework. Signed-off-by: Nikita Shubin Tested-by: Alexander Sverdlin Reviewed-by: Linus Walleij Reviewed-by: Stephen Boyd Acked-by: Vinod Koul Signed-off-by: Arnd Bergmann --- include/linux/soc/cirrus/ep93xx.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h index 56fbe2dc59b1..a27447971302 100644 --- a/include/linux/soc/cirrus/ep93xx.h +++ b/include/linux/soc/cirrus/ep93xx.h @@ -3,6 +3,18 @@ #define _SOC_EP93XX_H struct platform_device; +struct regmap; +struct spinlock_t; + +enum ep93xx_soc_model { + EP93XX_9301_SOC, + EP93XX_9307_SOC, + EP93XX_9312_SOC, +}; + +#include +#include +#include #define EP93XX_CHIP_REV_D0 3 #define EP93XX_CHIP_REV_D1 4 @@ -10,6 +22,20 @@ struct platform_device; #define EP93XX_CHIP_REV_E1 6 #define EP93XX_CHIP_REV_E2 7 +struct ep93xx_regmap_adev { + struct auxiliary_device adev; + struct regmap *map; + void __iomem *base; + spinlock_t *lock; + void (*write)(struct regmap *map, spinlock_t *lock, unsigned int reg, + unsigned int val); + void (*update_bits)(struct regmap *map, spinlock_t *lock, + unsigned int reg, unsigned int mask, unsigned int val); +}; + +#define to_ep93xx_regmap_adev(_adev) \ + container_of((_adev), struct ep93xx_regmap_adev, adev) + #ifdef CONFIG_ARCH_EP93XX int ep93xx_pwm_acquire_gpio(struct platform_device *pdev); void ep93xx_pwm_release_gpio(struct platform_device *pdev); -- cgit v1.2.3 From 2e7f55ce430240a5547b8a94b4c532fc8c20b18b Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 9 Sep 2024 11:10:34 +0300 Subject: dmaengine: cirrus: Convert to DT for Cirrus EP93xx Convert Cirrus EP93xx DMA to device tree usage: - add OF ID match table with data - add of_probe for device tree - add xlate for m2m/m2p - drop subsys_initcall code - drop platform probe - drop platform structs usage >From now on it only supports device tree probing. Co-developed-by: Alexander Sverdlin Signed-off-by: Alexander Sverdlin Acked-by: Vinod Koul Signed-off-by: Nikita Shubin Tested-by: Alexander Sverdlin Signed-off-by: Arnd Bergmann --- include/linux/platform_data/dma-ep93xx.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-ep93xx.h b/include/linux/platform_data/dma-ep93xx.h index eb9805bb3fe8..9ec5cdd5a1eb 100644 --- a/include/linux/platform_data/dma-ep93xx.h +++ b/include/linux/platform_data/dma-ep93xx.h @@ -3,8 +3,11 @@ #define __ASM_ARCH_DMA_H #include +#include #include #include +#include +#include /* * M2P channels. @@ -70,6 +73,9 @@ struct ep93xx_dma_platform_data { static inline bool ep93xx_dma_chan_is_m2p(struct dma_chan *chan) { + if (device_is_compatible(chan->device->dev, "cirrus,ep9301-dma-m2p")) + return true; + return !strcmp(dev_name(chan->device->dev), "ep93xx-dma-m2p"); } -- cgit v1.2.3 From b3ab5787e7acb02874ae86cbe13969d4dd01a585 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 9 Sep 2024 11:10:48 +0300 Subject: input: keypad: ep93xx: add DT support for Cirrus EP93xx - drop flags, they were not used anyway - add OF ID match table - process "autorepeat", "debounce-delay-ms", prescale from device tree - drop platform data usage and it's header - keymap goes from device tree now on Signed-off-by: Nikita Shubin Acked-by: Dmitry Torokhov Acked-by: Vinod Koul Signed-off-by: Arnd Bergmann --- include/linux/soc/cirrus/ep93xx.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h index a27447971302..8942bfaf1545 100644 --- a/include/linux/soc/cirrus/ep93xx.h +++ b/include/linux/soc/cirrus/ep93xx.h @@ -41,8 +41,6 @@ int ep93xx_pwm_acquire_gpio(struct platform_device *pdev); void ep93xx_pwm_release_gpio(struct platform_device *pdev); int ep93xx_ide_acquire_gpio(struct platform_device *pdev); void ep93xx_ide_release_gpio(struct platform_device *pdev); -int ep93xx_keypad_acquire_gpio(struct platform_device *pdev); -void ep93xx_keypad_release_gpio(struct platform_device *pdev); int ep93xx_i2s_acquire(void); void ep93xx_i2s_release(void); unsigned int ep93xx_chip_revision(void); @@ -52,8 +50,6 @@ static inline int ep93xx_pwm_acquire_gpio(struct platform_device *pdev) { return static inline void ep93xx_pwm_release_gpio(struct platform_device *pdev) {} static inline int ep93xx_ide_acquire_gpio(struct platform_device *pdev) { return 0; } static inline void ep93xx_ide_release_gpio(struct platform_device *pdev) {} -static inline int ep93xx_keypad_acquire_gpio(struct platform_device *pdev) { return 0; } -static inline void ep93xx_keypad_release_gpio(struct platform_device *pdev) {} static inline int ep93xx_i2s_acquire(void) { return 0; } static inline void ep93xx_i2s_release(void) {} static inline unsigned int ep93xx_chip_revision(void) { return 0; } -- cgit v1.2.3 From a48ac3dc569771c18fcafbc8351d820cc343c54a Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 9 Sep 2024 11:10:58 +0300 Subject: pwm: ep93xx: drop legacy pinctrl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop legacy gpio request/free since we are using pinctrl for this now. Signed-off-by: Nikita Shubin Acked-by: Uwe Kleine-König Acked-by: Thierry Reding Acked-by: Linus Walleij Acked-by: Vinod Koul Signed-off-by: Arnd Bergmann --- include/linux/soc/cirrus/ep93xx.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h index 8942bfaf1545..f6376edc1b33 100644 --- a/include/linux/soc/cirrus/ep93xx.h +++ b/include/linux/soc/cirrus/ep93xx.h @@ -37,8 +37,6 @@ struct ep93xx_regmap_adev { container_of((_adev), struct ep93xx_regmap_adev, adev) #ifdef CONFIG_ARCH_EP93XX -int ep93xx_pwm_acquire_gpio(struct platform_device *pdev); -void ep93xx_pwm_release_gpio(struct platform_device *pdev); int ep93xx_ide_acquire_gpio(struct platform_device *pdev); void ep93xx_ide_release_gpio(struct platform_device *pdev); int ep93xx_i2s_acquire(void); @@ -46,8 +44,6 @@ void ep93xx_i2s_release(void); unsigned int ep93xx_chip_revision(void); #else -static inline int ep93xx_pwm_acquire_gpio(struct platform_device *pdev) { return 0; } -static inline void ep93xx_pwm_release_gpio(struct platform_device *pdev) {} static inline int ep93xx_ide_acquire_gpio(struct platform_device *pdev) { return 0; } static inline void ep93xx_ide_release_gpio(struct platform_device *pdev) {} static inline int ep93xx_i2s_acquire(void) { return 0; } -- cgit v1.2.3 From a632229be268dde8f6d407638b5cfba8b78201d6 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 9 Sep 2024 11:10:59 +0300 Subject: ata: pata_ep93xx: remove legacy pinctrl use Drop legacy acquire/release since we are using pinctrl for this now. Signed-off-by: Nikita Shubin Tested-by: Alexander Sverdlin Reviewed-by: Sergey Shtylyov Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Reviewed-by: Mark Brown Reviewed-by: Krzysztof Kozlowski Reviewed-by: Andy Shevchenko Acked-by: Damien Le Moal Acked-by: Vinod Koul Signed-off-by: Arnd Bergmann --- include/linux/soc/cirrus/ep93xx.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h index f6376edc1b33..142c33a2d7db 100644 --- a/include/linux/soc/cirrus/ep93xx.h +++ b/include/linux/soc/cirrus/ep93xx.h @@ -37,15 +37,11 @@ struct ep93xx_regmap_adev { container_of((_adev), struct ep93xx_regmap_adev, adev) #ifdef CONFIG_ARCH_EP93XX -int ep93xx_ide_acquire_gpio(struct platform_device *pdev); -void ep93xx_ide_release_gpio(struct platform_device *pdev); int ep93xx_i2s_acquire(void); void ep93xx_i2s_release(void); unsigned int ep93xx_chip_revision(void); #else -static inline int ep93xx_ide_acquire_gpio(struct platform_device *pdev) { return 0; } -static inline void ep93xx_ide_release_gpio(struct platform_device *pdev) {} static inline int ep93xx_i2s_acquire(void) { return 0; } static inline void ep93xx_i2s_release(void) {} static inline unsigned int ep93xx_chip_revision(void) { return 0; } -- cgit v1.2.3 From e5ef574dda702e62081d5c7991949cbdb7f65c08 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 9 Sep 2024 11:11:00 +0300 Subject: ARM: ep93xx: delete all boardfiles Delete the ep93xx board files. Signed-off-by: Nikita Shubin Tested-by: Alexander Sverdlin Acked-by: Vinod Koul Signed-off-by: Arnd Bergmann --- include/linux/platform_data/spi-ep93xx.h | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 include/linux/platform_data/spi-ep93xx.h (limited to 'include/linux') diff --git a/include/linux/platform_data/spi-ep93xx.h b/include/linux/platform_data/spi-ep93xx.h deleted file mode 100644 index b439f2a896e0..000000000000 --- a/include/linux/platform_data/spi-ep93xx.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_MACH_EP93XX_SPI_H -#define __ASM_MACH_EP93XX_SPI_H - -struct spi_device; - -/** - * struct ep93xx_spi_info - EP93xx specific SPI descriptor - * @use_dma: use DMA for the transfers - */ -struct ep93xx_spi_info { - bool use_dma; -}; - -#endif /* __ASM_MACH_EP93XX_SPI_H */ -- cgit v1.2.3 From 43528a72526152f17a918973b3b8b6f383b90f98 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 9 Sep 2024 11:11:01 +0300 Subject: ARM: ep93xx: soc: drop defines Remove unnecessary defines, as we dropped board files. Signed-off-by: Nikita Shubin Tested-by: Alexander Sverdlin Acked-by: Vinod Koul Signed-off-by: Arnd Bergmann --- include/linux/platform_data/eth-ep93xx.h | 10 --------- include/linux/platform_data/keypad-ep93xx.h | 32 ----------------------------- include/linux/soc/cirrus/ep93xx.h | 13 ------------ 3 files changed, 55 deletions(-) delete mode 100644 include/linux/platform_data/eth-ep93xx.h delete mode 100644 include/linux/platform_data/keypad-ep93xx.h (limited to 'include/linux') diff --git a/include/linux/platform_data/eth-ep93xx.h b/include/linux/platform_data/eth-ep93xx.h deleted file mode 100644 index 8eef637a804d..000000000000 --- a/include/linux/platform_data/eth-ep93xx.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_PLATFORM_DATA_ETH_EP93XX -#define _LINUX_PLATFORM_DATA_ETH_EP93XX - -struct ep93xx_eth_data { - unsigned char dev_addr[6]; - unsigned char phy_id; -}; - -#endif diff --git a/include/linux/platform_data/keypad-ep93xx.h b/include/linux/platform_data/keypad-ep93xx.h deleted file mode 100644 index 3054fced8509..000000000000 --- a/include/linux/platform_data/keypad-ep93xx.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __KEYPAD_EP93XX_H -#define __KEYPAD_EP93XX_H - -struct matrix_keymap_data; - -/* flags for the ep93xx_keypad driver */ -#define EP93XX_KEYPAD_DISABLE_3_KEY (1<<0) /* disable 3-key reset */ -#define EP93XX_KEYPAD_DIAG_MODE (1<<1) /* diagnostic mode */ -#define EP93XX_KEYPAD_BACK_DRIVE (1<<2) /* back driving mode */ -#define EP93XX_KEYPAD_TEST_MODE (1<<3) /* scan only column 0 */ -#define EP93XX_KEYPAD_AUTOREPEAT (1<<4) /* enable key autorepeat */ - -/** - * struct ep93xx_keypad_platform_data - platform specific device structure - * @keymap_data: pointer to &matrix_keymap_data - * @debounce: debounce start count; terminal count is 0xff - * @prescale: row/column counter pre-scaler load value - * @flags: see above - */ -struct ep93xx_keypad_platform_data { - struct matrix_keymap_data *keymap_data; - unsigned int debounce; - unsigned int prescale; - unsigned int flags; - unsigned int clk_rate; -}; - -#define EP93XX_MATRIX_ROWS (8) -#define EP93XX_MATRIX_COLS (8) - -#endif /* __KEYPAD_EP93XX_H */ diff --git a/include/linux/soc/cirrus/ep93xx.h b/include/linux/soc/cirrus/ep93xx.h index 142c33a2d7db..3e6cf2b25a97 100644 --- a/include/linux/soc/cirrus/ep93xx.h +++ b/include/linux/soc/cirrus/ep93xx.h @@ -2,7 +2,6 @@ #ifndef _SOC_EP93XX_H #define _SOC_EP93XX_H -struct platform_device; struct regmap; struct spinlock_t; @@ -36,16 +35,4 @@ struct ep93xx_regmap_adev { #define to_ep93xx_regmap_adev(_adev) \ container_of((_adev), struct ep93xx_regmap_adev, adev) -#ifdef CONFIG_ARCH_EP93XX -int ep93xx_i2s_acquire(void); -void ep93xx_i2s_release(void); -unsigned int ep93xx_chip_revision(void); - -#else -static inline int ep93xx_i2s_acquire(void) { return 0; } -static inline void ep93xx_i2s_release(void) {} -static inline unsigned int ep93xx_chip_revision(void) { return 0; } - -#endif - #endif -- cgit v1.2.3 From a015b1828653b591de0aa5303c0dbc4235935f94 Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 9 Sep 2024 11:11:03 +0300 Subject: dmaengine: cirrus: remove platform code Remove DMA platform header, from now on we use device tree for DMA clients. Acked-by: Vinod Koul Signed-off-by: Nikita Shubin Tested-by: Alexander Sverdlin Signed-off-by: Arnd Bergmann --- include/linux/platform_data/dma-ep93xx.h | 100 ------------------------------- 1 file changed, 100 deletions(-) delete mode 100644 include/linux/platform_data/dma-ep93xx.h (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-ep93xx.h b/include/linux/platform_data/dma-ep93xx.h deleted file mode 100644 index 9ec5cdd5a1eb..000000000000 --- a/include/linux/platform_data/dma-ep93xx.h +++ /dev/null @@ -1,100 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_ARCH_DMA_H -#define __ASM_ARCH_DMA_H - -#include -#include -#include -#include -#include -#include - -/* - * M2P channels. - * - * Note that these values are also directly used for setting the PPALLOC - * register. - */ -#define EP93XX_DMA_I2S1 0 -#define EP93XX_DMA_I2S2 1 -#define EP93XX_DMA_AAC1 2 -#define EP93XX_DMA_AAC2 3 -#define EP93XX_DMA_AAC3 4 -#define EP93XX_DMA_I2S3 5 -#define EP93XX_DMA_UART1 6 -#define EP93XX_DMA_UART2 7 -#define EP93XX_DMA_UART3 8 -#define EP93XX_DMA_IRDA 9 -/* M2M channels */ -#define EP93XX_DMA_SSP 10 -#define EP93XX_DMA_IDE 11 - -/** - * struct ep93xx_dma_data - configuration data for the EP93xx dmaengine - * @port: peripheral which is requesting the channel - * @direction: TX/RX channel - * @name: optional name for the channel, this is displayed in /proc/interrupts - * - * This information is passed as private channel parameter in a filter - * function. Note that this is only needed for slave/cyclic channels. For - * memcpy channels %NULL data should be passed. - */ -struct ep93xx_dma_data { - int port; - enum dma_transfer_direction direction; - const char *name; -}; - -/** - * struct ep93xx_dma_chan_data - platform specific data for a DMA channel - * @name: name of the channel, used for getting the right clock for the channel - * @base: mapped registers - * @irq: interrupt number used by this channel - */ -struct ep93xx_dma_chan_data { - const char *name; - void __iomem *base; - int irq; -}; - -/** - * struct ep93xx_dma_platform_data - platform data for the dmaengine driver - * @channels: array of channels which are passed to the driver - * @num_channels: number of channels in the array - * - * This structure is passed to the DMA engine driver via platform data. For - * M2P channels, contract is that even channels are for TX and odd for RX. - * There is no requirement for the M2M channels. - */ -struct ep93xx_dma_platform_data { - struct ep93xx_dma_chan_data *channels; - size_t num_channels; -}; - -static inline bool ep93xx_dma_chan_is_m2p(struct dma_chan *chan) -{ - if (device_is_compatible(chan->device->dev, "cirrus,ep9301-dma-m2p")) - return true; - - return !strcmp(dev_name(chan->device->dev), "ep93xx-dma-m2p"); -} - -/** - * ep93xx_dma_chan_direction - returns direction the channel can be used - * @chan: channel - * - * This function can be used in filter functions to find out whether the - * channel supports given DMA direction. Only M2P channels have such - * limitation, for M2M channels the direction is configurable. - */ -static inline enum dma_transfer_direction -ep93xx_dma_chan_direction(struct dma_chan *chan) -{ - if (!ep93xx_dma_chan_is_m2p(chan)) - return DMA_TRANS_NONE; - - /* even channels are for TX, odd for RX */ - return (chan->chan_id % 2 == 0) ? DMA_MEM_TO_DEV : DMA_DEV_TO_MEM; -} - -#endif /* __ASM_ARCH_DMA_H */ -- cgit v1.2.3 From 433d7ce2d86d21274838c9e8c796f4232cd13cdb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Aug 2024 15:38:12 -0700 Subject: security,bpf: constify struct path in bpf_token_create() LSM hook There is no reason why struct path pointer shouldn't be const-qualified when being passed into bpf_token_create() LSM hook. Add that const. Acked-by: Paul Moore (LSM/SELinux) Suggested-by: Al Viro Signed-off-by: Andrii Nakryiko --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 855db460e08b..462b55378241 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -431,7 +431,7 @@ LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, - struct path *path) + const struct path *path) LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) diff --git a/include/linux/security.h b/include/linux/security.h index 1390f1efb4f0..31523a2c71c4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2137,7 +2137,7 @@ extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); extern void security_bpf_prog_free(struct bpf_prog *prog); extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path); + const struct path *path); extern void security_bpf_token_free(struct bpf_token *token); extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd); extern int security_bpf_token_capable(const struct bpf_token *token, int cap); @@ -2177,7 +2177,7 @@ static inline void security_bpf_prog_free(struct bpf_prog *prog) { } static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path) + const struct path *path) { return 0; } -- cgit v1.2.3 From da2f660b3ba1be33310452959ab72d1d7ce39350 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 11 Sep 2024 13:17:46 -0700 Subject: net/mlx5: fs, make get_root_namespace API function As preparation for HW Steering support, where the function get_root_namespace() is needed to get root FDB, make it an API function and rename it to mlx5_get_root_namespace(). Reviewed-by: Yevgeny Kliteynik Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Link: https://patch.msgid.link/20240911201757.1505453-5-saeed@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 3fb428ce7d1c..b744e554f014 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -342,4 +342,7 @@ void mlx5_packet_reformat_dealloc(struct mlx5_core_dev *dev, struct mlx5_pkt_reformat *reformat); u32 mlx5_flow_table_id(struct mlx5_flow_table *ft); + +struct mlx5_flow_root_namespace * +mlx5_get_root_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type ns_type); #endif -- cgit v1.2.3 From 9947204cdad97d22d171039019a4aad4d6899cdd Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 11 Sep 2024 13:17:51 -0700 Subject: net/mlx5: Add device cap for supporting hot reset in sync reset flow New devices with new FW can support sync reset for firmware activate using hot reset. Add capability for supporting it and add MFRL field to query from FW which type of PCI reset method to use while handling sync reset events. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20240911201757.1505453-10-saeed@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2045575b70d4..620a5c305123 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1856,7 +1856,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_328[0x2]; u8 relaxed_ordering_read[0x1]; u8 log_max_pd[0x5]; - u8 reserved_at_330[0x6]; + u8 reserved_at_330[0x5]; + u8 pcie_reset_using_hotreset_method[0x1]; u8 pci_sync_for_fw_update_with_driver_unload[0x1]; u8 vnic_env_cnt_steering_fail[0x1]; u8 vport_counter_local_loopback[0x1]; @@ -11188,6 +11189,11 @@ struct mlx5_ifc_mcda_reg_bits { u8 data[][0x20]; }; +enum { + MLX5_MFRL_REG_PCI_RESET_METHOD_LINK_TOGGLE = 0, + MLX5_MFRL_REG_PCI_RESET_METHOD_HOT_RESET = 1, +}; + enum { MLX5_MFRL_REG_RESET_STATE_IDLE = 0, MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION = 1, @@ -11215,7 +11221,8 @@ struct mlx5_ifc_mfrl_reg_bits { u8 pci_sync_for_fw_update_start[0x1]; u8 pci_sync_for_fw_update_resp[0x2]; u8 rst_type_sel[0x3]; - u8 reserved_at_28[0x4]; + u8 pci_reset_req_method[0x3]; + u8 reserved_at_2b[0x1]; u8 reset_state[0x4]; u8 reset_type[0x8]; u8 reset_level[0x8]; -- cgit v1.2.3 From 5bd877093fd0b2e9e5f0c03b466669f761b5849c Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 11 Sep 2024 13:17:55 -0700 Subject: net/mlx5: Add NOT_READY command return status Add a new command status MLX5_CMD_STAT_NOT_READY to handle cases where the firmware is not ready. Signed-off-by: Shay Drory Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed Reviewed-by: Jacob Keller Reviewed-by: Kalesh AP Link: https://patch.msgid.link/20240911201757.1505453-14-saeed@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index a94bc9e3af96..d0f7d1f36c5e 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1449,6 +1449,7 @@ enum { MLX5_CMD_STAT_BAD_SYS_STATE_ERR = 0x4, MLX5_CMD_STAT_BAD_RES_ERR = 0x5, MLX5_CMD_STAT_RES_BUSY = 0x6, + MLX5_CMD_STAT_NOT_READY = 0x7, MLX5_CMD_STAT_LIM_ERR = 0x8, MLX5_CMD_STAT_BAD_RES_STATE_ERR = 0x9, MLX5_CMD_STAT_IX_ERR = 0xa, -- cgit v1.2.3 From 907936b6f4e630718cc31ddea79cc76a3e32080a Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 9 Sep 2024 13:05:04 +0300 Subject: net/mlx5: Handle memory scheme ODP capabilities When running over new FW that supports the new memory scheme ODP, set the cap in the FW to signal the FW we are working in the new scheme. In the memory scheme ODP the per_transport_service capabilities are RO for the driver so we skip their setting. Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909100504.29797-9-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 154095256d0d..57c9b18c3adb 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1389,9 +1389,13 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_ODP(mdev, cap)\ MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap) -#define MLX5_CAP_ODP_SCHEME(mdev, cap) \ - MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ - transport_page_fault_scheme_cap.cap) +#define MLX5_CAP_ODP_SCHEME(mdev, cap) \ + (MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ + mem_page_fault) ? \ + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ + memory_page_fault_scheme_cap.cap) : \ + MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, \ + transport_page_fault_scheme_cap.cap)) #define MLX5_CAP_ODP_MAX(mdev, cap)\ MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->max, cap) -- cgit v1.2.3 From 8d159eb2117b2e3697a31785662b653938f007cb Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Mon, 9 Sep 2024 20:30:23 +0300 Subject: RDMA/mlx5: Use IB set_netdev and get_netdev functions The IB layer provides a common interface to store and get net devices associated to an IB device port (ib_device_set_netdev() and ib_device_get_netdev()). Previously, mlx5_ib stored and managed the associated net devices internally. Replace internal net device management in mlx5_ib with ib_device_set_netdev() when attaching/detaching a net device and ib_device_get_netdev() when retrieving the net device. Export ib_device_get_netdev(). For mlx5 representors/PFs/VFs and lag creation we replace the netdev assignments with the IB set/get netdev functions. In active-backup mode lag the active slave net device is stored in the lag itself. To assure the net device stored in a lag bond IB device is the active slave we implement the following: - mlx5_core: when modifying the slave of a bond we send the internal driver event MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE. - mlx5_ib: when catching the event call ib_device_set_netdev() This patch also ensures the correct IB events are sent in switchdev lag. While at it, when in multiport eswitch mode, only a single IB device is created for all ports. The said IB device will receive all netdev events of its VFs once loaded, thus to avoid overwriting the mapping of PF IB device to PF netdev, ignore NETDEV_REGISTER events if the ib device has already been mapped to a netdev. Signed-off-by: Chiara Meiohas Signed-off-by: Michael Guralnik Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 1 + include/linux/mlx5/driver.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 57c9b18c3adb..dac33cfe9c0c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -371,6 +371,7 @@ enum mlx5_driver_event { MLX5_DRIVER_EVENT_SF_PEER_DEVLINK, MLX5_DRIVER_EVENT_AFFILIATION_DONE, MLX5_DRIVER_EVENT_AFFILIATION_REMOVED, + MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a96438ded15f..46a7a3d11048 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -643,6 +643,7 @@ struct mlx5_priv { struct mlx5_sf_hw_table *sf_hw_table; struct mlx5_sf_table *sf_table; #endif + struct blocking_notifier_head lag_nh; }; enum mlx5_device_state { @@ -1181,7 +1182,6 @@ bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev); -struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, struct net_device *slave); int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev, -- cgit v1.2.3 From 96f8052822e03c6f49b6b28fc1d6e5e0522ecbb9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Sep 2024 15:39:54 -0700 Subject: locking/mutex: Define mutex_init() once With CONFIG_PREEMPT_RT disabled __mutex_init() is a function. With CONFIG_PREEMPT_RT enabled, __mutex_init() is a macro. I assume this is why mutex_init() is defined twice as exactly the same macro. Prepare for introducing a new macro for mutex initialization by combining the two identical mutex_init() definitions into a single definition. This patch does not change any functionality because the C preprocessor expands macros when it encounters the macro name and not when a macro definition is encountered. See also commit bb630f9f7a7d ("locking/rtmutex: Add mutex variant for RT"). Acked-by: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240912223956.3554086-2-bvanassche@acm.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mutex.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index a561c629d89f..ef617089db19 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -49,7 +49,6 @@ static inline void mutex_destroy(struct mutex *lock) {} #endif -#ifndef CONFIG_PREEMPT_RT /** * mutex_init - initialize the mutex * @mutex: the mutex to be initialized @@ -65,6 +64,7 @@ do { \ __mutex_init((mutex), #mutex, &__key); \ } while (0) +#ifndef CONFIG_PREEMPT_RT #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -111,12 +111,6 @@ do { \ __mutex_rt_init((mutex), name, key); \ } while (0) -#define mutex_init(mutex) \ -do { \ - static struct lock_class_key __key; \ - \ - __mutex_init((mutex), #mutex, &__key); \ -} while (0) #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES -- cgit v1.2.3 From e837d833a13461c10f265a65ce6612e6dd43e76f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Sep 2024 15:39:55 -0700 Subject: locking/mutex: Introduce mutex_init_with_key() The following pattern occurs 5 times in kernel drivers: lockdep_register_key(key); __mutex_init(mutex, name, key); In several cases the 'name' argument matches #mutex. Hence, introduce the mutex_init_with_key() macro. This macro derives the 'name' argument from the 'mutex' argument. Suggested-by: Andy Shevchenko Acked-by: Peter Zijlstra (Intel) Reviewed-by: Andy Shevchenko Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240912223956.3554086-3-bvanassche@acm.org Signed-off-by: Greg Kroah-Hartman --- include/linux/mutex.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index ef617089db19..2bf91b57591b 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -64,6 +64,17 @@ do { \ __mutex_init((mutex), #mutex, &__key); \ } while (0) +/** + * mutex_init_with_key - initialize a mutex with a given lockdep key + * @mutex: the mutex to be initialized + * @key: the lockdep key to be associated with the mutex + * + * Initialize the mutex to the unlocked state. + * + * It is not allowed to initialize an already locked mutex. + */ +#define mutex_init_with_key(mutex, key) __mutex_init((mutex), #mutex, (key)) + #ifndef CONFIG_PREEMPT_RT #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ -- cgit v1.2.3 From 4b7ff9ab98af11a477d50f08382bcc4c2f899926 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 13 Sep 2024 10:15:56 +0200 Subject: mm, slab: restore kerneldoc for kmem_cache_create() As kmem_cache_create() became a _Generic() wrapper macro, it currently has no kerneldoc despite being the main API to use. Add it. Also adjust kmem_cache_create_usercopy() kerneldoc to indicate it is now a legacy wrapper. Also expand the kerneldoc for struct kmem_cache_args, especially for the freeptr_offset field, where important details were removed with the removal of kmem_cache_create_rcu(). Signed-off-by: Vlastimil Babka Reviewed-by: Christian Brauner --- include/linux/slab.h | 114 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 91 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 331412a9f4f2..6a8ab7ef3af7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -242,19 +242,72 @@ bool slab_is_available(void); /** * struct kmem_cache_args - Less common arguments for kmem_cache_create() - * @align: The required alignment for the objects. - * @useroffset: Usercopy region offset - * @usersize: Usercopy region size - * @freeptr_offset: Custom offset for the free pointer in RCU caches - * @use_freeptr_offset: Whether a @freeptr_offset is used - * @ctor: A constructor for the objects. + * + * Any uninitialized fields of the structure are interpreted as unused. The + * exception is @freeptr_offset where %0 is a valid value, so + * @use_freeptr_offset must be also set to %true in order to interpret the field + * as used. For @useroffset %0 is also valid, but only with non-%0 + * @usersize. + * + * When %NULL args is passed to kmem_cache_create(), it is equivalent to all + * fields unused. */ struct kmem_cache_args { + /** + * @align: The required alignment for the objects. + * + * %0 means no specific alignment is requested. + */ unsigned int align; + /** + * @useroffset: Usercopy region offset. + * + * %0 is a valid offset, when @usersize is non-%0 + */ unsigned int useroffset; + /** + * @usersize: Usercopy region size. + * + * %0 means no usercopy region is specified. + */ unsigned int usersize; + /** + * @freeptr_offset: Custom offset for the free pointer + * in &SLAB_TYPESAFE_BY_RCU caches + * + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer + * outside of the object. This might cause the object to grow in size. + * Cache creators that have a reason to avoid this can specify a custom + * free pointer offset in their struct where the free pointer will be + * placed. + * + * Note that placing the free pointer inside the object requires the + * caller to ensure that no fields are invalidated that are required to + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for + * details). + * + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset + * is specified, %use_freeptr_offset must be set %true. + * + * Note that @ctor currently isn't supported with custom free pointers + * as a @ctor requires an external free pointer. + */ unsigned int freeptr_offset; + /** + * @use_freeptr_offset: Whether a @freeptr_offset is used. + */ bool use_freeptr_offset; + /** + * @ctor: A constructor for the objects. + * + * The constructor is invoked for each object in a newly allocated slab + * page. It is the cache user's responsibility to free object in the + * same state as after calling the constructor, or deal appropriately + * with any differences between a freshly constructed and a reallocated + * object. + * + * %NULL means no constructor. + */ void (*ctor)(void *); }; @@ -275,30 +328,20 @@ __kmem_cache_create(const char *name, unsigned int size, unsigned int align, } /** - * kmem_cache_create_usercopy - Create a cache with a region suitable - * for copying to userspace + * kmem_cache_create_usercopy - Create a kmem cache with a region suitable + * for copying to userspace. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. * @align: The required alignment for the objects. * @flags: SLAB flags * @useroffset: Usercopy region offset * @usersize: Usercopy region size - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are + * @ctor: A constructor for the objects, or %NULL. * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. + * This is a legacy wrapper, new code should use either KMEM_CACHE_USERCOPY() + * if whitelisting a single field is sufficient, or kmem_cache_create() with + * the necessary parameters passed via the args parameter (see + * &struct kmem_cache_args) * * Return: a pointer to the cache on success, NULL on failure. */ @@ -333,6 +376,31 @@ __kmem_cache_default_args(const char *name, unsigned int size, return __kmem_cache_create_args(name, size, &kmem_default_args, flags); } +/** + * kmem_cache_create - Create a kmem cache. + * @__name: A string which is used in /proc/slabinfo to identify this cache. + * @__object_size: The size of objects to be created in this cache. + * @__args: Optional arguments, see &struct kmem_cache_args. Passing %NULL + * means defaults will be used for all the arguments. + * + * This is currently implemented as a macro using ``_Generic()`` to call + * either the new variant of the function, or a legacy one. + * + * The new variant has 4 parameters: + * ``kmem_cache_create(name, object_size, args, flags)`` + * + * See __kmem_cache_create_args() which implements this. + * + * The legacy variant has 5 parameters: + * ``kmem_cache_create(name, object_size, align, flags, ctor)`` + * + * The align and ctor parameters map to the respective fields of + * &struct kmem_cache_args + * + * Context: Cannot be called within a interrupt, but can be interrupted. + * + * Return: a pointer to the cache on success, NULL on failure. + */ #define kmem_cache_create(__name, __object_size, __args, ...) \ _Generic((__args), \ struct kmem_cache_args *: __kmem_cache_create_args, \ -- cgit v1.2.3 From 6a36d828bdef0e02b1e6c12e2160f5b83be6aab5 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 13 Sep 2024 02:09:55 +0100 Subject: driver core: attribute_container: Remove unused functions I can't find any use of 'attribute_container_add_class_device_adapter' or 'attribute_container_trigger' in git history. Their export decls went in 2006: commit 1740757e8f94 ("[PATCH] Driver Core: remove unused exports") and their docs disappeared in 2016: commit 47cb398dd75a ("Docs: sphinxify device-drivers.tmpl") Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20240913010955.1393995-1-linux@treblig.org Signed-off-by: Greg Kroah-Hartman --- include/linux/attribute_container.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/attribute_container.h b/include/linux/attribute_container.h index e4004d1e6725..b3643de9931d 100644 --- a/include/linux/attribute_container.h +++ b/include/linux/attribute_container.h @@ -61,14 +61,8 @@ int attribute_container_device_trigger_safe(struct device *dev, int (*undo)(struct attribute_container *, struct device *, struct device *)); -void attribute_container_trigger(struct device *dev, - int (*fn)(struct attribute_container *, - struct device *)); int attribute_container_add_attrs(struct device *classdev); int attribute_container_add_class_device(struct device *classdev); -int attribute_container_add_class_device_adapter(struct attribute_container *cont, - struct device *dev, - struct device *classdev); void attribute_container_remove_attrs(struct device *classdev); void attribute_container_class_device_del(struct device *classdev); struct attribute_container *attribute_container_classdev_to_container(struct device *); -- cgit v1.2.3 From 2cb4acf2140be8a4f299c0b887cc314845ef6ec8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 13 Sep 2024 07:11:42 -0700 Subject: hwmon: Remove devm_hwmon_device_unregister() API function devm_hwmon_device_unregister() has no in-tree user, and its implementation is wrong since it does not pass the to-be-removed hardware monitoring device as parameter. I do not envision a valid use for it; drivers needing it should not have called devm_hwmon_device_register_with_info() in the first place. Remove it. Reported-by: Matthew Sanders Closes: https://lore.kernel.org/linux-hwmon/488b3bdf870ea76c4b943dbe5fd15ac8113019dc.camel@kernel.org/ Signed-off-by: Guenter Roeck --- include/linux/hwmon.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index e94314760aab..5c6a421ad580 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -481,7 +481,6 @@ devm_hwmon_device_register_with_info(struct device *dev, const struct attribute_group **extra_groups); void hwmon_device_unregister(struct device *dev); -void devm_hwmon_device_unregister(struct device *dev); int hwmon_notify_event(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel); -- cgit v1.2.3 From d175ee98fe545d2c56df22751314584cce228307 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 2 Sep 2024 21:17:18 +0200 Subject: mm: Define VM_DROPPABLE for powerpc/32 Commit 9651fcedf7b9 ("mm: add MAP_DROPPABLE for designating always lazily freeable mappings") only adds VM_DROPPABLE for 64 bits architectures. In order to also use the getrandom vDSO implementation on powerpc/32, use VM_ARCH_1 for VM_DROPPABLE on powerpc/32. This is possible because VM_ARCH_1 is used for VM_SAO on powerpc and VM_SAO is only for powerpc/64. It is used in combination with PROT_SAO in some parts of code that are restricted to CONFIG_PPC64 through #ifdefs, it is therefore possible to define VM_SAO for CONFIG_PPC64 only. Signed-off-by: Christophe Leroy Acked-by: Michael Ellerman Signed-off-by: Jason A. Donenfeld --- include/linux/mm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6549d0979b28..028847f39442 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -359,7 +359,7 @@ extern unsigned int kobjsize(const void *objp); #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ -#elif defined(CONFIG_PPC) +#elif defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 @@ -409,6 +409,8 @@ extern unsigned int kobjsize(const void *objp); #ifdef CONFIG_64BIT #define VM_DROPPABLE_BIT 40 #define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) +#elif defined(CONFIG_PPC32) +#define VM_DROPPABLE VM_ARCH_1 #else #define VM_DROPPABLE VM_NONE #endif -- cgit v1.2.3 From 2b018086143d638de8d67ae5be6e8c1afb413193 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:46 -0700 Subject: blk-mq: unconditional nr_integrity_segments Always defining the field will make using it easier and less error prone in future patches. There shouldn't be any downside to this: the field fits in what would otherwise be a 2-byte hole, so we're not saving space by conditionally leaving it out. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-2-kbusch@meta.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8d304b1d16b1..4fecf46ef681 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -149,10 +149,7 @@ struct request { * physical address coalescing is performed. */ unsigned short nr_phys_segments; - -#ifdef CONFIG_BLK_DEV_INTEGRITY unsigned short nr_integrity_segments; -#endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; -- cgit v1.2.3 From d2c5b1faccd5ef6352456f817e941945d3b3fe62 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 11:28:50 -0700 Subject: block: provide a request helper for user integrity segments Provide a helper to keep the request flags and nr_integrity_segments in sync with the bio's integrity payload. This is an integrity equivalent to the normal data helper function, 'blk_rq_map_user()'. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Kanchan Joshi Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913182854.2445457-6-kbusch@meta.com Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index de98049b7ded..793dbb1e0672 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -28,6 +28,8 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); +int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, + ssize_t bytes, u32 seed); static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) @@ -102,6 +104,13 @@ static inline int blk_rq_map_integrity_sg(struct request_queue *q, { return 0; } +static inline int blk_rq_integrity_map_user(struct request *rq, + void __user *ubuf, + ssize_t bytes, + u32 seed) +{ + return -EINVAL; +} static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; -- cgit v1.2.3 From 76c313f658d2752e8527610677164aa7094ef7a5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Sep 2024 12:17:46 -0700 Subject: blk-integrity: improved sg segment mapping Make the integrity mapping more like data mapping, blk_rq_map_sg. Use the request to validate the segment count, and update the callers so they don't have to. Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20240913191746.2628196-1-kbusch@meta.com Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 793dbb1e0672..676f8f860c47 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -25,8 +25,7 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, } #ifdef CONFIG_BLK_DEV_INTEGRITY -int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, - struct scatterlist *); +int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes, u32 seed); @@ -98,8 +97,7 @@ static inline int blk_rq_count_integrity_sg(struct request_queue *q, { return 0; } -static inline int blk_rq_map_integrity_sg(struct request_queue *q, - struct bio *b, +static inline int blk_rq_map_integrity_sg(struct request *q, struct scatterlist *s) { return 0; -- cgit v1.2.3 From 32556ce93bc45c730829083cb60f95a2728ea48b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 Sep 2024 21:17:48 +0200 Subject: bpf: Fix helper writes to read-only maps Lonial found an issue that despite user- and BPF-side frozen BPF map (like in case of .rodata), it was still possible to write into it from a BPF program side through specific helpers having ARG_PTR_TO_{LONG,INT} as arguments. In check_func_arg() when the argument is as mentioned, the meta->raw_mode is never set. Later, check_helper_mem_access(), under the case of PTR_TO_MAP_VALUE as register base type, it assumes BPF_READ for the subsequent call to check_map_access_type() and given the BPF map is read-only it succeeds. The helpers really need to be annotated as ARG_PTR_TO_{LONG,INT} | MEM_UNINIT when results are written into them as opposed to read out of them. The latter indicates that it's okay to pass a pointer to uninitialized memory as the memory is written to anyway. However, ARG_PTR_TO_{LONG,INT} is a special case of ARG_PTR_TO_FIXED_SIZE_MEM just with additional alignment requirement. So it is better to just get rid of the ARG_PTR_TO_{LONG,INT} special cases altogether and reuse the fixed size memory types. For this, add MEM_ALIGNED to additionally ensure alignment given these helpers write directly into the args via * = val. The .arg*_size has been initialized reflecting the actual sizeof(*). MEM_ALIGNED can only be used in combination with MEM_FIXED_SIZE annotated argument types, since in !MEM_FIXED_SIZE cases the verifier does not know the buffer size a priori and therefore cannot blindly write * = val. Fixes: 57c3bb725a3d ("bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types") Reported-by: Lonial Con Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20240913191754.13290-3-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7f3e8477d5e7..0c3893c47171 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -695,6 +695,11 @@ enum bpf_type_flag { /* DYNPTR points to xdp_buff */ DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS), + /* Memory must be aligned on some architectures, used in combination with + * MEM_FIXED_SIZE. + */ + MEM_ALIGNED = BIT(17 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -732,8 +737,6 @@ enum bpf_arg_type { ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ - ARG_PTR_TO_INT, /* pointer to int */ - ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ -- cgit v1.2.3 From 0cca961a026177af69044f10d6ae76d8ce043764 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 12 Sep 2024 11:00:25 +0530 Subject: PCI: Pass domain number to pci_bus_release_domain_nr() explicitly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pci_bus_release_domain_nr() API is supposed to free the domain number allocated by pci_bus_find_domain_nr(). Most of the callers of pci_bus_find_domain_nr(), store the domain number in pci_bus::domain_nr. As such, the pci_bus_release_domain_nr() implicitly frees the domain number by dereferencing 'struct pci_bus'. However, one of the callers of this API, the PCI endpoint subsystem, doesn't have 'struct pci_bus', so it only passes NULL. Due to this, the API will end up dereferencing the NULL pointer. To fix this issue, pass the domain number to this API explicitly. Since 'struct pci_bus' is not used for anything else other than extracting the domain number, it makes sense to pass the domain number directly. Fixes: 0328947c5032 ("PCI: endpoint: Assign PCI domain number for endpoint controllers") Closes: https://lore.kernel.org/linux-pci/c0c40ddb-bf64-4b22-9dd1-8dbb18aa2813@stanley.mountain Link: https://lore.kernel.org/linux-pci/20240912053025.25314-1-manivannan.sadhasivam@linaro.org Reported-by: Dan Carpenter Signed-off-by: Manivannan Sadhasivam [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 4cf89a4b4cbc..37d97bef060f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1884,7 +1884,7 @@ static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus) { return 0; } #endif int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent); -void pci_bus_release_domain_nr(struct pci_bus *bus, struct device *parent); +void pci_bus_release_domain_nr(struct device *parent, int domain_nr); #endif /* Some architectures require additional setup to direct VGA traffic */ -- cgit v1.2.3 From 1b8c9cb3151a541a4197d2bb449233064f747832 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 12 Sep 2024 09:19:49 +0800 Subject: MIPS: Remove the obsoleted code for include/linux/mv643xx.h Most of the drivers which used this header have been deleted, most of these code is obsoleted, move the only defines that are actually used into arch/powerpc/platforms/chrp/pegasos_eth.c and delete the file completely. Signed-off-by: Gaosheng Cui Link: https://patch.msgid.link/20240912011949.2726928-1-cuigaosheng1@huawei.com Signed-off-by: Jakub Kicinski --- include/linux/mv643xx.h | 921 ------------------------------------------------ 1 file changed, 921 deletions(-) delete mode 100644 include/linux/mv643xx.h (limited to 'include/linux') diff --git a/include/linux/mv643xx.h b/include/linux/mv643xx.h deleted file mode 100644 index 000b126acfb6..000000000000 --- a/include/linux/mv643xx.h +++ /dev/null @@ -1,921 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * mv643xx.h - MV-643XX Internal registers definition file. - * - * Copyright 2002 Momentum Computer, Inc. - * Author: Matthew Dharm - * Copyright 2002 GALILEO TECHNOLOGY, LTD. - */ -#ifndef __ASM_MV643XX_H -#define __ASM_MV643XX_H - -#include -#include -#include - -/****************************************/ -/* Processor Address Space */ -/****************************************/ - -/* DDR SDRAM BAR and size registers */ - -#define MV64340_CS_0_BASE_ADDR 0x008 -#define MV64340_CS_0_SIZE 0x010 -#define MV64340_CS_1_BASE_ADDR 0x208 -#define MV64340_CS_1_SIZE 0x210 -#define MV64340_CS_2_BASE_ADDR 0x018 -#define MV64340_CS_2_SIZE 0x020 -#define MV64340_CS_3_BASE_ADDR 0x218 -#define MV64340_CS_3_SIZE 0x220 - -/* Devices BAR and size registers */ - -#define MV64340_DEV_CS0_BASE_ADDR 0x028 -#define MV64340_DEV_CS0_SIZE 0x030 -#define MV64340_DEV_CS1_BASE_ADDR 0x228 -#define MV64340_DEV_CS1_SIZE 0x230 -#define MV64340_DEV_CS2_BASE_ADDR 0x248 -#define MV64340_DEV_CS2_SIZE 0x250 -#define MV64340_DEV_CS3_BASE_ADDR 0x038 -#define MV64340_DEV_CS3_SIZE 0x040 -#define MV64340_BOOTCS_BASE_ADDR 0x238 -#define MV64340_BOOTCS_SIZE 0x240 - -/* PCI 0 BAR and size registers */ - -#define MV64340_PCI_0_IO_BASE_ADDR 0x048 -#define MV64340_PCI_0_IO_SIZE 0x050 -#define MV64340_PCI_0_MEMORY0_BASE_ADDR 0x058 -#define MV64340_PCI_0_MEMORY0_SIZE 0x060 -#define MV64340_PCI_0_MEMORY1_BASE_ADDR 0x080 -#define MV64340_PCI_0_MEMORY1_SIZE 0x088 -#define MV64340_PCI_0_MEMORY2_BASE_ADDR 0x258 -#define MV64340_PCI_0_MEMORY2_SIZE 0x260 -#define MV64340_PCI_0_MEMORY3_BASE_ADDR 0x280 -#define MV64340_PCI_0_MEMORY3_SIZE 0x288 - -/* PCI 1 BAR and size registers */ -#define MV64340_PCI_1_IO_BASE_ADDR 0x090 -#define MV64340_PCI_1_IO_SIZE 0x098 -#define MV64340_PCI_1_MEMORY0_BASE_ADDR 0x0a0 -#define MV64340_PCI_1_MEMORY0_SIZE 0x0a8 -#define MV64340_PCI_1_MEMORY1_BASE_ADDR 0x0b0 -#define MV64340_PCI_1_MEMORY1_SIZE 0x0b8 -#define MV64340_PCI_1_MEMORY2_BASE_ADDR 0x2a0 -#define MV64340_PCI_1_MEMORY2_SIZE 0x2a8 -#define MV64340_PCI_1_MEMORY3_BASE_ADDR 0x2b0 -#define MV64340_PCI_1_MEMORY3_SIZE 0x2b8 - -/* SRAM base address */ -#define MV64340_INTEGRATED_SRAM_BASE_ADDR 0x268 - -/* internal registers space base address */ -#define MV64340_INTERNAL_SPACE_BASE_ADDR 0x068 - -/* Enables the CS , DEV_CS , PCI 0 and PCI 1 - windows above */ -#define MV64340_BASE_ADDR_ENABLE 0x278 - -/****************************************/ -/* PCI remap registers */ -/****************************************/ - /* PCI 0 */ -#define MV64340_PCI_0_IO_ADDR_REMAP 0x0f0 -#define MV64340_PCI_0_MEMORY0_LOW_ADDR_REMAP 0x0f8 -#define MV64340_PCI_0_MEMORY0_HIGH_ADDR_REMAP 0x320 -#define MV64340_PCI_0_MEMORY1_LOW_ADDR_REMAP 0x100 -#define MV64340_PCI_0_MEMORY1_HIGH_ADDR_REMAP 0x328 -#define MV64340_PCI_0_MEMORY2_LOW_ADDR_REMAP 0x2f8 -#define MV64340_PCI_0_MEMORY2_HIGH_ADDR_REMAP 0x330 -#define MV64340_PCI_0_MEMORY3_LOW_ADDR_REMAP 0x300 -#define MV64340_PCI_0_MEMORY3_HIGH_ADDR_REMAP 0x338 - /* PCI 1 */ -#define MV64340_PCI_1_IO_ADDR_REMAP 0x108 -#define MV64340_PCI_1_MEMORY0_LOW_ADDR_REMAP 0x110 -#define MV64340_PCI_1_MEMORY0_HIGH_ADDR_REMAP 0x340 -#define MV64340_PCI_1_MEMORY1_LOW_ADDR_REMAP 0x118 -#define MV64340_PCI_1_MEMORY1_HIGH_ADDR_REMAP 0x348 -#define MV64340_PCI_1_MEMORY2_LOW_ADDR_REMAP 0x310 -#define MV64340_PCI_1_MEMORY2_HIGH_ADDR_REMAP 0x350 -#define MV64340_PCI_1_MEMORY3_LOW_ADDR_REMAP 0x318 -#define MV64340_PCI_1_MEMORY3_HIGH_ADDR_REMAP 0x358 - -#define MV64340_CPU_PCI_0_HEADERS_RETARGET_CONTROL 0x3b0 -#define MV64340_CPU_PCI_0_HEADERS_RETARGET_BASE 0x3b8 -#define MV64340_CPU_PCI_1_HEADERS_RETARGET_CONTROL 0x3c0 -#define MV64340_CPU_PCI_1_HEADERS_RETARGET_BASE 0x3c8 -#define MV64340_CPU_GE_HEADERS_RETARGET_CONTROL 0x3d0 -#define MV64340_CPU_GE_HEADERS_RETARGET_BASE 0x3d8 -#define MV64340_CPU_IDMA_HEADERS_RETARGET_CONTROL 0x3e0 -#define MV64340_CPU_IDMA_HEADERS_RETARGET_BASE 0x3e8 - -/****************************************/ -/* CPU Control Registers */ -/****************************************/ - -#define MV64340_CPU_CONFIG 0x000 -#define MV64340_CPU_MODE 0x120 -#define MV64340_CPU_MASTER_CONTROL 0x160 -#define MV64340_CPU_CROSS_BAR_CONTROL_LOW 0x150 -#define MV64340_CPU_CROSS_BAR_CONTROL_HIGH 0x158 -#define MV64340_CPU_CROSS_BAR_TIMEOUT 0x168 - -/****************************************/ -/* SMP RegisterS */ -/****************************************/ - -#define MV64340_SMP_WHO_AM_I 0x200 -#define MV64340_SMP_CPU0_DOORBELL 0x214 -#define MV64340_SMP_CPU0_DOORBELL_CLEAR 0x21C -#define MV64340_SMP_CPU1_DOORBELL 0x224 -#define MV64340_SMP_CPU1_DOORBELL_CLEAR 0x22C -#define MV64340_SMP_CPU0_DOORBELL_MASK 0x234 -#define MV64340_SMP_CPU1_DOORBELL_MASK 0x23C -#define MV64340_SMP_SEMAPHOR0 0x244 -#define MV64340_SMP_SEMAPHOR1 0x24c -#define MV64340_SMP_SEMAPHOR2 0x254 -#define MV64340_SMP_SEMAPHOR3 0x25c -#define MV64340_SMP_SEMAPHOR4 0x264 -#define MV64340_SMP_SEMAPHOR5 0x26c -#define MV64340_SMP_SEMAPHOR6 0x274 -#define MV64340_SMP_SEMAPHOR7 0x27c - -/****************************************/ -/* CPU Sync Barrier Register */ -/****************************************/ - -#define MV64340_CPU_0_SYNC_BARRIER_TRIGGER 0x0c0 -#define MV64340_CPU_0_SYNC_BARRIER_VIRTUAL 0x0c8 -#define MV64340_CPU_1_SYNC_BARRIER_TRIGGER 0x0d0 -#define MV64340_CPU_1_SYNC_BARRIER_VIRTUAL 0x0d8 - -/****************************************/ -/* CPU Access Protect */ -/****************************************/ - -#define MV64340_CPU_PROTECT_WINDOW_0_BASE_ADDR 0x180 -#define MV64340_CPU_PROTECT_WINDOW_0_SIZE 0x188 -#define MV64340_CPU_PROTECT_WINDOW_1_BASE_ADDR 0x190 -#define MV64340_CPU_PROTECT_WINDOW_1_SIZE 0x198 -#define MV64340_CPU_PROTECT_WINDOW_2_BASE_ADDR 0x1a0 -#define MV64340_CPU_PROTECT_WINDOW_2_SIZE 0x1a8 -#define MV64340_CPU_PROTECT_WINDOW_3_BASE_ADDR 0x1b0 -#define MV64340_CPU_PROTECT_WINDOW_3_SIZE 0x1b8 - - -/****************************************/ -/* CPU Error Report */ -/****************************************/ - -#define MV64340_CPU_ERROR_ADDR_LOW 0x070 -#define MV64340_CPU_ERROR_ADDR_HIGH 0x078 -#define MV64340_CPU_ERROR_DATA_LOW 0x128 -#define MV64340_CPU_ERROR_DATA_HIGH 0x130 -#define MV64340_CPU_ERROR_PARITY 0x138 -#define MV64340_CPU_ERROR_CAUSE 0x140 -#define MV64340_CPU_ERROR_MASK 0x148 - -/****************************************/ -/* CPU Interface Debug Registers */ -/****************************************/ - -#define MV64340_PUNIT_SLAVE_DEBUG_LOW 0x360 -#define MV64340_PUNIT_SLAVE_DEBUG_HIGH 0x368 -#define MV64340_PUNIT_MASTER_DEBUG_LOW 0x370 -#define MV64340_PUNIT_MASTER_DEBUG_HIGH 0x378 -#define MV64340_PUNIT_MMASK 0x3e4 - -/****************************************/ -/* Integrated SRAM Registers */ -/****************************************/ - -#define MV64340_SRAM_CONFIG 0x380 -#define MV64340_SRAM_TEST_MODE 0X3F4 -#define MV64340_SRAM_ERROR_CAUSE 0x388 -#define MV64340_SRAM_ERROR_ADDR 0x390 -#define MV64340_SRAM_ERROR_ADDR_HIGH 0X3F8 -#define MV64340_SRAM_ERROR_DATA_LOW 0x398 -#define MV64340_SRAM_ERROR_DATA_HIGH 0x3a0 -#define MV64340_SRAM_ERROR_DATA_PARITY 0x3a8 - -/****************************************/ -/* SDRAM Configuration */ -/****************************************/ - -#define MV64340_SDRAM_CONFIG 0x1400 -#define MV64340_D_UNIT_CONTROL_LOW 0x1404 -#define MV64340_D_UNIT_CONTROL_HIGH 0x1424 -#define MV64340_SDRAM_TIMING_CONTROL_LOW 0x1408 -#define MV64340_SDRAM_TIMING_CONTROL_HIGH 0x140c -#define MV64340_SDRAM_ADDR_CONTROL 0x1410 -#define MV64340_SDRAM_OPEN_PAGES_CONTROL 0x1414 -#define MV64340_SDRAM_OPERATION 0x1418 -#define MV64340_SDRAM_MODE 0x141c -#define MV64340_EXTENDED_DRAM_MODE 0x1420 -#define MV64340_SDRAM_CROSS_BAR_CONTROL_LOW 0x1430 -#define MV64340_SDRAM_CROSS_BAR_CONTROL_HIGH 0x1434 -#define MV64340_SDRAM_CROSS_BAR_TIMEOUT 0x1438 -#define MV64340_SDRAM_ADDR_CTRL_PADS_CALIBRATION 0x14c0 -#define MV64340_SDRAM_DATA_PADS_CALIBRATION 0x14c4 - -/****************************************/ -/* SDRAM Error Report */ -/****************************************/ - -#define MV64340_SDRAM_ERROR_DATA_LOW 0x1444 -#define MV64340_SDRAM_ERROR_DATA_HIGH 0x1440 -#define MV64340_SDRAM_ERROR_ADDR 0x1450 -#define MV64340_SDRAM_RECEIVED_ECC 0x1448 -#define MV64340_SDRAM_CALCULATED_ECC 0x144c -#define MV64340_SDRAM_ECC_CONTROL 0x1454 -#define MV64340_SDRAM_ECC_ERROR_COUNTER 0x1458 - -/******************************************/ -/* Controlled Delay Line (CDL) Registers */ -/******************************************/ - -#define MV64340_DFCDL_CONFIG0 0x1480 -#define MV64340_DFCDL_CONFIG1 0x1484 -#define MV64340_DLL_WRITE 0x1488 -#define MV64340_DLL_READ 0x148c -#define MV64340_SRAM_ADDR 0x1490 -#define MV64340_SRAM_DATA0 0x1494 -#define MV64340_SRAM_DATA1 0x1498 -#define MV64340_SRAM_DATA2 0x149c -#define MV64340_DFCL_PROBE 0x14a0 - -/******************************************/ -/* Debug Registers */ -/******************************************/ - -#define MV64340_DUNIT_DEBUG_LOW 0x1460 -#define MV64340_DUNIT_DEBUG_HIGH 0x1464 -#define MV64340_DUNIT_MMASK 0X1b40 - -/****************************************/ -/* Device Parameters */ -/****************************************/ - -#define MV64340_DEVICE_BANK0_PARAMETERS 0x45c -#define MV64340_DEVICE_BANK1_PARAMETERS 0x460 -#define MV64340_DEVICE_BANK2_PARAMETERS 0x464 -#define MV64340_DEVICE_BANK3_PARAMETERS 0x468 -#define MV64340_DEVICE_BOOT_BANK_PARAMETERS 0x46c -#define MV64340_DEVICE_INTERFACE_CONTROL 0x4c0 -#define MV64340_DEVICE_INTERFACE_CROSS_BAR_CONTROL_LOW 0x4c8 -#define MV64340_DEVICE_INTERFACE_CROSS_BAR_CONTROL_HIGH 0x4cc -#define MV64340_DEVICE_INTERFACE_CROSS_BAR_TIMEOUT 0x4c4 - -/****************************************/ -/* Device interrupt registers */ -/****************************************/ - -#define MV64340_DEVICE_INTERRUPT_CAUSE 0x4d0 -#define MV64340_DEVICE_INTERRUPT_MASK 0x4d4 -#define MV64340_DEVICE_ERROR_ADDR 0x4d8 -#define MV64340_DEVICE_ERROR_DATA 0x4dc -#define MV64340_DEVICE_ERROR_PARITY 0x4e0 - -/****************************************/ -/* Device debug registers */ -/****************************************/ - -#define MV64340_DEVICE_DEBUG_LOW 0x4e4 -#define MV64340_DEVICE_DEBUG_HIGH 0x4e8 -#define MV64340_RUNIT_MMASK 0x4f0 - -/****************************************/ -/* PCI Slave Address Decoding registers */ -/****************************************/ - -#define MV64340_PCI_0_CS_0_BANK_SIZE 0xc08 -#define MV64340_PCI_1_CS_0_BANK_SIZE 0xc88 -#define MV64340_PCI_0_CS_1_BANK_SIZE 0xd08 -#define MV64340_PCI_1_CS_1_BANK_SIZE 0xd88 -#define MV64340_PCI_0_CS_2_BANK_SIZE 0xc0c -#define MV64340_PCI_1_CS_2_BANK_SIZE 0xc8c -#define MV64340_PCI_0_CS_3_BANK_SIZE 0xd0c -#define MV64340_PCI_1_CS_3_BANK_SIZE 0xd8c -#define MV64340_PCI_0_DEVCS_0_BANK_SIZE 0xc10 -#define MV64340_PCI_1_DEVCS_0_BANK_SIZE 0xc90 -#define MV64340_PCI_0_DEVCS_1_BANK_SIZE 0xd10 -#define MV64340_PCI_1_DEVCS_1_BANK_SIZE 0xd90 -#define MV64340_PCI_0_DEVCS_2_BANK_SIZE 0xd18 -#define MV64340_PCI_1_DEVCS_2_BANK_SIZE 0xd98 -#define MV64340_PCI_0_DEVCS_3_BANK_SIZE 0xc14 -#define MV64340_PCI_1_DEVCS_3_BANK_SIZE 0xc94 -#define MV64340_PCI_0_DEVCS_BOOT_BANK_SIZE 0xd14 -#define MV64340_PCI_1_DEVCS_BOOT_BANK_SIZE 0xd94 -#define MV64340_PCI_0_P2P_MEM0_BAR_SIZE 0xd1c -#define MV64340_PCI_1_P2P_MEM0_BAR_SIZE 0xd9c -#define MV64340_PCI_0_P2P_MEM1_BAR_SIZE 0xd20 -#define MV64340_PCI_1_P2P_MEM1_BAR_SIZE 0xda0 -#define MV64340_PCI_0_P2P_I_O_BAR_SIZE 0xd24 -#define MV64340_PCI_1_P2P_I_O_BAR_SIZE 0xda4 -#define MV64340_PCI_0_CPU_BAR_SIZE 0xd28 -#define MV64340_PCI_1_CPU_BAR_SIZE 0xda8 -#define MV64340_PCI_0_INTERNAL_SRAM_BAR_SIZE 0xe00 -#define MV64340_PCI_1_INTERNAL_SRAM_BAR_SIZE 0xe80 -#define MV64340_PCI_0_EXPANSION_ROM_BAR_SIZE 0xd2c -#define MV64340_PCI_1_EXPANSION_ROM_BAR_SIZE 0xd9c -#define MV64340_PCI_0_BASE_ADDR_REG_ENABLE 0xc3c -#define MV64340_PCI_1_BASE_ADDR_REG_ENABLE 0xcbc -#define MV64340_PCI_0_CS_0_BASE_ADDR_REMAP 0xc48 -#define MV64340_PCI_1_CS_0_BASE_ADDR_REMAP 0xcc8 -#define MV64340_PCI_0_CS_1_BASE_ADDR_REMAP 0xd48 -#define MV64340_PCI_1_CS_1_BASE_ADDR_REMAP 0xdc8 -#define MV64340_PCI_0_CS_2_BASE_ADDR_REMAP 0xc4c -#define MV64340_PCI_1_CS_2_BASE_ADDR_REMAP 0xccc -#define MV64340_PCI_0_CS_3_BASE_ADDR_REMAP 0xd4c -#define MV64340_PCI_1_CS_3_BASE_ADDR_REMAP 0xdcc -#define MV64340_PCI_0_CS_0_BASE_HIGH_ADDR_REMAP 0xF04 -#define MV64340_PCI_1_CS_0_BASE_HIGH_ADDR_REMAP 0xF84 -#define MV64340_PCI_0_CS_1_BASE_HIGH_ADDR_REMAP 0xF08 -#define MV64340_PCI_1_CS_1_BASE_HIGH_ADDR_REMAP 0xF88 -#define MV64340_PCI_0_CS_2_BASE_HIGH_ADDR_REMAP 0xF0C -#define MV64340_PCI_1_CS_2_BASE_HIGH_ADDR_REMAP 0xF8C -#define MV64340_PCI_0_CS_3_BASE_HIGH_ADDR_REMAP 0xF10 -#define MV64340_PCI_1_CS_3_BASE_HIGH_ADDR_REMAP 0xF90 -#define MV64340_PCI_0_DEVCS_0_BASE_ADDR_REMAP 0xc50 -#define MV64340_PCI_1_DEVCS_0_BASE_ADDR_REMAP 0xcd0 -#define MV64340_PCI_0_DEVCS_1_BASE_ADDR_REMAP 0xd50 -#define MV64340_PCI_1_DEVCS_1_BASE_ADDR_REMAP 0xdd0 -#define MV64340_PCI_0_DEVCS_2_BASE_ADDR_REMAP 0xd58 -#define MV64340_PCI_1_DEVCS_2_BASE_ADDR_REMAP 0xdd8 -#define MV64340_PCI_0_DEVCS_3_BASE_ADDR_REMAP 0xc54 -#define MV64340_PCI_1_DEVCS_3_BASE_ADDR_REMAP 0xcd4 -#define MV64340_PCI_0_DEVCS_BOOTCS_BASE_ADDR_REMAP 0xd54 -#define MV64340_PCI_1_DEVCS_BOOTCS_BASE_ADDR_REMAP 0xdd4 -#define MV64340_PCI_0_P2P_MEM0_BASE_ADDR_REMAP_LOW 0xd5c -#define MV64340_PCI_1_P2P_MEM0_BASE_ADDR_REMAP_LOW 0xddc -#define MV64340_PCI_0_P2P_MEM0_BASE_ADDR_REMAP_HIGH 0xd60 -#define MV64340_PCI_1_P2P_MEM0_BASE_ADDR_REMAP_HIGH 0xde0 -#define MV64340_PCI_0_P2P_MEM1_BASE_ADDR_REMAP_LOW 0xd64 -#define MV64340_PCI_1_P2P_MEM1_BASE_ADDR_REMAP_LOW 0xde4 -#define MV64340_PCI_0_P2P_MEM1_BASE_ADDR_REMAP_HIGH 0xd68 -#define MV64340_PCI_1_P2P_MEM1_BASE_ADDR_REMAP_HIGH 0xde8 -#define MV64340_PCI_0_P2P_I_O_BASE_ADDR_REMAP 0xd6c -#define MV64340_PCI_1_P2P_I_O_BASE_ADDR_REMAP 0xdec -#define MV64340_PCI_0_CPU_BASE_ADDR_REMAP_LOW 0xd70 -#define MV64340_PCI_1_CPU_BASE_ADDR_REMAP_LOW 0xdf0 -#define MV64340_PCI_0_CPU_BASE_ADDR_REMAP_HIGH 0xd74 -#define MV64340_PCI_1_CPU_BASE_ADDR_REMAP_HIGH 0xdf4 -#define MV64340_PCI_0_INTEGRATED_SRAM_BASE_ADDR_REMAP 0xf00 -#define MV64340_PCI_1_INTEGRATED_SRAM_BASE_ADDR_REMAP 0xf80 -#define MV64340_PCI_0_EXPANSION_ROM_BASE_ADDR_REMAP 0xf38 -#define MV64340_PCI_1_EXPANSION_ROM_BASE_ADDR_REMAP 0xfb8 -#define MV64340_PCI_0_ADDR_DECODE_CONTROL 0xd3c -#define MV64340_PCI_1_ADDR_DECODE_CONTROL 0xdbc -#define MV64340_PCI_0_HEADERS_RETARGET_CONTROL 0xF40 -#define MV64340_PCI_1_HEADERS_RETARGET_CONTROL 0xFc0 -#define MV64340_PCI_0_HEADERS_RETARGET_BASE 0xF44 -#define MV64340_PCI_1_HEADERS_RETARGET_BASE 0xFc4 -#define MV64340_PCI_0_HEADERS_RETARGET_HIGH 0xF48 -#define MV64340_PCI_1_HEADERS_RETARGET_HIGH 0xFc8 - -/***********************************/ -/* PCI Control Register Map */ -/***********************************/ - -#define MV64340_PCI_0_DLL_STATUS_AND_COMMAND 0x1d20 -#define MV64340_PCI_1_DLL_STATUS_AND_COMMAND 0x1da0 -#define MV64340_PCI_0_MPP_PADS_DRIVE_CONTROL 0x1d1C -#define MV64340_PCI_1_MPP_PADS_DRIVE_CONTROL 0x1d9C -#define MV64340_PCI_0_COMMAND 0xc00 -#define MV64340_PCI_1_COMMAND 0xc80 -#define MV64340_PCI_0_MODE 0xd00 -#define MV64340_PCI_1_MODE 0xd80 -#define MV64340_PCI_0_RETRY 0xc04 -#define MV64340_PCI_1_RETRY 0xc84 -#define MV64340_PCI_0_READ_BUFFER_DISCARD_TIMER 0xd04 -#define MV64340_PCI_1_READ_BUFFER_DISCARD_TIMER 0xd84 -#define MV64340_PCI_0_MSI_TRIGGER_TIMER 0xc38 -#define MV64340_PCI_1_MSI_TRIGGER_TIMER 0xcb8 -#define MV64340_PCI_0_ARBITER_CONTROL 0x1d00 -#define MV64340_PCI_1_ARBITER_CONTROL 0x1d80 -#define MV64340_PCI_0_CROSS_BAR_CONTROL_LOW 0x1d08 -#define MV64340_PCI_1_CROSS_BAR_CONTROL_LOW 0x1d88 -#define MV64340_PCI_0_CROSS_BAR_CONTROL_HIGH 0x1d0c -#define MV64340_PCI_1_CROSS_BAR_CONTROL_HIGH 0x1d8c -#define MV64340_PCI_0_CROSS_BAR_TIMEOUT 0x1d04 -#define MV64340_PCI_1_CROSS_BAR_TIMEOUT 0x1d84 -#define MV64340_PCI_0_SYNC_BARRIER_TRIGGER_REG 0x1D18 -#define MV64340_PCI_1_SYNC_BARRIER_TRIGGER_REG 0x1D98 -#define MV64340_PCI_0_SYNC_BARRIER_VIRTUAL_REG 0x1d10 -#define MV64340_PCI_1_SYNC_BARRIER_VIRTUAL_REG 0x1d90 -#define MV64340_PCI_0_P2P_CONFIG 0x1d14 -#define MV64340_PCI_1_P2P_CONFIG 0x1d94 - -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_0_LOW 0x1e00 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_0_HIGH 0x1e04 -#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_0 0x1e08 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_1_LOW 0x1e10 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_1_HIGH 0x1e14 -#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_1 0x1e18 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_2_LOW 0x1e20 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_2_HIGH 0x1e24 -#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_2 0x1e28 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_3_LOW 0x1e30 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_3_HIGH 0x1e34 -#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_3 0x1e38 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_4_LOW 0x1e40 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_4_HIGH 0x1e44 -#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_4 0x1e48 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_5_LOW 0x1e50 -#define MV64340_PCI_0_ACCESS_CONTROL_BASE_5_HIGH 0x1e54 -#define MV64340_PCI_0_ACCESS_CONTROL_SIZE_5 0x1e58 - -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_0_LOW 0x1e80 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_0_HIGH 0x1e84 -#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_0 0x1e88 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_1_LOW 0x1e90 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_1_HIGH 0x1e94 -#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_1 0x1e98 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_2_LOW 0x1ea0 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_2_HIGH 0x1ea4 -#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_2 0x1ea8 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_3_LOW 0x1eb0 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_3_HIGH 0x1eb4 -#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_3 0x1eb8 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_4_LOW 0x1ec0 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_4_HIGH 0x1ec4 -#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_4 0x1ec8 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_5_LOW 0x1ed0 -#define MV64340_PCI_1_ACCESS_CONTROL_BASE_5_HIGH 0x1ed4 -#define MV64340_PCI_1_ACCESS_CONTROL_SIZE_5 0x1ed8 - -/****************************************/ -/* PCI Configuration Access Registers */ -/****************************************/ - -#define MV64340_PCI_0_CONFIG_ADDR 0xcf8 -#define MV64340_PCI_0_CONFIG_DATA_VIRTUAL_REG 0xcfc -#define MV64340_PCI_1_CONFIG_ADDR 0xc78 -#define MV64340_PCI_1_CONFIG_DATA_VIRTUAL_REG 0xc7c -#define MV64340_PCI_0_INTERRUPT_ACKNOWLEDGE_VIRTUAL_REG 0xc34 -#define MV64340_PCI_1_INTERRUPT_ACKNOWLEDGE_VIRTUAL_REG 0xcb4 - -/****************************************/ -/* PCI Error Report Registers */ -/****************************************/ - -#define MV64340_PCI_0_SERR_MASK 0xc28 -#define MV64340_PCI_1_SERR_MASK 0xca8 -#define MV64340_PCI_0_ERROR_ADDR_LOW 0x1d40 -#define MV64340_PCI_1_ERROR_ADDR_LOW 0x1dc0 -#define MV64340_PCI_0_ERROR_ADDR_HIGH 0x1d44 -#define MV64340_PCI_1_ERROR_ADDR_HIGH 0x1dc4 -#define MV64340_PCI_0_ERROR_ATTRIBUTE 0x1d48 -#define MV64340_PCI_1_ERROR_ATTRIBUTE 0x1dc8 -#define MV64340_PCI_0_ERROR_COMMAND 0x1d50 -#define MV64340_PCI_1_ERROR_COMMAND 0x1dd0 -#define MV64340_PCI_0_ERROR_CAUSE 0x1d58 -#define MV64340_PCI_1_ERROR_CAUSE 0x1dd8 -#define MV64340_PCI_0_ERROR_MASK 0x1d5c -#define MV64340_PCI_1_ERROR_MASK 0x1ddc - -/****************************************/ -/* PCI Debug Registers */ -/****************************************/ - -#define MV64340_PCI_0_MMASK 0X1D24 -#define MV64340_PCI_1_MMASK 0X1DA4 - -/*********************************************/ -/* PCI Configuration, Function 0, Registers */ -/*********************************************/ - -#define MV64340_PCI_DEVICE_AND_VENDOR_ID 0x000 -#define MV64340_PCI_STATUS_AND_COMMAND 0x004 -#define MV64340_PCI_CLASS_CODE_AND_REVISION_ID 0x008 -#define MV64340_PCI_BIST_HEADER_TYPE_LATENCY_TIMER_CACHE_LINE 0x00C - -#define MV64340_PCI_SCS_0_BASE_ADDR_LOW 0x010 -#define MV64340_PCI_SCS_0_BASE_ADDR_HIGH 0x014 -#define MV64340_PCI_SCS_1_BASE_ADDR_LOW 0x018 -#define MV64340_PCI_SCS_1_BASE_ADDR_HIGH 0x01C -#define MV64340_PCI_INTERNAL_REG_MEM_MAPPED_BASE_ADDR_LOW 0x020 -#define MV64340_PCI_INTERNAL_REG_MEM_MAPPED_BASE_ADDR_HIGH 0x024 -#define MV64340_PCI_SUBSYSTEM_ID_AND_SUBSYSTEM_VENDOR_ID 0x02c -#define MV64340_PCI_EXPANSION_ROM_BASE_ADDR_REG 0x030 -#define MV64340_PCI_CAPABILTY_LIST_POINTER 0x034 -#define MV64340_PCI_INTERRUPT_PIN_AND_LINE 0x03C - /* capability list */ -#define MV64340_PCI_POWER_MANAGEMENT_CAPABILITY 0x040 -#define MV64340_PCI_POWER_MANAGEMENT_STATUS_AND_CONTROL 0x044 -#define MV64340_PCI_VPD_ADDR 0x048 -#define MV64340_PCI_VPD_DATA 0x04c -#define MV64340_PCI_MSI_MESSAGE_CONTROL 0x050 -#define MV64340_PCI_MSI_MESSAGE_ADDR 0x054 -#define MV64340_PCI_MSI_MESSAGE_UPPER_ADDR 0x058 -#define MV64340_PCI_MSI_MESSAGE_DATA 0x05c -#define MV64340_PCI_X_COMMAND 0x060 -#define MV64340_PCI_X_STATUS 0x064 -#define MV64340_PCI_COMPACT_PCI_HOT_SWAP 0x068 - -/***********************************************/ -/* PCI Configuration, Function 1, Registers */ -/***********************************************/ - -#define MV64340_PCI_SCS_2_BASE_ADDR_LOW 0x110 -#define MV64340_PCI_SCS_2_BASE_ADDR_HIGH 0x114 -#define MV64340_PCI_SCS_3_BASE_ADDR_LOW 0x118 -#define MV64340_PCI_SCS_3_BASE_ADDR_HIGH 0x11c -#define MV64340_PCI_INTERNAL_SRAM_BASE_ADDR_LOW 0x120 -#define MV64340_PCI_INTERNAL_SRAM_BASE_ADDR_HIGH 0x124 - -/***********************************************/ -/* PCI Configuration, Function 2, Registers */ -/***********************************************/ - -#define MV64340_PCI_DEVCS_0_BASE_ADDR_LOW 0x210 -#define MV64340_PCI_DEVCS_0_BASE_ADDR_HIGH 0x214 -#define MV64340_PCI_DEVCS_1_BASE_ADDR_LOW 0x218 -#define MV64340_PCI_DEVCS_1_BASE_ADDR_HIGH 0x21c -#define MV64340_PCI_DEVCS_2_BASE_ADDR_LOW 0x220 -#define MV64340_PCI_DEVCS_2_BASE_ADDR_HIGH 0x224 - -/***********************************************/ -/* PCI Configuration, Function 3, Registers */ -/***********************************************/ - -#define MV64340_PCI_DEVCS_3_BASE_ADDR_LOW 0x310 -#define MV64340_PCI_DEVCS_3_BASE_ADDR_HIGH 0x314 -#define MV64340_PCI_BOOT_CS_BASE_ADDR_LOW 0x318 -#define MV64340_PCI_BOOT_CS_BASE_ADDR_HIGH 0x31c -#define MV64340_PCI_CPU_BASE_ADDR_LOW 0x220 -#define MV64340_PCI_CPU_BASE_ADDR_HIGH 0x224 - -/***********************************************/ -/* PCI Configuration, Function 4, Registers */ -/***********************************************/ - -#define MV64340_PCI_P2P_MEM0_BASE_ADDR_LOW 0x410 -#define MV64340_PCI_P2P_MEM0_BASE_ADDR_HIGH 0x414 -#define MV64340_PCI_P2P_MEM1_BASE_ADDR_LOW 0x418 -#define MV64340_PCI_P2P_MEM1_BASE_ADDR_HIGH 0x41c -#define MV64340_PCI_P2P_I_O_BASE_ADDR 0x420 -#define MV64340_PCI_INTERNAL_REGS_I_O_MAPPED_BASE_ADDR 0x424 - -/****************************************/ -/* Messaging Unit Registers (I20) */ -/****************************************/ - -#define MV64340_I2O_INBOUND_MESSAGE_REG0_PCI_0_SIDE 0x010 -#define MV64340_I2O_INBOUND_MESSAGE_REG1_PCI_0_SIDE 0x014 -#define MV64340_I2O_OUTBOUND_MESSAGE_REG0_PCI_0_SIDE 0x018 -#define MV64340_I2O_OUTBOUND_MESSAGE_REG1_PCI_0_SIDE 0x01C -#define MV64340_I2O_INBOUND_DOORBELL_REG_PCI_0_SIDE 0x020 -#define MV64340_I2O_INBOUND_INTERRUPT_CAUSE_REG_PCI_0_SIDE 0x024 -#define MV64340_I2O_INBOUND_INTERRUPT_MASK_REG_PCI_0_SIDE 0x028 -#define MV64340_I2O_OUTBOUND_DOORBELL_REG_PCI_0_SIDE 0x02C -#define MV64340_I2O_OUTBOUND_INTERRUPT_CAUSE_REG_PCI_0_SIDE 0x030 -#define MV64340_I2O_OUTBOUND_INTERRUPT_MASK_REG_PCI_0_SIDE 0x034 -#define MV64340_I2O_INBOUND_QUEUE_PORT_VIRTUAL_REG_PCI_0_SIDE 0x040 -#define MV64340_I2O_OUTBOUND_QUEUE_PORT_VIRTUAL_REG_PCI_0_SIDE 0x044 -#define MV64340_I2O_QUEUE_CONTROL_REG_PCI_0_SIDE 0x050 -#define MV64340_I2O_QUEUE_BASE_ADDR_REG_PCI_0_SIDE 0x054 -#define MV64340_I2O_INBOUND_FREE_HEAD_POINTER_REG_PCI_0_SIDE 0x060 -#define MV64340_I2O_INBOUND_FREE_TAIL_POINTER_REG_PCI_0_SIDE 0x064 -#define MV64340_I2O_INBOUND_POST_HEAD_POINTER_REG_PCI_0_SIDE 0x068 -#define MV64340_I2O_INBOUND_POST_TAIL_POINTER_REG_PCI_0_SIDE 0x06C -#define MV64340_I2O_OUTBOUND_FREE_HEAD_POINTER_REG_PCI_0_SIDE 0x070 -#define MV64340_I2O_OUTBOUND_FREE_TAIL_POINTER_REG_PCI_0_SIDE 0x074 -#define MV64340_I2O_OUTBOUND_POST_HEAD_POINTER_REG_PCI_0_SIDE 0x0F8 -#define MV64340_I2O_OUTBOUND_POST_TAIL_POINTER_REG_PCI_0_SIDE 0x0FC - -#define MV64340_I2O_INBOUND_MESSAGE_REG0_PCI_1_SIDE 0x090 -#define MV64340_I2O_INBOUND_MESSAGE_REG1_PCI_1_SIDE 0x094 -#define MV64340_I2O_OUTBOUND_MESSAGE_REG0_PCI_1_SIDE 0x098 -#define MV64340_I2O_OUTBOUND_MESSAGE_REG1_PCI_1_SIDE 0x09C -#define MV64340_I2O_INBOUND_DOORBELL_REG_PCI_1_SIDE 0x0A0 -#define MV64340_I2O_INBOUND_INTERRUPT_CAUSE_REG_PCI_1_SIDE 0x0A4 -#define MV64340_I2O_INBOUND_INTERRUPT_MASK_REG_PCI_1_SIDE 0x0A8 -#define MV64340_I2O_OUTBOUND_DOORBELL_REG_PCI_1_SIDE 0x0AC -#define MV64340_I2O_OUTBOUND_INTERRUPT_CAUSE_REG_PCI_1_SIDE 0x0B0 -#define MV64340_I2O_OUTBOUND_INTERRUPT_MASK_REG_PCI_1_SIDE 0x0B4 -#define MV64340_I2O_INBOUND_QUEUE_PORT_VIRTUAL_REG_PCI_1_SIDE 0x0C0 -#define MV64340_I2O_OUTBOUND_QUEUE_PORT_VIRTUAL_REG_PCI_1_SIDE 0x0C4 -#define MV64340_I2O_QUEUE_CONTROL_REG_PCI_1_SIDE 0x0D0 -#define MV64340_I2O_QUEUE_BASE_ADDR_REG_PCI_1_SIDE 0x0D4 -#define MV64340_I2O_INBOUND_FREE_HEAD_POINTER_REG_PCI_1_SIDE 0x0E0 -#define MV64340_I2O_INBOUND_FREE_TAIL_POINTER_REG_PCI_1_SIDE 0x0E4 -#define MV64340_I2O_INBOUND_POST_HEAD_POINTER_REG_PCI_1_SIDE 0x0E8 -#define MV64340_I2O_INBOUND_POST_TAIL_POINTER_REG_PCI_1_SIDE 0x0EC -#define MV64340_I2O_OUTBOUND_FREE_HEAD_POINTER_REG_PCI_1_SIDE 0x0F0 -#define MV64340_I2O_OUTBOUND_FREE_TAIL_POINTER_REG_PCI_1_SIDE 0x0F4 -#define MV64340_I2O_OUTBOUND_POST_HEAD_POINTER_REG_PCI_1_SIDE 0x078 -#define MV64340_I2O_OUTBOUND_POST_TAIL_POINTER_REG_PCI_1_SIDE 0x07C - -#define MV64340_I2O_INBOUND_MESSAGE_REG0_CPU0_SIDE 0x1C10 -#define MV64340_I2O_INBOUND_MESSAGE_REG1_CPU0_SIDE 0x1C14 -#define MV64340_I2O_OUTBOUND_MESSAGE_REG0_CPU0_SIDE 0x1C18 -#define MV64340_I2O_OUTBOUND_MESSAGE_REG1_CPU0_SIDE 0x1C1C -#define MV64340_I2O_INBOUND_DOORBELL_REG_CPU0_SIDE 0x1C20 -#define MV64340_I2O_INBOUND_INTERRUPT_CAUSE_REG_CPU0_SIDE 0x1C24 -#define MV64340_I2O_INBOUND_INTERRUPT_MASK_REG_CPU0_SIDE 0x1C28 -#define MV64340_I2O_OUTBOUND_DOORBELL_REG_CPU0_SIDE 0x1C2C -#define MV64340_I2O_OUTBOUND_INTERRUPT_CAUSE_REG_CPU0_SIDE 0x1C30 -#define MV64340_I2O_OUTBOUND_INTERRUPT_MASK_REG_CPU0_SIDE 0x1C34 -#define MV64340_I2O_INBOUND_QUEUE_PORT_VIRTUAL_REG_CPU0_SIDE 0x1C40 -#define MV64340_I2O_OUTBOUND_QUEUE_PORT_VIRTUAL_REG_CPU0_SIDE 0x1C44 -#define MV64340_I2O_QUEUE_CONTROL_REG_CPU0_SIDE 0x1C50 -#define MV64340_I2O_QUEUE_BASE_ADDR_REG_CPU0_SIDE 0x1C54 -#define MV64340_I2O_INBOUND_FREE_HEAD_POINTER_REG_CPU0_SIDE 0x1C60 -#define MV64340_I2O_INBOUND_FREE_TAIL_POINTER_REG_CPU0_SIDE 0x1C64 -#define MV64340_I2O_INBOUND_POST_HEAD_POINTER_REG_CPU0_SIDE 0x1C68 -#define MV64340_I2O_INBOUND_POST_TAIL_POINTER_REG_CPU0_SIDE 0x1C6C -#define MV64340_I2O_OUTBOUND_FREE_HEAD_POINTER_REG_CPU0_SIDE 0x1C70 -#define MV64340_I2O_OUTBOUND_FREE_TAIL_POINTER_REG_CPU0_SIDE 0x1C74 -#define MV64340_I2O_OUTBOUND_POST_HEAD_POINTER_REG_CPU0_SIDE 0x1CF8 -#define MV64340_I2O_OUTBOUND_POST_TAIL_POINTER_REG_CPU0_SIDE 0x1CFC -#define MV64340_I2O_INBOUND_MESSAGE_REG0_CPU1_SIDE 0x1C90 -#define MV64340_I2O_INBOUND_MESSAGE_REG1_CPU1_SIDE 0x1C94 -#define MV64340_I2O_OUTBOUND_MESSAGE_REG0_CPU1_SIDE 0x1C98 -#define MV64340_I2O_OUTBOUND_MESSAGE_REG1_CPU1_SIDE 0x1C9C -#define MV64340_I2O_INBOUND_DOORBELL_REG_CPU1_SIDE 0x1CA0 -#define MV64340_I2O_INBOUND_INTERRUPT_CAUSE_REG_CPU1_SIDE 0x1CA4 -#define MV64340_I2O_INBOUND_INTERRUPT_MASK_REG_CPU1_SIDE 0x1CA8 -#define MV64340_I2O_OUTBOUND_DOORBELL_REG_CPU1_SIDE 0x1CAC -#define MV64340_I2O_OUTBOUND_INTERRUPT_CAUSE_REG_CPU1_SIDE 0x1CB0 -#define MV64340_I2O_OUTBOUND_INTERRUPT_MASK_REG_CPU1_SIDE 0x1CB4 -#define MV64340_I2O_INBOUND_QUEUE_PORT_VIRTUAL_REG_CPU1_SIDE 0x1CC0 -#define MV64340_I2O_OUTBOUND_QUEUE_PORT_VIRTUAL_REG_CPU1_SIDE 0x1CC4 -#define MV64340_I2O_QUEUE_CONTROL_REG_CPU1_SIDE 0x1CD0 -#define MV64340_I2O_QUEUE_BASE_ADDR_REG_CPU1_SIDE 0x1CD4 -#define MV64340_I2O_INBOUND_FREE_HEAD_POINTER_REG_CPU1_SIDE 0x1CE0 -#define MV64340_I2O_INBOUND_FREE_TAIL_POINTER_REG_CPU1_SIDE 0x1CE4 -#define MV64340_I2O_INBOUND_POST_HEAD_POINTER_REG_CPU1_SIDE 0x1CE8 -#define MV64340_I2O_INBOUND_POST_TAIL_POINTER_REG_CPU1_SIDE 0x1CEC -#define MV64340_I2O_OUTBOUND_FREE_HEAD_POINTER_REG_CPU1_SIDE 0x1CF0 -#define MV64340_I2O_OUTBOUND_FREE_TAIL_POINTER_REG_CPU1_SIDE 0x1CF4 -#define MV64340_I2O_OUTBOUND_POST_HEAD_POINTER_REG_CPU1_SIDE 0x1C78 -#define MV64340_I2O_OUTBOUND_POST_TAIL_POINTER_REG_CPU1_SIDE 0x1C7C - -/****************************************/ -/* Ethernet Unit Registers */ -/****************************************/ - -/*******************************************/ -/* CUNIT Registers */ -/*******************************************/ - - /* Address Decoding Register Map */ - -#define MV64340_CUNIT_BASE_ADDR_REG0 0xf200 -#define MV64340_CUNIT_BASE_ADDR_REG1 0xf208 -#define MV64340_CUNIT_BASE_ADDR_REG2 0xf210 -#define MV64340_CUNIT_BASE_ADDR_REG3 0xf218 -#define MV64340_CUNIT_SIZE0 0xf204 -#define MV64340_CUNIT_SIZE1 0xf20c -#define MV64340_CUNIT_SIZE2 0xf214 -#define MV64340_CUNIT_SIZE3 0xf21c -#define MV64340_CUNIT_HIGH_ADDR_REMAP_REG0 0xf240 -#define MV64340_CUNIT_HIGH_ADDR_REMAP_REG1 0xf244 -#define MV64340_CUNIT_BASE_ADDR_ENABLE_REG 0xf250 -#define MV64340_MPSC0_ACCESS_PROTECTION_REG 0xf254 -#define MV64340_MPSC1_ACCESS_PROTECTION_REG 0xf258 -#define MV64340_CUNIT_INTERNAL_SPACE_BASE_ADDR_REG 0xf25C - - /* Error Report Registers */ - -#define MV64340_CUNIT_INTERRUPT_CAUSE_REG 0xf310 -#define MV64340_CUNIT_INTERRUPT_MASK_REG 0xf314 -#define MV64340_CUNIT_ERROR_ADDR 0xf318 - - /* Cunit Control Registers */ - -#define MV64340_CUNIT_ARBITER_CONTROL_REG 0xf300 -#define MV64340_CUNIT_CONFIG_REG 0xb40c -#define MV64340_CUNIT_CRROSBAR_TIMEOUT_REG 0xf304 - - /* Cunit Debug Registers */ - -#define MV64340_CUNIT_DEBUG_LOW 0xf340 -#define MV64340_CUNIT_DEBUG_HIGH 0xf344 -#define MV64340_CUNIT_MMASK 0xf380 - - /* MPSCs Clocks Routing Registers */ - -#define MV64340_MPSC_ROUTING_REG 0xb400 -#define MV64340_MPSC_RX_CLOCK_ROUTING_REG 0xb404 -#define MV64340_MPSC_TX_CLOCK_ROUTING_REG 0xb408 - - /* MPSCs Interrupts Registers */ - -#define MV64340_MPSC_CAUSE_REG(port) (0xb804 + (port<<3)) -#define MV64340_MPSC_MASK_REG(port) (0xb884 + (port<<3)) - -#define MV64340_MPSC_MAIN_CONFIG_LOW(port) (0x8000 + (port<<12)) -#define MV64340_MPSC_MAIN_CONFIG_HIGH(port) (0x8004 + (port<<12)) -#define MV64340_MPSC_PROTOCOL_CONFIG(port) (0x8008 + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG1(port) (0x800c + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG2(port) (0x8010 + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG3(port) (0x8014 + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG4(port) (0x8018 + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG5(port) (0x801c + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG6(port) (0x8020 + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG7(port) (0x8024 + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG8(port) (0x8028 + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG9(port) (0x802c + (port<<12)) -#define MV64340_MPSC_CHANNEL_REG10(port) (0x8030 + (port<<12)) - - /* MPSC0 Registers */ - - -/***************************************/ -/* SDMA Registers */ -/***************************************/ - -#define MV64340_SDMA_CONFIG_REG(channel) (0x4000 + (channel<<13)) -#define MV64340_SDMA_COMMAND_REG(channel) (0x4008 + (channel<<13)) -#define MV64340_SDMA_CURRENT_RX_DESCRIPTOR_POINTER(channel) (0x4810 + (channel<<13)) -#define MV64340_SDMA_CURRENT_TX_DESCRIPTOR_POINTER(channel) (0x4c10 + (channel<<13)) -#define MV64340_SDMA_FIRST_TX_DESCRIPTOR_POINTER(channel) (0x4c14 + (channel<<13)) - -#define MV64340_SDMA_CAUSE_REG 0xb800 -#define MV64340_SDMA_MASK_REG 0xb880 - -/* BRG Interrupts */ - -#define MV64340_BRG_CONFIG_REG(brg) (0xb200 + (brg<<3)) -#define MV64340_BRG_BAUDE_TUNING_REG(brg) (0xb208 + (brg<<3)) -#define MV64340_BRG_CAUSE_REG 0xb834 -#define MV64340_BRG_MASK_REG 0xb8b4 - -/****************************************/ -/* DMA Channel Control */ -/****************************************/ - -#define MV64340_DMA_CHANNEL0_CONTROL 0x840 -#define MV64340_DMA_CHANNEL0_CONTROL_HIGH 0x880 -#define MV64340_DMA_CHANNEL1_CONTROL 0x844 -#define MV64340_DMA_CHANNEL1_CONTROL_HIGH 0x884 -#define MV64340_DMA_CHANNEL2_CONTROL 0x848 -#define MV64340_DMA_CHANNEL2_CONTROL_HIGH 0x888 -#define MV64340_DMA_CHANNEL3_CONTROL 0x84C -#define MV64340_DMA_CHANNEL3_CONTROL_HIGH 0x88C - - -/****************************************/ -/* IDMA Registers */ -/****************************************/ - -#define MV64340_DMA_CHANNEL0_BYTE_COUNT 0x800 -#define MV64340_DMA_CHANNEL1_BYTE_COUNT 0x804 -#define MV64340_DMA_CHANNEL2_BYTE_COUNT 0x808 -#define MV64340_DMA_CHANNEL3_BYTE_COUNT 0x80C -#define MV64340_DMA_CHANNEL0_SOURCE_ADDR 0x810 -#define MV64340_DMA_CHANNEL1_SOURCE_ADDR 0x814 -#define MV64340_DMA_CHANNEL2_SOURCE_ADDR 0x818 -#define MV64340_DMA_CHANNEL3_SOURCE_ADDR 0x81c -#define MV64340_DMA_CHANNEL0_DESTINATION_ADDR 0x820 -#define MV64340_DMA_CHANNEL1_DESTINATION_ADDR 0x824 -#define MV64340_DMA_CHANNEL2_DESTINATION_ADDR 0x828 -#define MV64340_DMA_CHANNEL3_DESTINATION_ADDR 0x82C -#define MV64340_DMA_CHANNEL0_NEXT_DESCRIPTOR_POINTER 0x830 -#define MV64340_DMA_CHANNEL1_NEXT_DESCRIPTOR_POINTER 0x834 -#define MV64340_DMA_CHANNEL2_NEXT_DESCRIPTOR_POINTER 0x838 -#define MV64340_DMA_CHANNEL3_NEXT_DESCRIPTOR_POINTER 0x83C -#define MV64340_DMA_CHANNEL0_CURRENT_DESCRIPTOR_POINTER 0x870 -#define MV64340_DMA_CHANNEL1_CURRENT_DESCRIPTOR_POINTER 0x874 -#define MV64340_DMA_CHANNEL2_CURRENT_DESCRIPTOR_POINTER 0x878 -#define MV64340_DMA_CHANNEL3_CURRENT_DESCRIPTOR_POINTER 0x87C - - /* IDMA Address Decoding Base Address Registers */ - -#define MV64340_DMA_BASE_ADDR_REG0 0xa00 -#define MV64340_DMA_BASE_ADDR_REG1 0xa08 -#define MV64340_DMA_BASE_ADDR_REG2 0xa10 -#define MV64340_DMA_BASE_ADDR_REG3 0xa18 -#define MV64340_DMA_BASE_ADDR_REG4 0xa20 -#define MV64340_DMA_BASE_ADDR_REG5 0xa28 -#define MV64340_DMA_BASE_ADDR_REG6 0xa30 -#define MV64340_DMA_BASE_ADDR_REG7 0xa38 - - /* IDMA Address Decoding Size Address Register */ - -#define MV64340_DMA_SIZE_REG0 0xa04 -#define MV64340_DMA_SIZE_REG1 0xa0c -#define MV64340_DMA_SIZE_REG2 0xa14 -#define MV64340_DMA_SIZE_REG3 0xa1c -#define MV64340_DMA_SIZE_REG4 0xa24 -#define MV64340_DMA_SIZE_REG5 0xa2c -#define MV64340_DMA_SIZE_REG6 0xa34 -#define MV64340_DMA_SIZE_REG7 0xa3C - - /* IDMA Address Decoding High Address Remap and Access - Protection Registers */ - -#define MV64340_DMA_HIGH_ADDR_REMAP_REG0 0xa60 -#define MV64340_DMA_HIGH_ADDR_REMAP_REG1 0xa64 -#define MV64340_DMA_HIGH_ADDR_REMAP_REG2 0xa68 -#define MV64340_DMA_HIGH_ADDR_REMAP_REG3 0xa6C -#define MV64340_DMA_BASE_ADDR_ENABLE_REG 0xa80 -#define MV64340_DMA_CHANNEL0_ACCESS_PROTECTION_REG 0xa70 -#define MV64340_DMA_CHANNEL1_ACCESS_PROTECTION_REG 0xa74 -#define MV64340_DMA_CHANNEL2_ACCESS_PROTECTION_REG 0xa78 -#define MV64340_DMA_CHANNEL3_ACCESS_PROTECTION_REG 0xa7c -#define MV64340_DMA_ARBITER_CONTROL 0x860 -#define MV64340_DMA_CROSS_BAR_TIMEOUT 0x8d0 - - /* IDMA Headers Retarget Registers */ - -#define MV64340_DMA_HEADERS_RETARGET_CONTROL 0xa84 -#define MV64340_DMA_HEADERS_RETARGET_BASE 0xa88 - - /* IDMA Interrupt Register */ - -#define MV64340_DMA_INTERRUPT_CAUSE_REG 0x8c0 -#define MV64340_DMA_INTERRUPT_CAUSE_MASK 0x8c4 -#define MV64340_DMA_ERROR_ADDR 0x8c8 -#define MV64340_DMA_ERROR_SELECT 0x8cc - - /* IDMA Debug Register ( for internal use ) */ - -#define MV64340_DMA_DEBUG_LOW 0x8e0 -#define MV64340_DMA_DEBUG_HIGH 0x8e4 -#define MV64340_DMA_SPARE 0xA8C - -/****************************************/ -/* Timer_Counter */ -/****************************************/ - -#define MV64340_TIMER_COUNTER0 0x850 -#define MV64340_TIMER_COUNTER1 0x854 -#define MV64340_TIMER_COUNTER2 0x858 -#define MV64340_TIMER_COUNTER3 0x85C -#define MV64340_TIMER_COUNTER_0_3_CONTROL 0x864 -#define MV64340_TIMER_COUNTER_0_3_INTERRUPT_CAUSE 0x868 -#define MV64340_TIMER_COUNTER_0_3_INTERRUPT_MASK 0x86c - -/****************************************/ -/* Watchdog registers */ -/****************************************/ - -#define MV64340_WATCHDOG_CONFIG_REG 0xb410 -#define MV64340_WATCHDOG_VALUE_REG 0xb414 - -/****************************************/ -/* I2C Registers */ -/****************************************/ - -#define MV64XXX_I2C_OFFSET 0xc000 -#define MV64XXX_I2C_REG_BLOCK_SIZE 0x0020 - -/****************************************/ -/* GPP Interface Registers */ -/****************************************/ - -#define MV64340_GPP_IO_CONTROL 0xf100 -#define MV64340_GPP_LEVEL_CONTROL 0xf110 -#define MV64340_GPP_VALUE 0xf104 -#define MV64340_GPP_INTERRUPT_CAUSE 0xf108 -#define MV64340_GPP_INTERRUPT_MASK0 0xf10c -#define MV64340_GPP_INTERRUPT_MASK1 0xf114 -#define MV64340_GPP_VALUE_SET 0xf118 -#define MV64340_GPP_VALUE_CLEAR 0xf11c - -/****************************************/ -/* Interrupt Controller Registers */ -/****************************************/ - -/****************************************/ -/* Interrupts */ -/****************************************/ - -#define MV64340_MAIN_INTERRUPT_CAUSE_LOW 0x004 -#define MV64340_MAIN_INTERRUPT_CAUSE_HIGH 0x00c -#define MV64340_CPU_INTERRUPT0_MASK_LOW 0x014 -#define MV64340_CPU_INTERRUPT0_MASK_HIGH 0x01c -#define MV64340_CPU_INTERRUPT0_SELECT_CAUSE 0x024 -#define MV64340_CPU_INTERRUPT1_MASK_LOW 0x034 -#define MV64340_CPU_INTERRUPT1_MASK_HIGH 0x03c -#define MV64340_CPU_INTERRUPT1_SELECT_CAUSE 0x044 -#define MV64340_INTERRUPT0_MASK_0_LOW 0x054 -#define MV64340_INTERRUPT0_MASK_0_HIGH 0x05c -#define MV64340_INTERRUPT0_SELECT_CAUSE 0x064 -#define MV64340_INTERRUPT1_MASK_0_LOW 0x074 -#define MV64340_INTERRUPT1_MASK_0_HIGH 0x07c -#define MV64340_INTERRUPT1_SELECT_CAUSE 0x084 - -/****************************************/ -/* MPP Interface Registers */ -/****************************************/ - -#define MV64340_MPP_CONTROL0 0xf000 -#define MV64340_MPP_CONTROL1 0xf004 -#define MV64340_MPP_CONTROL2 0xf008 -#define MV64340_MPP_CONTROL3 0xf00c - -/****************************************/ -/* Serial Initialization registers */ -/****************************************/ - -#define MV64340_SERIAL_INIT_LAST_DATA 0xf324 -#define MV64340_SERIAL_INIT_CONTROL 0xf328 -#define MV64340_SERIAL_INIT_STATUS 0xf32c - -extern void mv64340_irq_init(unsigned int base); - -#endif /* __ASM_MV643XX_H */ -- cgit v1.2.3 From 4208c562a27899212e8046080555e0f204e0579a Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 17 Sep 2024 10:24:57 +0530 Subject: block: remove bogus union The union around bi_integrity field is pointless. Remove it. Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20240917045457.429698-1-joshi.k@samsung.com Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 36ed96133217..6d3a0fff2a1d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -248,11 +248,9 @@ struct bio { struct bio_crypt_ctx *bi_crypt_context; #endif - union { #if defined(CONFIG_BLK_DEV_INTEGRITY) - struct bio_integrity_payload *bi_integrity; /* data integrity */ + struct bio_integrity_payload *bi_integrity; /* data integrity */ #endif - }; unsigned short bi_vcnt; /* how many bio_vec's */ -- cgit v1.2.3 From 6857be5fecaebd9773ff27b6d29b6fff3b1abbce Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 26 Aug 2024 16:43:35 -0400 Subject: mm: introduce ARCH_SUPPORTS_HUGE_PFNMAP and special bits to pmd/pud Patch series "mm: Support huge pfnmaps", v2. Overview ======== This series implements huge pfnmaps support for mm in general. Huge pfnmap allows e.g. VM_PFNMAP vmas to map in either PMD or PUD levels, similar to what we do with dax / thp / hugetlb so far to benefit from TLB hits. Now we extend that idea to PFN mappings, e.g. PCI MMIO bars where it can grow as large as 8GB or even bigger. Currently, only x86_64 (1G+2M) and arm64 (2M) are supported. The last patch (from Alex Williamson) will be the first user of huge pfnmap, so as to enable vfio-pci driver to fault in huge pfn mappings. Implementation ============== In reality, it's relatively simple to add such support comparing to many other types of mappings, because of PFNMAP's specialties when there's no vmemmap backing it, so that most of the kernel routines on huge mappings should simply already fail for them, like GUPs or old-school follow_page() (which is recently rewritten to be folio_walk* APIs by David). One trick here is that we're still unmature on PUDs in generic paths here and there, as DAX is so far the only user. This patchset will add the 2nd user of it. Hugetlb can be a 3rd user if the hugetlb unification work can go on smoothly, but to be discussed later. The other trick is how to allow gup-fast working for such huge mappings even if there's no direct sign of knowing whether it's a normal page or MMIO mapping. This series chose to keep the pte_special solution, so that it reuses similar idea on setting a special bit to pfnmap PMDs/PUDs so that gup-fast will be able to identify them and fail properly. Along the way, we'll also notice that the major pgtable pfn walker, aka, follow_pte(), will need to retire soon due to the fact that it only works with ptes. A new set of simple API is introduced (follow_pfnmap* API) to be able to do whatever follow_pte() can already do, plus that it can also process huge pfnmaps now. Half of this series is about that and converting all existing pfnmap walkers to use the new API properly. Hopefully the new API also looks better to avoid exposing e.g. pgtable lock details into the callers, so that it can be used in an even more straightforward way. Here, three more options will be introduced and involved in huge pfnmap: - ARCH_SUPPORTS_HUGE_PFNMAP Arch developers will need to select this option when huge pfnmap is supported in arch's Kconfig. After this patchset applied, both x86_64 and arm64 will start to enable it by default. - ARCH_SUPPORTS_PMD_PFNMAP / ARCH_SUPPORTS_PUD_PFNMAP These options are for driver developers to identify whether current arch / config supports huge pfnmaps, making decision on whether it can use the huge pfnmap APIs to inject them. One can refer to the last vfio-pci patch from Alex on the use of them properly in a device driver. So after the whole set applied, and if one would enable some dynamic debug lines in vfio-pci core files, we should observe things like: vfio-pci 0000:00:06.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x0: 0x100 vfio-pci 0000:00:06.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x200: 0x100 vfio-pci 0000:00:06.0: vfio_pci_mmap_huge_fault(,order = 9) BAR 0 page offset 0x400: 0x100 In this specific case, it says that vfio-pci faults in PMDs properly for a few BAR0 offsets. Patch Layout ============ Patch 1: Introduce the new options mentioned above for huge PFNMAPs Patch 2: A tiny cleanup Patch 3-8: Preparation patches for huge pfnmap (include introduce special bit for pmd/pud) Patch 9-16: Introduce follow_pfnmap*() API, use it everywhere, and then drop follow_pte() API Patch 17: Add huge pfnmap support for x86_64 Patch 18: Add huge pfnmap support for arm64 Patch 19: Add vfio-pci support for all kinds of huge pfnmaps (Alex) TODO ==== More architectures / More page sizes ------------------------------------ Currently only x86_64 (2M+1G) and arm64 (2M) are supported. There seems to have plan to support arm64 1G later on top of this series [2]. Any arch will need to first support THP / THP_1G, then provide a special bit in pmds/puds to support huge pfnmaps. remap_pfn_range() support ------------------------- Currently, remap_pfn_range() still only maps PTEs. With the new option, remap_pfn_range() can logically start to inject either PMDs or PUDs when the alignment requirements match on the VAs. When the support is there, it should be able to silently benefit all drivers that is using remap_pfn_range() in its mmap() handler on better TLB hit rate and overall faster MMIO accesses similar to processor on hugepages. More driver support ------------------- VFIO is so far the only consumer for the huge pfnmaps after this series applied. Besides above remap_pfn_range() generic optimization, device driver can also try to optimize its mmap() on a better VA alignment for either PMD/PUD sizes. This may, iiuc, normally require userspace changes, as the driver doesn't normally decide the VA to map a bar. But I don't think I know all the drivers to know the full picture. Credits all go to Alex on help testing the GPU/NIC use cases above. [0] https://lore.kernel.org/r/73ad9540-3fb8-4154-9a4f-30a0a2b03d41@lucifer.local [1] https://lore.kernel.org/r/20240807194812.819412-1-peterx@redhat.com [2] https://lore.kernel.org/r/498e0731-81a4-4f75-95b4-a8ad0bcc7665@huawei.com This patch (of 19): This patch introduces the option to introduce special pte bit into pmd/puds. Archs can start to define pmd_special / pud_special when supported by selecting the new option. Per-arch support will be added later. Before that, create fallbacks for these helpers so that they are always available. Link: https://lkml.kernel.org/r/20240826204353.2228736-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20240826204353.2228736-2-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alexander Gordeev Cc: Alex Williamson Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David Hildenbrand Cc: Gavin Shan Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Matthew Wilcox Cc: Niklas Schnelle Cc: Paolo Bonzini Cc: Ryan Roberts Cc: Sean Christopherson Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 39f6faace590..c2307a2d275d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2643,6 +2643,30 @@ static inline pte_t pte_mkspecial(pte_t pte) } #endif +#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return false; +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ + +#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return false; +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pud; +} +#endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ + #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { -- cgit v1.2.3 From ef713ec3a566d3e5e011c5d6201eb661ebf94c1f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 26 Aug 2024 16:43:36 -0400 Subject: mm: drop is_huge_zero_pud() It constantly returns false since 2017. One assertion is added in 2019 but it should never have triggered, IOW it means what is checked should be asserted instead. If it didn't exist for 7 years maybe it's good idea to remove it and only add it when it comes. Link: https://lkml.kernel.org/r/20240826204353.2228736-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Cc: Matthew Wilcox Cc: Aneesh Kumar K.V Cc: Alexander Gordeev Cc: Alex Williamson Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gavin Shan Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Ingo Molnar Cc: Niklas Schnelle Cc: Paolo Bonzini Cc: Ryan Roberts Cc: Sean Christopherson Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e3896ebf0f94..ffca706bac81 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -433,11 +433,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); } -static inline bool is_huge_zero_pud(pud_t pud) -{ - return false; -} - struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); @@ -578,11 +573,6 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return false; } -static inline bool is_huge_zero_pud(pud_t pud) -{ - return false; -} - static inline void mm_put_huge_zero_folio(struct mm_struct *mm) { return; -- cgit v1.2.3 From 5dd40721f147e83733ad34848330913cb633046e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 26 Aug 2024 16:43:38 -0400 Subject: mm: allow THP orders for PFNMAPs This enables PFNMAPs to be mapped at either pmd/pud layers. Generalize the dax case into vma_is_special_huge() so as to cover both. Meanwhile, rename the macro to THP_ORDERS_ALL_SPECIAL. Link: https://lkml.kernel.org/r/20240826204353.2228736-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Cc: Matthew Wilcox Cc: Gavin Shan Cc: Ryan Roberts Cc: Zi Yan Cc: Alexander Gordeev Cc: Alex Williamson Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Ingo Molnar Cc: Niklas Schnelle Cc: Paolo Bonzini Cc: Sean Christopherson Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ffca706bac81..0b0539f4ee1a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -76,9 +76,9 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for file THP. Folios in a DAX * file is never split and the MAX_PAGECACHE_ORDER limit does not apply to - * it. + * it. Same to PFNMAPs where there's neither page* nor pagecache. */ -#define THP_ORDERS_ALL_FILE_DAX \ +#define THP_ORDERS_ALL_SPECIAL \ (BIT(PMD_ORDER) | BIT(PUD_ORDER)) #define THP_ORDERS_ALL_FILE_DEFAULT \ ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0)) @@ -87,7 +87,7 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; * Mask of all large folio orders supported for THP. */ #define THP_ORDERS_ALL \ - (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) + (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) #define TVA_SMAPS (1 << 0) /* Will be used for procfs */ #define TVA_IN_PF (1 << 1) /* Page fault handler */ -- cgit v1.2.3 From 0515e022e167cfacf1fee092eb93aa9514e23c0a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 26 Aug 2024 16:43:42 -0400 Subject: mm: always define pxx_pgprot() There're: - 8 archs (arc, arm64, include, mips, powerpc, s390, sh, x86) that support pte_pgprot(). - 2 archs (x86, sparc) that support pmd_pgprot(). - 1 arch (x86) that support pud_pgprot(). Always define them to be used in generic code, and then we don't need to fiddle with "#ifdef"s when doing so. Link: https://lkml.kernel.org/r/20240826204353.2228736-9-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Alex Williamson Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David Hildenbrand Cc: Gavin Shan Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Niklas Schnelle Cc: Paolo Bonzini Cc: Ryan Roberts Cc: Sean Christopherson Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 780f3b439d98..e8b2ac6bd2ae 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1956,6 +1956,18 @@ typedef unsigned int pgtbl_mod_mask; #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif +#ifndef pte_pgprot +#define pte_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pmd_pgprot +#define pmd_pgprot(x) ((pgprot_t) {0}) +#endif + +#ifndef pud_pgprot +#define pud_pgprot(x) ((pgprot_t) {0}) +#endif + /* description of effects of mapping type and prot in current implementation. * this is due to the limited x86 page protection hardware. The expected * behavior is in parens: -- cgit v1.2.3 From 6da8e9634bb7e3fdad9ae0e4db873a05036c4343 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 26 Aug 2024 16:43:43 -0400 Subject: mm: new follow_pfnmap API Introduce a pair of APIs to follow pfn mappings to get entry information. It's very similar to what follow_pte() does before, but different in that it recognizes huge pfn mappings. Link: https://lkml.kernel.org/r/20240826204353.2228736-10-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alexander Gordeev Cc: Alex Williamson Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David Hildenbrand Cc: Gavin Shan Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Matthew Wilcox Cc: Niklas Schnelle Cc: Paolo Bonzini Cc: Ryan Roberts Cc: Sean Christopherson Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c2307a2d275d..62bd89414897 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2373,6 +2373,37 @@ int follow_pte(struct vm_area_struct *vma, unsigned long address, int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); +struct follow_pfnmap_args { + /** + * Inputs: + * @vma: Pointer to @vm_area_struct struct + * @address: the virtual address to walk + */ + struct vm_area_struct *vma; + unsigned long address; + /** + * Internals: + * + * The caller shouldn't touch any of these. + */ + spinlock_t *lock; + pte_t *ptep; + /** + * Outputs: + * + * @pfn: the PFN of the address + * @pgprot: the pgprot_t of the mapping + * @writable: whether the mapping is writable + * @special: whether the mapping is a special mapping (real PFN maps) + */ + unsigned long pfn; + pgprot_t pgprot; + bool writable; + bool special; +}; +int follow_pfnmap_start(struct follow_pfnmap_args *args); +void follow_pfnmap_end(struct follow_pfnmap_args *args); + extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); -- cgit v1.2.3 From b0a1c0d0edcd75a0f8ec5fd19dbd64b8d097f534 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 26 Aug 2024 16:43:50 -0400 Subject: mm: remove follow_pte() follow_pte() users have been converted to follow_pfnmap*(). Remove the API. Link: https://lkml.kernel.org/r/20240826204353.2228736-17-peterx@redhat.com Signed-off-by: Peter Xu Cc: Alexander Gordeev Cc: Alex Williamson Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David Hildenbrand Cc: Gavin Shan Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Matthew Wilcox Cc: Niklas Schnelle Cc: Paolo Bonzini Cc: Ryan Roberts Cc: Sean Christopherson Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 62bd89414897..d750be768121 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2368,8 +2368,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -int follow_pte(struct vm_area_struct *vma, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); -- cgit v1.2.3 From 82ce8e2f31a1eb05b1527c3d807bea40031df913 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 7 Sep 2024 17:40:42 +0200 Subject: set_memory: add __must_check to generic stubs Following query shows that architectures that don't provide asm/set_memory.h don't use set_memory_...() functions. $ git grep set_memory_ alpha arc csky hexagon loongarch m68k microblaze mips nios2 openrisc parisc sh sparc um xtensa Following query shows that all core users of set_memory_...() functions always take returned value into account: $ git grep -w -e set_memory_ro -e set_memory_rw -e set_memory_x -e set_memory_nx -e set_memory_rox `find . -maxdepth 1 -type d | grep -v arch | grep /` set_memory_...() functions can fail, leaving the memory attributes unchanged. Make sure all callers check the returned code. Link: https://github.com/KSPP/linux/issues/7 Link: https://lkml.kernel.org/r/6a89ffc69666de84721216947c6b6c7dcca39d7d.1725723347.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Arnd Bergmann Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/set_memory.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index 95ac8398ee72..e7aec20fb44f 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -8,10 +8,10 @@ #ifdef CONFIG_ARCH_HAS_SET_MEMORY #include #else -static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } -static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } -static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } -static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int __must_check set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif #ifndef set_memory_rox -- cgit v1.2.3 From 325efb16da2c840e165d9b620fec8049d4d664cc Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 9 Sep 2024 11:21:18 +1200 Subject: mm: add nr argument in mem_cgroup_swapin_uncharge_swap() helper to support large folios With large folios swap-in, we might need to uncharge multiple entries all together, add nr argument in mem_cgroup_swapin_uncharge_swap(). For the existing two users, just pass nr=1. Link: https://lkml.kernel.org/r/20240908232119.2157-3-21cnbao@gmail.com Signed-off-by: Barry Song Acked-by: Chris Li Reviewed-by: Yosry Ahmed Cc: Shakeel Butt Cc: Baolin Wang Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Kairui Song Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Minchan Kim Cc: Nhat Pham Cc: Ryan Roberts Cc: Sergey Senozhatsky Cc: Suren Baghdasaryan Cc: Yang Shi Cc: Chuanhua Han Cc: Kanchana P Sridhar Cc: Usama Arif Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2ef94c74847d..34d2da05f2f1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -699,7 +699,8 @@ int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); -void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); + +void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); void __mem_cgroup_uncharge(struct folio *folio); @@ -1206,7 +1207,7 @@ static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, return 0; } -static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry) +static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr) { } -- cgit v1.2.3 From ed8d5b0ce1d738e13c60d6b1a901a56d832e5070 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2024 15:13:20 +0200 Subject: Revert "uprobes: use vm_special_mapping close() functionality" This reverts commit 08e28de1160a712724268fd33d77b32f1bc84d1c. A malicious application can munmap() its "[uprobes]" vma and in this case xol_mapping.close == uprobe_clear_state() will free the memory which can be used by another thread, or the same thread when it hits the uprobe bp afterwards. Link: https://lkml.kernel.org/r/20240911131320.GA3448@redhat.com Signed-off-by: Oleg Nesterov Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andrii Nakryiko Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Michael Ellerman Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sven Schnelle Signed-off-by: Andrew Morton --- include/linux/uprobes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 493dc95d912c..b503fafb7fb3 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -126,6 +126,7 @@ extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); extern bool uprobe_deny_signal(void); extern bool arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs); +extern void uprobe_clear_state(struct mm_struct *mm); extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); -- cgit v1.2.3 From aef79e189ba2b32f78bd35daf2c0b41f3868a321 Mon Sep 17 00:00:00 2001 From: Carlos Song Date: Tue, 10 Sep 2024 13:16:25 +0800 Subject: i3c: master: support to adjust first broadcast address speed According to I3C spec 6.2 Timing Specification, the Open Drain High Period of SCL Clock timing for first broadcast address should be adjusted to 200ns at least. I3C device working as i2c device will see the broadcast to close its Spike Filter then change to work at I3C mode. After that I3C open drain SCL high level should be adjusted back. Signed-off-by: Carlos Song Reviewed-by: Miquel Raynal Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20240910051626.4052552-1-carlos.song@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 074f632868d9..2a1ed05d5782 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -277,6 +277,20 @@ enum i3c_bus_mode { I3C_BUS_MODE_MIXED_SLOW, }; +/** + * enum i3c_open_drain_speed - I3C open-drain speed + * @I3C_OPEN_DRAIN_SLOW_SPEED: Slow open-drain speed for sending the first + * broadcast address. The first broadcast address at this speed + * will be visible to all devices on the I3C bus. I3C devices + * working in I2C mode will turn off their spike filter when + * switching into I3C mode. + * @I3C_OPEN_DRAIN_NORMAL_SPEED: Normal open-drain speed in I3C bus mode. + */ +enum i3c_open_drain_speed { + I3C_OPEN_DRAIN_SLOW_SPEED, + I3C_OPEN_DRAIN_NORMAL_SPEED, +}; + /** * enum i3c_addr_slot_status - I3C address slot status * @I3C_ADDR_SLOT_FREE: address is free @@ -436,6 +450,7 @@ struct i3c_bus { * NULL. * @enable_hotjoin: enable hot join event detect. * @disable_hotjoin: disable hot join event detect. + * @set_speed: adjust I3C open drain mode timing. */ struct i3c_master_controller_ops { int (*bus_init)(struct i3c_master_controller *master); @@ -464,6 +479,7 @@ struct i3c_master_controller_ops { struct i3c_ibi_slot *slot); int (*enable_hotjoin)(struct i3c_master_controller *master); int (*disable_hotjoin)(struct i3c_master_controller *master); + int (*set_speed)(struct i3c_master_controller *master, enum i3c_open_drain_speed speed); }; /** -- cgit v1.2.3 From 9ba5dcc722de4390a1d3211b2ee3c864f84f5461 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 20 Sep 2024 01:48:17 +0100 Subject: block: Remove unused blk_limits_io_{min,opt} blk_limits_io_min and blk_limits_io_opt are unused since the recent commit 0a94a469a4f0 ("dm: stop using blk_limits_io_{min,opt}") Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20240920004817.676216-1-linux@treblig.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 643c9020a35a..50c3b959da28 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -968,8 +968,6 @@ static inline void blk_queue_disable_write_zeroes(struct request_queue *q) /* * Access functions for manipulating queue properties */ -extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); -extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, -- cgit v1.2.3 From 65f666c6203600053478ce8e34a1db269a8701c9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 19 Sep 2024 10:17:09 +0800 Subject: lib/sbitmap: define swap_lock as raw_spinlock_t When called from sbitmap_queue_get(), sbitmap_deferred_clear() may be run with preempt disabled. In RT kernel, spin_lock() can sleep, then warning of "BUG: sleeping function called from invalid context" can be triggered. Fix it by replacing it with raw_spin_lock. Cc: Yang Yang Fixes: 72d04bdcf3f7 ("sbitmap: fix io hung due to race on sbitmap_word::cleared") Signed-off-by: Ming Lei Reviewed-by: Yang Yang Link: https://lore.kernel.org/r/20240919021709.511329-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index c09cdcc99471..189140bf11fc 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -40,7 +40,7 @@ struct sbitmap_word { /** * @swap_lock: serializes simultaneous updates of ->word and ->cleared */ - spinlock_t swap_lock; + raw_spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** -- cgit v1.2.3 From 652bfcb76fe689f4d85b6f3688025a87fb94f9a1 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Wed, 31 Jul 2024 15:43:13 +0800 Subject: KEYS: Remove unused declarations These declarations are never implemented, remove it. Signed-off-by: Yue Haibing Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/linux/key.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/key.h b/include/linux/key.h index 943a432da3ae..074dca3222b9 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -436,9 +436,6 @@ extern key_ref_t keyring_search(key_ref_t keyring, const char *description, bool recurse); -extern int keyring_add_key(struct key *keyring, - struct key *key); - extern int keyring_restrict(key_ref_t keyring, const char *type, const char *restriction); -- cgit v1.2.3 From 60749cbe3d8ae572a6c7dda675de3e8b25797a18 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 15 Jul 2024 17:14:18 +1000 Subject: sunrpc: change sp_nrthreads from atomic_t to unsigned int. sp_nrthreads is only ever accessed under the service mutex nlmsvc_mutex nfs_callback_mutex nfsd_mutex so these is no need for it to be an atomic_t. The fact that all code using it is single-threaded means that we can simplify svc_pool_victim and remove the temporary elevation of sp_nrthreads. Signed-off-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e4fa25fafa97..99e9345d829e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -33,9 +33,9 @@ * node traffic on multi-node NUMA NFS servers. */ struct svc_pool { - unsigned int sp_id; /* pool id; also node id on NUMA */ + unsigned int sp_id; /* pool id; also node id on NUMA */ struct lwq sp_xprts; /* pending transports */ - atomic_t sp_nrthreads; /* # of threads in pool */ + unsigned int sp_nrthreads; /* # of threads in pool */ struct list_head sp_all_threads; /* all server threads */ struct llist_head sp_idle_threads; /* idle server threads */ -- cgit v1.2.3 From 3391fc92db8e761f1a2df5612fcb999dac6bc00a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 16 Sep 2024 09:45:40 +1000 Subject: sunrpc: allow svc threads to fail initialisation cleanly If an svc thread needs to perform some initialisation that might fail, it has no good way to handle the failure. Before the thread can exit it must call svc_exit_thread(), but that requires the service mutex to be held. The thread cannot simply take the mutex as that could deadlock if there is a concurrent attempt to shut down all threads (which is unlikely, but not impossible). nfsd currently call svc_exit_thread() unprotected in the unlikely event that unshare_fs_struct() fails. We can clean this up by introducing svc_thread_init_status() by which an svc thread can report whether initialisation has succeeded. If it has, it continues normally into the action loop. If it has not, svc_thread_init_status() immediately aborts the thread. svc_start_kthread() waits for either of these to happen, and calls svc_exit_thread() (under the mutex) if the thread aborted. Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 99e9345d829e..c419a61f60e5 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -21,6 +21,7 @@ #include #include #include +#include /* * @@ -232,6 +233,11 @@ struct svc_rqst { struct net *rq_bc_net; /* pointer to backchannel's * net namespace */ + + int rq_err; /* Thread sets this to inidicate + * initialisation success. + */ + unsigned long bc_to_initval; unsigned int bc_to_retries; void ** rq_lease_breaker; /* The v4 client breaking a lease */ @@ -305,6 +311,31 @@ static inline bool svc_thread_should_stop(struct svc_rqst *rqstp) return test_bit(RQ_VICTIM, &rqstp->rq_flags); } +/** + * svc_thread_init_status - report whether thread has initialised successfully + * @rqstp: the thread in question + * @err: errno code + * + * After performing any initialisation that could fail, and before starting + * normal work, each sunrpc svc_thread must call svc_thread_init_status() + * with an appropriate error, or zero. + * + * If zero is passed, the thread is ready and must continue until + * svc_thread_should_stop() returns true. If a non-zero error is passed + * the call will not return - the thread will exit. + */ +static inline void svc_thread_init_status(struct svc_rqst *rqstp, int err) +{ + rqstp->rq_err = err; + /* memory barrier ensures assignment to error above is visible before + * waitqueue_active() test below completes. + */ + smp_mb(); + wake_up_var(&rqstp->rq_err); + if (err) + kthread_exit(1); +} + struct svc_deferred_req { u32 prot; /* protocol (UDP or TCP) */ struct svc_xprt *xprt; -- cgit v1.2.3 From 36ffa3d0de54c1cf516ea32a5ec556f5c9874795 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 29 Jul 2024 11:47:23 +1000 Subject: nfsd: be more systematic about selecting error codes for internal use. Rather than using ad hoc values for internal errors (30000, 11000, ...) use 'enum' to sequentially allocate numbers starting from the first known available number - now visible as NFS4ERR_FIRST_FREE. The goal is values that are distinct from all be32 error codes. To get those we must first select integers that are not already used, then convert them with cpu_to_be32(). Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index f9df88091c6d..8d7430d9f218 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -281,15 +281,18 @@ enum nfsstat4 { /* nfs42 */ NFS4ERR_PARTNER_NOTSUPP = 10088, NFS4ERR_PARTNER_NO_AUTH = 10089, - NFS4ERR_UNION_NOTSUPP = 10090, - NFS4ERR_OFFLOAD_DENIED = 10091, - NFS4ERR_WRONG_LFS = 10092, - NFS4ERR_BADLABEL = 10093, - NFS4ERR_OFFLOAD_NO_REQS = 10094, + NFS4ERR_UNION_NOTSUPP = 10090, + NFS4ERR_OFFLOAD_DENIED = 10091, + NFS4ERR_WRONG_LFS = 10092, + NFS4ERR_BADLABEL = 10093, + NFS4ERR_OFFLOAD_NO_REQS = 10094, /* xattr (RFC8276) */ - NFS4ERR_NOXATTR = 10095, - NFS4ERR_XATTR2BIG = 10096, + NFS4ERR_NOXATTR = 10095, + NFS4ERR_XATTR2BIG = 10096, + + /* can be used for internal errors */ + NFS4ERR_FIRST_FREE }; /* error codes for internal client use */ -- cgit v1.2.3 From c4de97f7c45434985e5dbf2d6ccc9eca676e37fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 29 Jul 2024 16:52:32 -0400 Subject: svcrdma: Handle device removal outside of the CM event handler Synchronously wait for all disconnects to complete to ensure the transports have divested all hardware resources before the underlying RDMA device can safely be removed. Reviewed-by: Sagi Grimberg Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d33bab33099a..619fc0bd837a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -76,6 +77,7 @@ struct svcxprt_rdma { struct svc_xprt sc_xprt; /* SVC transport structure */ struct rdma_cm_id *sc_cm_id; /* RDMA connection id */ struct list_head sc_accept_q; /* Conn. waiting accept */ + struct rpcrdma_notification sc_rn; /* removal notification */ int sc_ord; /* RDMA read limit */ int sc_max_send_sges; bool sc_snd_w_inv; /* OK to use Send With Invalidate */ -- cgit v1.2.3 From 4b132aacb0768ac1e652cf517097ea6f237214b9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Sep 2024 14:08:13 -0400 Subject: tools: Add xdrgen Add a Python-based tool for translating XDR specifications into XDR encoder and decoder functions written in the Linux kernel's C coding style. The generator attempts to match the usual C coding style of the Linux kernel's SunRPC consumers. This approach is similar to the netlink code generator in tools/net/ynl . The maintainability benefits of machine-generated XDR code include: - Stronger type checking - Reduces the number of bugs introduced by human error - Makes the XDR code easier to audit and analyze - Enables rapid prototyping of new RPC-based protocols - Hardens the layering between protocol logic and marshaling - Makes it easier to add observability on demand - Unit tests might be built for both the tool and (automatically) for the generated code In addition, converting the XDR layer to use memory-safe languages such as Rust will be easier if much of the code can be converted automatically. Tested-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdrgen/_builtins.h | 243 ++++++++++++++++++++++++++++++++ include/linux/sunrpc/xdrgen/_defs.h | 26 ++++ 2 files changed, 269 insertions(+) create mode 100644 include/linux/sunrpc/xdrgen/_builtins.h create mode 100644 include/linux/sunrpc/xdrgen/_defs.h (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdrgen/_builtins.h b/include/linux/sunrpc/xdrgen/_builtins.h new file mode 100644 index 000000000000..68746c59fc9a --- /dev/null +++ b/include/linux/sunrpc/xdrgen/_builtins.h @@ -0,0 +1,243 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Oracle and/or its affiliates. + * + * This header defines XDR data type primitives specified in + * Section 4 of RFC 4506, used by RPC programs implemented + * in the Linux kernel. + */ + +#ifndef _SUNRPC_XDRGEN__BUILTINS_H_ +#define _SUNRPC_XDRGEN__BUILTINS_H_ + +#include + +static inline bool +xdrgen_decode_void(struct xdr_stream *xdr) +{ + return true; +} + +static inline bool +xdrgen_encode_void(struct xdr_stream *xdr) +{ + return true; +} + +static inline bool +xdrgen_decode_bool(struct xdr_stream *xdr, bool *ptr) +{ + __be32 *p = xdr_inline_decode(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *ptr = (*p != xdr_zero); + return true; +} + +static inline bool +xdrgen_encode_bool(struct xdr_stream *xdr, bool val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *p = val ? xdr_one : xdr_zero; + return true; +} + +static inline bool +xdrgen_decode_int(struct xdr_stream *xdr, s32 *ptr) +{ + __be32 *p = xdr_inline_decode(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *ptr = be32_to_cpup(p); + return true; +} + +static inline bool +xdrgen_encode_int(struct xdr_stream *xdr, s32 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *p = cpu_to_be32(val); + return true; +} + +static inline bool +xdrgen_decode_unsigned_int(struct xdr_stream *xdr, u32 *ptr) +{ + __be32 *p = xdr_inline_decode(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *ptr = be32_to_cpup(p); + return true; +} + +static inline bool +xdrgen_encode_unsigned_int(struct xdr_stream *xdr, u32 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *p = cpu_to_be32(val); + return true; +} + +static inline bool +xdrgen_decode_long(struct xdr_stream *xdr, s32 *ptr) +{ + __be32 *p = xdr_inline_decode(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *ptr = be32_to_cpup(p); + return true; +} + +static inline bool +xdrgen_encode_long(struct xdr_stream *xdr, s32 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *p = cpu_to_be32(val); + return true; +} + +static inline bool +xdrgen_decode_unsigned_long(struct xdr_stream *xdr, u32 *ptr) +{ + __be32 *p = xdr_inline_decode(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *ptr = be32_to_cpup(p); + return true; +} + +static inline bool +xdrgen_encode_unsigned_long(struct xdr_stream *xdr, u32 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(!p)) + return false; + *p = cpu_to_be32(val); + return true; +} + +static inline bool +xdrgen_decode_hyper(struct xdr_stream *xdr, s64 *ptr) +{ + __be32 *p = xdr_inline_decode(xdr, XDR_UNIT * 2); + + if (unlikely(!p)) + return false; + *ptr = get_unaligned_be64(p); + return true; +} + +static inline bool +xdrgen_encode_hyper(struct xdr_stream *xdr, s64 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2); + + if (unlikely(!p)) + return false; + put_unaligned_be64(val, p); + return true; +} + +static inline bool +xdrgen_decode_unsigned_hyper(struct xdr_stream *xdr, u64 *ptr) +{ + __be32 *p = xdr_inline_decode(xdr, XDR_UNIT * 2); + + if (unlikely(!p)) + return false; + *ptr = get_unaligned_be64(p); + return true; +} + +static inline bool +xdrgen_encode_unsigned_hyper(struct xdr_stream *xdr, u64 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2); + + if (unlikely(!p)) + return false; + put_unaligned_be64(val, p); + return true; +} + +static inline bool +xdrgen_decode_string(struct xdr_stream *xdr, string *ptr, u32 maxlen) +{ + __be32 *p; + u32 len; + + if (unlikely(xdr_stream_decode_u32(xdr, &len) != XDR_UNIT)) + return false; + if (unlikely(maxlen && len > maxlen)) + return false; + if (len != 0) { + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return false; + ptr->data = (unsigned char *)p; + } + ptr->len = len; + return true; +} + +static inline bool +xdrgen_encode_string(struct xdr_stream *xdr, string val, u32 maxlen) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(val.len)); + + if (unlikely(!p)) + return false; + xdr_encode_opaque(p, val.data, val.len); + return true; +} + +static inline bool +xdrgen_decode_opaque(struct xdr_stream *xdr, opaque *ptr, u32 maxlen) +{ + __be32 *p; + u32 len; + + if (unlikely(xdr_stream_decode_u32(xdr, &len) != XDR_UNIT)) + return false; + if (unlikely(maxlen && len > maxlen)) + return false; + if (len != 0) { + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return false; + ptr->data = (u8 *)p; + } + ptr->len = len; + return true; +} + +static inline bool +xdrgen_encode_opaque(struct xdr_stream *xdr, opaque val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(val.len)); + + if (unlikely(!p)) + return false; + xdr_encode_opaque(p, val.data, val.len); + return true; +} + +#endif /* _SUNRPC_XDRGEN__BUILTINS_H_ */ diff --git a/include/linux/sunrpc/xdrgen/_defs.h b/include/linux/sunrpc/xdrgen/_defs.h new file mode 100644 index 000000000000..be9e62371758 --- /dev/null +++ b/include/linux/sunrpc/xdrgen/_defs.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 Oracle and/or its affiliates. + * + * This header defines XDR data type primitives specified in + * Section 4 of RFC 4506, used by RPC programs implemented + * in the Linux kernel. + */ + +#ifndef _SUNRPC_XDRGEN__DEFS_H_ +#define _SUNRPC_XDRGEN__DEFS_H_ + +#define TRUE (true) +#define FALSE (false) + +typedef struct { + u32 len; + unsigned char *data; +} string; + +typedef struct { + u32 len; + u8 *data; +} opaque; + +#endif /* _SUNRPC_XDRGEN__DEFS_H_ */ -- cgit v1.2.3 From 663ad8b1df8724cd5e01df66ea67ce0424fbcdf6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Sep 2024 15:31:19 -0400 Subject: xdrgen: Fix return code checking in built-in XDR decoders xdr_stream_encode_u32() returns XDR_UNIT on success. xdr_stream_decode_u32() returns zero or -EMSGSIZE, but never XDR_UNIT. Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdrgen/_builtins.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdrgen/_builtins.h b/include/linux/sunrpc/xdrgen/_builtins.h index 68746c59fc9a..66ca3ece951a 100644 --- a/include/linux/sunrpc/xdrgen/_builtins.h +++ b/include/linux/sunrpc/xdrgen/_builtins.h @@ -184,7 +184,7 @@ xdrgen_decode_string(struct xdr_stream *xdr, string *ptr, u32 maxlen) __be32 *p; u32 len; - if (unlikely(xdr_stream_decode_u32(xdr, &len) != XDR_UNIT)) + if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) return false; if (unlikely(maxlen && len > maxlen)) return false; @@ -215,7 +215,7 @@ xdrgen_decode_opaque(struct xdr_stream *xdr, opaque *ptr, u32 maxlen) __be32 *p; u32 len; - if (unlikely(xdr_stream_decode_u32(xdr, &len) != XDR_UNIT)) + if (unlikely(xdr_stream_decode_u32(xdr, &len) < 0)) return false; if (unlikely(maxlen && len > maxlen)) return false; -- cgit v1.2.3 From bb0e391975f8da826305cbaa3e3d34b03c47e2a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 Sep 2024 09:10:17 +0200 Subject: dma-mapping: fix vmap and mmap of noncontiougs allocations Commit b5c58b2fdc42 ("dma-mapping: direct calls for dma-iommu") switched to use direct calls to dma-iommu, but missed the dma_vmap_noncontiguous, dma_vunmap_noncontiguous and dma_mmap_noncontiguous behavior keyed off the presence of the alloc_noncontiguous method. Fix this by removing the now unused alloc_noncontiguous and free_noncontiguous methods and moving the vmapping and mmaping of the noncontiguous allocations into the iommu code, as it is the only provider of actually noncontiguous allocations. Fixes: b5c58b2fdc42 ("dma-mapping: direct calls for dma-iommu") Reported-by: Xi Ruoyao Signed-off-by: Christoph Hellwig Reviewed-by: Leon Romanovsky Tested-by: Xi Ruoyao --- include/linux/dma-map-ops.h | 19 ------------------- include/linux/iommu-dma.h | 6 ++++++ 2 files changed, 6 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 9668ddf3696e..b7773201414c 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -24,11 +24,6 @@ struct dma_map_ops { gfp_t gfp); void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir); - struct sg_table *(*alloc_noncontiguous)(struct device *dev, size_t size, - enum dma_data_direction dir, gfp_t gfp, - unsigned long attrs); - void (*free_noncontiguous)(struct device *dev, size_t size, - struct sg_table *sgt, enum dma_data_direction dir); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); @@ -206,20 +201,6 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, } #endif /* CONFIG_DMA_GLOBAL_POOL */ -/* - * This is the actual return value from the ->alloc_noncontiguous method. - * The users of the DMA API should only care about the sg_table, but to make - * the DMA-API internal vmaping and freeing easier we stash away the page - * array as well (except for the fallback case). This can go away any time, - * e.g. when a vmap-variant that takes a scatterlist comes along. - */ -struct dma_sgt_handle { - struct sg_table sgt; - struct page **pages; -}; -#define sgt_handle(sgt) \ - container_of((sgt), struct dma_sgt_handle, sgt) - int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index 1bb55ca1ab79..7bf145a52d6a 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -44,6 +44,12 @@ struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void iommu_dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir); +void *iommu_dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt); +#define iommu_dma_vunmap_noncontiguous(dev, vaddr) \ + vunmap(vaddr); +int iommu_dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt); void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, -- cgit v1.2.3 From 3d09ff45469eb2912ce2227c47efac297230d6d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 Sep 2024 09:21:10 +0200 Subject: iommu/dma: remove most stubs in iommu-dma.h The direct calls from mapping.c all guarded by use_dma_iommu(), so don't bother to provide stubs, but instead just expose the prototypes unconditionally. Signed-off-by: Christoph Hellwig Reviewed-by: Leon Romanovsky --- include/linux/iommu-dma.h | 108 ++++------------------------------------------ 1 file changed, 8 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iommu-dma.h b/include/linux/iommu-dma.h index 7bf145a52d6a..508beaa44c39 100644 --- a/include/linux/iommu-dma.h +++ b/include/linux/iommu-dma.h @@ -14,6 +14,13 @@ static inline bool use_dma_iommu(struct device *dev) { return dev->dma_iommu; } +#else +static inline bool use_dma_iommu(struct device *dev) +{ + return false; +} +#endif /* CONFIG_IOMMU_DMA */ + dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); @@ -58,104 +65,5 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir); void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nelems, enum dma_data_direction dir); -#else -static inline bool use_dma_iommu(struct device *dev) -{ - return false; -} -static inline dma_addr_t iommu_dma_map_page(struct device *dev, - struct page *page, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - return DMA_MAPPING_ERROR; -} -static inline void iommu_dma_unmap_page(struct device *dev, - dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ -} -static inline int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - return -EINVAL; -} -static inline void iommu_dma_unmap_sg(struct device *dev, - struct scatterlist *sg, int nents, enum dma_data_direction dir, - unsigned long attrs) -{ -} -static inline void *iommu_dma_alloc(struct device *dev, size_t size, - dma_addr_t *handle, gfp_t gfp, unsigned long attrs) -{ - return NULL; -} -static inline int iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) -{ - return -EINVAL; -} -static inline int iommu_dma_get_sgtable(struct device *dev, - struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, - size_t size, unsigned long attrs) -{ - return -EINVAL; -} -static inline unsigned long iommu_dma_get_merge_boundary(struct device *dev) -{ - return 0; -} -static inline size_t iommu_dma_opt_mapping_size(void) -{ - return 0; -} -static inline size_t iommu_dma_max_mapping_size(struct device *dev) -{ - return 0; -} -static inline void iommu_dma_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t handle, unsigned long attrs) -{ -} -static inline dma_addr_t iommu_dma_map_resource(struct device *dev, - phys_addr_t phys, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - return DMA_MAPPING_ERROR; -} -static inline void iommu_dma_unmap_resource(struct device *dev, - dma_addr_t handle, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ -} -static inline struct sg_table * -iommu_dma_alloc_noncontiguous(struct device *dev, size_t size, - enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) -{ - return NULL; -} -static inline void iommu_dma_free_noncontiguous(struct device *dev, size_t size, - struct sg_table *sgt, enum dma_data_direction dir) -{ -} -static inline void iommu_dma_sync_single_for_cpu(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction dir) -{ -} -static inline void iommu_dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) -{ -} -static inline void iommu_dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ -} -static inline void iommu_dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nelems, - enum dma_data_direction dir) -{ -} -#endif /* CONFIG_IOMMU_DMA */ + #endif /* _LINUX_IOMMU_DMA_H */ -- cgit v1.2.3 From d0dd066a0fa26d55c19ace9e89dedd9504c5bcba Mon Sep 17 00:00:00 2001 From: "Christoph Lameter (Ampere)" Date: Wed, 12 Jun 2024 09:49:56 -0700 Subject: seqcount: replace smp_rmb() in read_seqcount() with load acquire Many architectures support load acquire which can replace a memory barrier and save some cycles. A typical sequence do { seq = read_seqcount_begin(&s); } while (read_seqcount_retry(&s, seq); requires 13 cycles on an N1 Neoverse arm64 core (Ampere Altra, to be specific) for an empty loop. Two read memory barriers are needed. One for each of the seqcount_* functions. We can replace the first read barrier with a load acquire of the seqcount which saves us one barrier. On the Altra doing so reduces the cycle count from 13 to 8. According to ARM, this is a general improvement for the ARM64 architecture and not specific to a certain processor. See https://developer.arm.com/documentation/102336/0100/Load-Acquire-and-Store-Release-instructions "Weaker ordering requirements that are imposed by Load-Acquire and Store-Release instructions allow for micro-architectural optimizations, which could reduce some of the performance impacts that are otherwise imposed by an explicit memory barrier. If the ordering requirement is satisfied using either a Load-Acquire or Store-Release, then it would be preferable to use these instructions instead of a DMB" [ NOTE! This is my original minimal patch that unconditionally switches over to using smp_load_acquire(), instead of the much more involved and subtle patch that Christoph Lameter wrote that made it conditional. But Christoph gets authorship credit because I had initially thought that we needed the more complex model, and Christoph ran with it it and did the work. Only after looking at code generation for all the relevant architectures, did I come to the conclusion that nobody actually really needs the old "smp_rmb()" model. Even architectures without load-acquire support generally do as well or better with smp_load_acquire(). So credit to Christoph, but if this then causes issues on other architectures, put the blame solidly on me. Also note as part of the ruthless simplification, this gets rid of the overly subtle optimization where some code uses a non-barrier version of the sequence count (see the __read_seqcount_begin() users in fs/namei.c). They then play games with their own barriers and/or with nested sequence counts. Those optimizations are literally meaningless on x86, and questionable elsewhere. If somebody can show that they matter, we need to re-do them more cleanly than "use an internal helper". - Linus ] Signed-off-by: Christoph Lameter (Ampere) Link: https://lore.kernel.org/all/20240912-seq_optimize-v3-1-8ee25e04dffa@gentwo.org/ Signed-off-by: Linus Torvalds --- include/linux/seqlock.h | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index d90d8ee29d81..fffeb754880f 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -157,7 +157,7 @@ __seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \ static __always_inline unsigned \ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ { \ - unsigned seq = READ_ONCE(s->seqcount.sequence); \ + unsigned seq = smp_load_acquire(&s->seqcount.sequence); \ \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return seq; \ @@ -170,7 +170,7 @@ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ * Re-read the sequence counter since the (possibly \ * preempted) writer made progress. \ */ \ - seq = READ_ONCE(s->seqcount.sequence); \ + seq = smp_load_acquire(&s->seqcount.sequence); \ } \ \ return seq; \ @@ -208,7 +208,7 @@ static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s) static inline unsigned __seqprop_sequence(const seqcount_t *s) { - return READ_ONCE(s->sequence); + return smp_load_acquire(&s->sequence); } static inline bool __seqprop_preemptible(const seqcount_t *s) @@ -263,17 +263,9 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) #define seqprop_assert(s) __seqprop(s, assert)(s) /** - * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier + * __read_seqcount_begin() - begin a seqcount_t read section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * - * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb() - * barrier. Callers should ensure that smp_rmb() or equivalent ordering is - * provided before actually loading any of the variables that are to be - * protected in this critical section. - * - * Use carefully, only in critical code, and comment how the barrier is - * provided. - * * Return: count to be passed to read_seqcount_retry() */ #define __read_seqcount_begin(s) \ @@ -293,13 +285,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) * * Return: count to be passed to read_seqcount_retry() */ -#define raw_read_seqcount_begin(s) \ -({ \ - unsigned _seq = __read_seqcount_begin(s); \ - \ - smp_rmb(); \ - _seq; \ -}) +#define raw_read_seqcount_begin(s) __read_seqcount_begin(s) /** * read_seqcount_begin() - begin a seqcount_t read critical section @@ -328,7 +314,6 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) ({ \ unsigned __seq = seqprop_sequence(s); \ \ - smp_rmb(); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) -- cgit v1.2.3 From ffcdc4c628e1a30489da10dd78358e89c823b341 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 6 Sep 2024 16:34:52 +0200 Subject: fs/mnt_idmapping: introduce an invalid_mnt_idmap Link: https://lore.kernel.org/linux-fsdevel/20240904-baugrube-erhoben-b3c1c49a2645@brauner/ Suggested-by: Christian Brauner Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- include/linux/mnt_idmapping.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h index cd4d5c8781f5..b1b219bc3422 100644 --- a/include/linux/mnt_idmapping.h +++ b/include/linux/mnt_idmapping.h @@ -9,6 +9,7 @@ struct mnt_idmap; struct user_namespace; extern struct mnt_idmap nop_mnt_idmap; +extern struct mnt_idmap invalid_mnt_idmap; extern struct user_namespace init_user_ns; typedef struct { -- cgit v1.2.3 From f8eb5bd9a818cc5f2a1e50b22b0091830b28cc36 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 23 Sep 2024 08:58:31 -0700 Subject: mm: fix build on 32-bit targets without MAX_PHYSMEM_BITS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The merge resolution to deal with the conflict between commits ea72ce5da228 ("x86/kaslr: Expose and use the end of the physical memory address space") and 99185c10d5d9 ("resource, kunit: add test case for region_intersects()") ended up being broken in configurations didn't define a MAX_PHYSMEM_BITS and that had a 32-bit 'phys_addr_t'. The fallback to using all bits set (ie "(-1ULL)") ended up causing a build error: kernel/resource.c: In function ‘gfr_start’: include/linux/minmax.h:93:30: error: conversion from ‘long long unsigned int’ to ‘resource_size_t’ {aka ‘unsigned int’} changes value from ‘18446744073709551615’ to ‘4294967295’ [-Werror=overflow] this was reported by Geert for m68k, but he points out that it happens on other 32-bit architectures too, eg mips, xtensa, parisc, and powerpc. Limiting 'PHYSMEM_END' to a 'phys_addr_t' (which is the same as 'resource_size_t') fixes the build, but Geert points out that it will then cause a silent overflow in mm/sparse.c: unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT; so we actually do want PHYSMEM_END to be defined a 64-bit type - just not all ones, and not larger than 'phys_addr_t'. The proper fix is probably to not have some kind of default fallback at all, but just make sure every architecture has a valid MAX_PHYSMEM_BITS. But in the meantime, this just applies the rule that PHYSMEM_END is the largest value that fits in a 'phys_addr_t', but does not have the high bit set in 64 bits. Ugly, ugly. Reported-by: Geert Uytterhoeven Cc: Andrew Morton Cc: Huang Ying Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b62437447077..ecf63d2b0582 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -101,7 +101,7 @@ extern int mmap_rnd_compat_bits __read_mostly; # ifdef MAX_PHYSMEM_BITS # define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else -# define PHYSMEM_END (-1ULL) +# define PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif -- cgit v1.2.3 From d98f72272500f505cd7e152ffa456e64ee3855f0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Sep 2024 12:32:03 +1000 Subject: nfs: simplify and guarantee owner uniqueness. I have evidence of an Linux NFS client getting NFS4ERR_BAD_SEQID to a v4.0 LOCK request to a Linux server (which had fixed the problem with RELEASE_LOCKOWNER bug fixed). The LOCK request presented a "new" lock owner so there are two seq ids in the request: that for the open file, and that for the new lock. Given the context I am confident that the new lock owner was reported to have the wrong seqid. As lock owner identifiers are reused, the server must still have a lock owner active which the client thinks is no longer active. I wasn't able to determine a root-cause but the simplest fix seems to be to ensure lock owners are always unique much as open owners are (thanks to a time stamp). The easiest way to ensure uniqueness is with a 64bit counter for each server. That will never cycle (if updated once a nanosecond the last 584 years. A single NFS server would not handle open/lock requests nearly that fast, and a Linux node is unlikely to have an uptime approaching that). This patch removes the 2 ida and instead uses a per-server atomic64_t to provide uniqueness. Note that the lock owner already encodes the id as 64 bits even though it is a 32bit value. So changing to a 64bit value does not change the encoding of the lock owner. The open owner encoding is now 4 bytes larger. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 3 +-- include/linux/nfs_xdr.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 1df86ab98c77..e1e47ebd83ef 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -234,8 +234,7 @@ struct nfs_server { /* the following fields are protected by nfs_client->cl_lock */ struct rb_root state_owners; #endif - struct ida openowner_id; - struct ida lockowner_id; + atomic64_t owner_ctr; struct list_head state_owners_lru; struct list_head layouts; struct list_head delegations; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 45623af3e7b8..96ba04ab24f3 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -446,7 +446,7 @@ struct nfs42_clone_res { struct stateowner_id { __u64 create_time; - __u32 uniquifier; + __u64 uniquifier; }; struct nfs4_open_delegation { -- cgit v1.2.3 From 0b108e83795c9c23101f584ef7e3ab4f1f120ef0 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 19 Aug 2024 08:58:59 -0700 Subject: SUNRPC: convert RPC_TASK_* constants to enum The RPC_TASK_* constants are defined as macros, which means that most kernel builds will not contain their definitions in the debuginfo. However, it's quite useful for debuggers to be able to view the task state constant and interpret it correctly. Conversion to an enum will ensure the constants are present in debuginfo and can be interpreted by debuggers without needing to hard-code them and track their changes. Signed-off-by: Stephen Brennan Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 0c77ba488bba..fec1e8a1570c 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -151,13 +151,15 @@ struct rpc_task_setup { #define RPC_WAS_SENT(t) ((t)->tk_flags & RPC_TASK_SENT) #define RPC_IS_MOVEABLE(t) ((t)->tk_flags & RPC_TASK_MOVEABLE) -#define RPC_TASK_RUNNING 0 -#define RPC_TASK_QUEUED 1 -#define RPC_TASK_ACTIVE 2 -#define RPC_TASK_NEED_XMIT 3 -#define RPC_TASK_NEED_RECV 4 -#define RPC_TASK_MSG_PIN_WAIT 5 -#define RPC_TASK_SIGNALLED 6 +enum { + RPC_TASK_RUNNING, + RPC_TASK_QUEUED, + RPC_TASK_ACTIVE, + RPC_TASK_NEED_XMIT, + RPC_TASK_NEED_RECV, + RPC_TASK_MSG_PIN_WAIT, + RPC_TASK_SIGNALLED, +}; #define rpc_test_and_set_running(t) \ test_and_set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) -- cgit v1.2.3 From dfb07e990a0d019d7ae9b78dd4260620ce32e79a Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Wed, 24 Jul 2024 14:07:12 +0300 Subject: nfs: add 'noalignwrite' option for lock-less 'lost writes' prevention There are some applications that write to predefined non-overlapping file offsets from multiple clients and therefore don't need to rely on file locking. However, if these applications want non-aligned offsets and sizes they need to either use locks or risk data corruption, as the NFS client defaults to extending writes to whole pages. This commit adds a new mount option `noalignwrite`, which allows to turn that off and avoid the need of locking, as long as these applications don't overlap on offsets. Signed-off-by: Dan Aloni Reviewed-by: Jeff Layton Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Anna Schumaker --- include/linux/nfs_fs_sb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index e1e47ebd83ef..c49bfdded5c1 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -158,6 +158,7 @@ struct nfs_server { #define NFS_MOUNT_WRITE_WAIT 0x02000000 #define NFS_MOUNT_TRUNK_DISCOVERY 0x04000000 #define NFS_MOUNT_SHUTDOWN 0x08000000 +#define NFS_MOUNT_NO_ALIGNWRITE 0x10000000 unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ -- cgit v1.2.3 From 4806ded4c14c5e8fdc6ce885d83221a78c06a428 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 5 Sep 2024 15:09:35 -0400 Subject: nfs_common: factor out nfs_errtbl and nfs_stat_to_errno Common nfs_stat_to_errno() is used by both fs/nfs/nfs2xdr.c and fs/nfs/nfs3xdr.c Will also be used by fs/nfsd/localio.c Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/nfs_common.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/nfs_common.h (limited to 'include/linux') diff --git a/include/linux/nfs_common.h b/include/linux/nfs_common.h new file mode 100644 index 000000000000..3395c4a4d372 --- /dev/null +++ b/include/linux/nfs_common.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This file contains constants and methods used by both NFS client and server. + */ +#ifndef _LINUX_NFS_COMMON_H +#define _LINUX_NFS_COMMON_H + +#include +#include + +/* Mapping from NFS error code to "errno" error code. */ +#define errno_NFSERR_IO EIO + +int nfs_stat_to_errno(enum nfs_stat status); + +#endif /* _LINUX_NFS_COMMON_H */ -- cgit v1.2.3 From 1fcb16674e37e434efe68ec3e142229f35b6b9e1 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 5 Sep 2024 15:09:36 -0400 Subject: nfs_common: factor out nfs4_errtbl and nfs4_stat_to_errno Common nfs4_stat_to_errno() is used by fs/nfs/nfs4xdr.c and will be used by fs/nfs/localio.c Signed-off-by: Mike Snitzer Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/nfs_common.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs_common.h b/include/linux/nfs_common.h index 3395c4a4d372..5fc02df88252 100644 --- a/include/linux/nfs_common.h +++ b/include/linux/nfs_common.h @@ -12,5 +12,6 @@ #define errno_NFSERR_IO EIO int nfs_stat_to_errno(enum nfs_stat status); +int nfs4_stat_to_errno(int stat); #endif /* _LINUX_NFS_COMMON_H */ -- cgit v1.2.3 From 1545e488b1f908b10f6dff0c278c6b7a37122de8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 5 Sep 2024 15:09:37 -0400 Subject: nfs: factor out {encode,decode}_opaque_fixed to nfs_xdr.h Eliminates duplicate functions in various files to allow for additional callers. Signed-off-by: Mike Snitzer Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs_xdr.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 96ba04ab24f3..12d8e47bc5a3 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1853,6 +1853,24 @@ struct nfs_rpc_ops { void (*disable_swap)(struct inode *inode); }; +/* + * Helper functions used by NFS client and/or server + */ +static inline void encode_opaque_fixed(struct xdr_stream *xdr, + const void *buf, size_t len) +{ + WARN_ON_ONCE(xdr_stream_encode_opaque_fixed(xdr, buf, len) < 0); +} + +static inline int decode_opaque_fixed(struct xdr_stream *xdr, + void *buf, size_t len) +{ + ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len); + if (unlikely(ret < 0)) + return -EIO; + return 0; +} + /* * Function vectors etc. for the NFS client */ @@ -1866,4 +1884,4 @@ extern const struct rpc_version nfs_version4; extern const struct rpc_version nfsacl_version3; extern const struct rpc_program nfsacl_program; -#endif +#endif /* _LINUX_NFS_XDR_H */ -- cgit v1.2.3 From 199f2128741077087a2ab33889a6868830465033 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 5 Sep 2024 15:09:46 -0400 Subject: SUNRPC: add svcauth_map_clnt_to_svc_cred_local Add new funtion svcauth_map_clnt_to_svc_cred_local which maps a generic cred to a svc_cred suitable for use in nfsd. This is needed by the localio code to map nfs client creds to nfs server credentials. Following from net/sunrpc/auth_unix.c:unx_marshal() it is clear that ->fsuid and ->fsgid must be used (rather than ->uid and ->gid). In addition, these uid and gid must be translated with from_kuid_munged() so local client uses correct uid and gid when acting as local server. Jeff Layton noted: This is where the magic happens. Since we're working in kuid_t/kgid_t, we don't need to worry about further idmapping. Suggested-by: NeilBrown # to approximate unx_marshal() Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust Co-developed-by: Mike Snitzer Signed-off-by: Mike Snitzer Reviewed-by: Chuck Lever Reviewed-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svcauth.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index 63cf6fb26dcc..2e111153f7cd 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -157,6 +158,10 @@ extern enum svc_auth_status svc_set_client(struct svc_rqst *rqstp); extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); extern void svc_auth_unregister(rpc_authflavor_t flavor); +extern void svcauth_map_clnt_to_svc_cred_local(struct rpc_clnt *clnt, + const struct cred *, + struct svc_cred *); + extern struct auth_domain *unix_domain_find(char *name); extern void auth_domain_put(struct auth_domain *item); extern struct auth_domain *auth_domain_lookup(char *name, struct auth_domain *new); -- cgit v1.2.3 From 86ab08beb3f07f6e51922a8b8f662a5ec7012d35 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 5 Sep 2024 15:09:47 -0400 Subject: SUNRPC: replace program list with program array A service created with svc_create_pooled() can be given a linked list of programs and all of these will be served. Using a linked list makes it cumbersome when there are several programs that can be optionally selected with CONFIG settings. After this patch is applied, API consumers must use only svc_create_pooled() when creating an RPC service that listens for more than one RPC program. Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer Acked-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index c419a61f60e5..e68fecf6eab5 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -67,9 +67,10 @@ enum { * We currently do not support more than one RPC program per daemon. */ struct svc_serv { - struct svc_program * sv_program; /* RPC program */ + struct svc_program * sv_programs; /* RPC programs */ struct svc_stat * sv_stats; /* RPC statistics */ spinlock_t sv_lock; + unsigned int sv_nprogs; /* Number of sv_programs */ unsigned int sv_nrthreads; /* # of server threads */ unsigned int sv_maxconn; /* max connections allowed or * '0' causing max to be based @@ -360,10 +361,9 @@ struct svc_process_info { }; /* - * List of RPC programs on the same transport endpoint + * RPC program - an array of these can use the same transport endpoint */ struct svc_program { - struct svc_program * pg_next; /* other programs (same xprt) */ u32 pg_prog; /* program number */ unsigned int pg_lovers; /* lowest version */ unsigned int pg_hivers; /* highest version */ @@ -441,6 +441,7 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, void svc_rqst_release_pages(struct svc_rqst *rqstp); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *prog, + unsigned int nprog, struct svc_stat *stats, unsigned int bufsize, int (*threadfn)(void *data)); -- cgit v1.2.3 From 2a33a85be45178198245e1f656e6224c899895e4 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 5 Sep 2024 15:09:48 -0400 Subject: nfs_common: add NFS LOCALIO auxiliary protocol enablement fs/nfs_common/nfslocalio.c provides interfaces that enable an NFS client to generate a nonce (single-use UUID) and associated nfs_uuid_t struct, register it with nfs_common for subsequent lookup and verification by the NFS server and if matched the NFS server populates members in the nfs_uuid_t struct. nfs_common's nfs_uuids list is the basis for localio enablement, as such it has members that point to nfsd memory for direct use by the client (e.g. 'net' is the server's network namespace, through it the client can access nn->nfsd_serv). This commit also provides the base nfs_uuid_t interfaces to allow proper net namespace refcounting for the LOCALIO use case. CONFIG_NFS_LOCALIO controls the nfs_common, NFS server and NFS client enablement for LOCALIO. If both NFS_FS=m and NFSD=m then NFS_COMMON_LOCALIO_SUPPORT=m and nfs_localio.ko is built (and provides nfs_common's LOCALIO support). # lsmod | grep nfs_localio nfs_localio 12288 2 nfsd,nfs sunrpc 745472 35 nfs_localio,nfsd,auth_rpcgss,lockd,nfsv3,nfs Signed-off-by: Mike Snitzer Co-developed-by: NeilBrown Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 include/linux/nfslocalio.h (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h new file mode 100644 index 000000000000..4165ff8390c1 --- /dev/null +++ b/include/linux/nfslocalio.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024 Mike Snitzer + * Copyright (C) 2024 NeilBrown + */ +#ifndef __LINUX_NFSLOCALIO_H +#define __LINUX_NFSLOCALIO_H + +#include +#include +#include +#include +#include +#include + +/* + * Useful to allow a client to negotiate if localio + * possible with its server. + * + * See Documentation/filesystems/nfs/localio.rst for more detail. + */ +typedef struct { + uuid_t uuid; + struct list_head list; + struct net *net; /* nfsd's network namespace */ + struct auth_domain *dom; /* auth_domain for localio */ +} nfs_uuid_t; + +void nfs_uuid_begin(nfs_uuid_t *); +void nfs_uuid_end(nfs_uuid_t *); +void nfs_uuid_is_local(const uuid_t *, struct list_head *, + struct net *, struct auth_domain *, struct module *); +void nfs_uuid_invalidate_clients(struct list_head *list); +void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid); + +#endif /* __LINUX_NFSLOCALIO_H */ -- cgit v1.2.3 From a61e147e6be6e763d9c6dec8061d2893c0bb3423 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 5 Sep 2024 15:09:49 -0400 Subject: nfs_common: prepare for the NFS client to use nfsd_file for LOCALIO The next commit will introduce nfsd_open_local_fh() which returns an nfsd_file structure. This commit exposes LOCALIO's required NFSD symbols to the NFS client: - Make nfsd_open_local_fh() symbol and other required NFSD symbols available to NFS in a global 'nfs_to' nfsd_localio_operations struct (global access suggested by Trond, nfsd_localio_operations suggested by NeilBrown). The next commit will also introduce nfsd_localio_ops_init() that init_nfsd() will call to initialize 'nfs_to'. - Introduce nfsd_file_file() that provides access to nfsd_file's backing file. Keeps nfsd_file structure opaque to NFS client (as suggested by Jeff Layton). - Introduce nfsd_file_put_local() that will put the reference to the nfsd_file's associated nn->nfsd_serv and then put the reference to the nfsd_file (as suggested by NeilBrown). Suggested-by: Trond Myklebust # nfs_to Suggested-by: NeilBrown # nfsd_localio_operations Suggested-by: Jeff Layton # nfsd_file_file Signed-off-by: Mike Snitzer Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 4165ff8390c1..1a39d7d727bd 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -22,7 +23,7 @@ typedef struct { uuid_t uuid; struct list_head list; - struct net *net; /* nfsd's network namespace */ + struct net __rcu *net; /* nfsd's network namespace */ struct auth_domain *dom; /* auth_domain for localio */ } nfs_uuid_t; @@ -33,4 +34,28 @@ void nfs_uuid_is_local(const uuid_t *, struct list_head *, void nfs_uuid_invalidate_clients(struct list_head *list); void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid); +struct nfsd_file; + +/* localio needs to map filehandle -> struct nfsd_file */ +extern struct nfsd_file * +nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *, + const struct cred *, const struct nfs_fh *, + const fmode_t) __must_hold(rcu); + +struct nfsd_localio_operations { + bool (*nfsd_serv_try_get)(struct net *); + void (*nfsd_serv_put)(struct net *); + struct nfsd_file *(*nfsd_open_local_fh)(struct net *, + struct auth_domain *, + struct rpc_clnt *, + const struct cred *, + const struct nfs_fh *, + const fmode_t); + void (*nfsd_file_put_local)(struct nfsd_file *); + struct file *(*nfsd_file_file)(struct nfsd_file *); +} ____cacheline_aligned; + +extern void nfsd_localio_ops_init(void); +extern const struct nfsd_localio_operations *nfs_to; + #endif /* __LINUX_NFSLOCALIO_H */ -- cgit v1.2.3 From fa4983862e506d395acc1b8d14dbebf63acc2e82 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 5 Sep 2024 15:09:50 -0400 Subject: nfsd: add LOCALIO support Add server support for bypassing NFS for localhost reads, writes, and commits. This is only useful when both the client and server are running on the same host. If nfsd_open_local_fh() fails then the NFS client will both retry and fallback to normal network-based read, write and commit operations if localio is no longer supported. Care is taken to ensure the same NFS security mechanisms are used (authentication, etc) regardless of whether localio or regular NFS access is used. The auth_domain established as part of the traditional NFS client access to the NFS server is also used for localio. Store auth_domain for localio in nfsd_uuid_t and transfer it to the client if it is local to the server. Relative to containers, localio gives the client access to the network namespace the server has. This is required to allow the client to access the server's per-namespace nfsd_net struct. This commit also introduces the use of NFSD's percpu_ref to interlock nfsd_destroy_serv and nfsd_open_local_fh, to ensure nn->nfsd_serv is not destroyed while in use by nfsd_open_local_fh and other LOCALIO client code. CONFIG_NFS_LOCALIO enables NFS server support for LOCALIO. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust Co-developed-by: Mike Snitzer Signed-off-by: Mike Snitzer Co-developed-by: NeilBrown Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 1a39d7d727bd..546f46b4e46f 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -6,6 +6,8 @@ #ifndef __LINUX_NFSLOCALIO_H #define __LINUX_NFSLOCALIO_H +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + #include #include #include @@ -58,4 +60,10 @@ struct nfsd_localio_operations { extern void nfsd_localio_ops_init(void); extern const struct nfsd_localio_operations *nfs_to; +#else /* CONFIG_NFS_LOCALIO */ +static inline void nfsd_localio_ops_init(void) +{ +} +#endif /* CONFIG_NFS_LOCALIO */ + #endif /* __LINUX_NFSLOCALIO_H */ -- cgit v1.2.3 From 946af9b3a0e7571c01447107d5e8968401e659ba Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 5 Sep 2024 15:09:51 -0400 Subject: nfsd: implement server support for NFS_LOCALIO_PROGRAM The LOCALIO auxiliary RPC protocol consists of a single "UUID_IS_LOCAL" RPC method that allows the Linux NFS client to verify the local Linux NFS server can see the nonce (single-use UUID) the client generated and made available in nfs_common. The server expects this protocol to use the same transport as NFS and NFSACL for its RPCs. This protocol isn't part of an IETF standard, nor does it need to be considering it is Linux-to-Linux auxiliary RPC protocol that amounts to an implementation detail. The UUID_IS_LOCAL method encodes the client generated uuid_t in terms of the fixed UUID_SIZE (16 bytes). The fixed size opaque encode and decode XDR methods are used instead of the less efficient variable sized methods. The RPC program number for the NFS_LOCALIO_PROGRAM is 400122 (as assigned by IANA, see https://www.iana.org/assignments/rpc-program-numbers/ ): Linux Kernel Organization 400122 nfslocalio Signed-off-by: Mike Snitzer [neilb: factored out and simplified single localio protocol] Co-developed-by: NeilBrown Signed-off-by: NeilBrown Acked-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs.h b/include/linux/nfs.h index ceb70a926b95..73da75908d95 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -13,6 +13,13 @@ #include #include +/* The LOCALIO program is entirely private to Linux and is + * NOT part of the uapi. + */ +#define NFS_LOCALIO_PROGRAM 400122 +#define LOCALIOPROC_NULL 0 +#define LOCALIOPROC_UUID_IS_LOCAL 1 + /* * This is the kernel NFS client file handle representation */ -- cgit v1.2.3 From df24c483e28f7f9a421afde15d0497e61bc2d3ea Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 5 Sep 2024 15:09:52 -0400 Subject: nfs: pass struct nfsd_file to nfs_init_pgio and nfs_init_commit The nfsd_file will be passed, in future commits, by callers that enable LOCALIO support (for both regular NFS and pNFS IO). [Derived from patch authored by Weston Andros Adamson, but switched from passing struct file to struct nfsd_file] Signed-off-by: Mike Snitzer Reviewed-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfslocalio.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 546f46b4e46f..2c4e0fd9da6b 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -6,6 +6,9 @@ #ifndef __LINUX_NFSLOCALIO_H #define __LINUX_NFSLOCALIO_H +/* nfsd_file structure is purposely kept opaque to NFS client */ +struct nfsd_file; + #if IS_ENABLED(CONFIG_NFS_LOCALIO) #include @@ -36,8 +39,6 @@ void nfs_uuid_is_local(const uuid_t *, struct list_head *, void nfs_uuid_invalidate_clients(struct list_head *list); void nfs_uuid_invalidate_one_client(nfs_uuid_t *nfs_uuid); -struct nfsd_file; - /* localio needs to map filehandle -> struct nfsd_file */ extern struct nfsd_file * nfsd_open_local_fh(struct net *, struct auth_domain *, struct rpc_clnt *, -- cgit v1.2.3 From 70ba381e1a431245c137ed597ec6a05991c79bd9 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 5 Sep 2024 15:09:53 -0400 Subject: nfs: add LOCALIO support Add client support for bypassing NFS for localhost reads, writes, and commits. This is only useful when the client and the server are running on the same host. nfs_local_probe() is stubbed out, later commits will enable client and server handshake via a Linux-only LOCALIO auxiliary RPC protocol. This has dynamic binding with the nfsd module (via nfs_localio module which is part of nfs_common). LOCALIO will only work if nfsd is already loaded. The "localio_enabled" nfs kernel module parameter can be used to disable and enable the ability to use LOCALIO support. CONFIG_NFS_LOCALIO enables NFS client support for LOCALIO. Lastly, LOCALIO uses an nfsd_file to initiate all IO. To make proper use of nfsd_file (and nfsd's filecache) its lifetime (duration before nfsd_file_put is called) must extend until after commit, read and write operations. So rather than immediately drop the nfsd_file reference in nfs_local_open_fh(), that doesn't happen until nfs_local_pgio_release() for read/write and not until nfs_local_release_commit_data() for commit. The same applies to the reference held on nfsd's nn->nfsd_serv. Both objects' lifetimes and associated references are managed through calls to nfs_to->nfsd_file_put_local(). Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust Co-developed-by: Mike Snitzer Signed-off-by: Mike Snitzer Signed-off-by: NeilBrown # nfs_open_local_fh Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/nfs.h | 2 ++ include/linux/nfs_fs_sb.h | 9 +++++++++ include/linux/nfslocalio.h | 4 ++++ 3 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 73da75908d95..9ad727ddfedb 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -8,6 +8,8 @@ #ifndef _LINUX_NFS_H #define _LINUX_NFS_H +#include +#include #include #include #include diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index c49bfdded5c1..853df3fcd4c2 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,7 @@ struct nfs_client { #define NFS_CS_DS 7 /* - Server is a DS */ #define NFS_CS_REUSEPORT 8 /* - reuse src port on reconnect */ #define NFS_CS_PNFS 9 /* - Server used for pnfs */ +#define NFS_CS_LOCAL_IO 10 /* - client is local */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ @@ -125,6 +127,13 @@ struct nfs_client { struct net *cl_net; struct list_head pending_cb_stateids; struct rcu_head rcu; + +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + struct timespec64 cl_nfssvc_boot; + seqlock_t cl_boot_lock; + nfs_uuid_t cl_uuid; + spinlock_t cl_localio_lock; +#endif /* CONFIG_NFS_LOCALIO */ }; /* diff --git a/include/linux/nfslocalio.h b/include/linux/nfslocalio.h index 2c4e0fd9da6b..b353abe00357 100644 --- a/include/linux/nfslocalio.h +++ b/include/linux/nfslocalio.h @@ -61,6 +61,10 @@ struct nfsd_localio_operations { extern void nfsd_localio_ops_init(void); extern const struct nfsd_localio_operations *nfs_to; +struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *, + struct rpc_clnt *, const struct cred *, + const struct nfs_fh *, const fmode_t); + #else /* CONFIG_NFS_LOCALIO */ static inline void nfsd_localio_ops_init(void) { -- cgit v1.2.3 From b166b8ab4189743a717cb93f50d6fcca3a46770d Mon Sep 17 00:00:00 2001 From: Jiqian Chen Date: Tue, 24 Sep 2024 14:14:36 +0800 Subject: xen/pvh: Setup gsi for passthrough device In PVH dom0, the gsis don't get registered, but the gsi of a passthrough device must be configured for it to be able to be mapped into a domU. When assigning a device to passthrough, proactively setup the gsi of the device during that process. Signed-off-by: Jiqian Chen Signed-off-by: Huang Rui Signed-off-by: Jiqian Chen Reviewed-by: Stefano Stabellini Message-ID: <20240924061437.2636766-3-Jiqian.Chen@amd.com> Signed-off-by: Juergen Gross --- include/linux/acpi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 0687a442fec7..02ded9f53a6b 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -362,6 +362,7 @@ void acpi_unregister_gsi (u32 gsi); struct pci_dev; +struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin); int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); bool acpi_isa_irq_available(int irq); -- cgit v1.2.3 From 2efddb5575cd9f5f4d61ad417c92365a5f18d2f1 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 25 Sep 2024 10:57:00 +0200 Subject: Revert "driver core: shut down devices asynchronously" This reverts commit 8064952c65045f05ee2671fe437770e50c151776. The series is being reverted before -rc1 as there are still reports of lockups on shutdown, so it's not quite ready for "prime time." Reported-by: Andrey Skvortsov Link: https://lore.kernel.org/r/ZvMkkhyJrohaajuk@skv.local Cc: Christoph Hellwig Cc: David Jeffery Cc: Keith Busch Cc: Laurence Oberman Cc: Nathan Chancellor Cc: Sagi Grimberg Cc: Stuart Hayes Signed-off-by: Greg Kroah-Hartman --- include/linux/device/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index 14c9211b82d6..5c04b8e3833b 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -56,7 +56,6 @@ enum probe_type { * @mod_name: Used for built-in modules. * @suppress_bind_attrs: Disables bind/unbind via sysfs. * @probe_type: Type of the probe (synchronous or asynchronous) to use. - * @async_shutdown_enable: Enables devices to be shutdown asynchronously. * @of_match_table: The open firmware table. * @acpi_match_table: The ACPI match table. * @probe: Called to query the existence of a specific device, @@ -103,7 +102,6 @@ struct device_driver { bool suppress_bind_attrs; /* disables bind/unbind via sysfs */ enum probe_type probe_type; - bool async_shutdown_enable; const struct of_device_id *of_match_table; const struct acpi_device_id *acpi_match_table; -- cgit v1.2.3 From ce4db753de21abfb7516ef64aff907813e8a8e3e Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 26 Aug 2024 11:25:52 +0800 Subject: kprobes: Remove obsoleted declaration for init_test_probes The init_test_probes() have been removed since commit e44e81c5b90f ("kprobes: convert tests to kunit"), and now it is useless, so remove it. Link: https://lore.kernel.org/all/20240826032552.4016314-1-cuigaosheng1@huawei.com/ Signed-off-by: Gaosheng Cui Signed-off-by: Masami Hiramatsu (Google) --- include/linux/kprobes.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 5fcbc254d186..8c4f3bb24429 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -269,15 +269,6 @@ extern unsigned long __stop_kprobe_blacklist[]; extern struct kretprobe_blackpoint kretprobe_blacklist[]; -#ifdef CONFIG_KPROBES_SANITY_TEST -extern int init_test_probes(void); -#else /* !CONFIG_KPROBES_SANITY_TEST */ -static inline int init_test_probes(void) -{ - return 0; -} -#endif /* CONFIG_KPROBES_SANITY_TEST */ - extern int arch_prepare_kprobe(struct kprobe *p); extern void arch_arm_kprobe(struct kprobe *p); extern void arch_disarm_kprobe(struct kprobe *p); -- cgit v1.2.3 From d5dbf8b48a4620db771f399ed7fce32d447f04a6 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 18 Aug 2024 19:42:58 +0900 Subject: tracepoint: Support iterating over tracepoints on modules Add for_each_module_tracepoint() for iterating over tracepoints on modules. This is similar to the for_each_kernel_tracepoint() but only for the tracepoints on modules (not including kernel built-in tracepoints). Link: https://lore.kernel.org/all/172397777800.286558.14554748203446214056.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- include/linux/tracepoint.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 6be396bb4297..837fcf8ec0d5 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -64,6 +64,8 @@ struct tp_module { bool trace_module_has_bad_taint(struct module *mod); extern int register_tracepoint_module_notifier(struct notifier_block *nb); extern int unregister_tracepoint_module_notifier(struct notifier_block *nb); +void for_each_module_tracepoint(void (*fct)(struct tracepoint *, void *), + void *priv); #else static inline bool trace_module_has_bad_taint(struct module *mod) { @@ -79,6 +81,11 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb) { return 0; } +static inline +void for_each_module_tracepoint(void (*fct)(struct tracepoint *, void *), + void *priv) +{ +} #endif /* CONFIG_MODULES */ /* -- cgit v1.2.3 From d4df54f338e43c790460674a3cc7db35b8395421 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sun, 18 Aug 2024 19:43:07 +0900 Subject: tracepoint: Support iterating tracepoints in a loading module Add for_each_tracepoint_in_module() function to iterate tracepoints in a module. This API is needed for handling tracepoints in a loading module from tracepoint_module_notifier callback function. This also update for_each_module_tracepoint() to pass the module to callback function so that it can find module easily. Link: https://lore.kernel.org/all/172397778740.286558.15781131277732977643.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- include/linux/tracepoint.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 837fcf8ec0d5..93a9f3070b48 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -64,8 +64,13 @@ struct tp_module { bool trace_module_has_bad_taint(struct module *mod); extern int register_tracepoint_module_notifier(struct notifier_block *nb); extern int unregister_tracepoint_module_notifier(struct notifier_block *nb); -void for_each_module_tracepoint(void (*fct)(struct tracepoint *, void *), +void for_each_module_tracepoint(void (*fct)(struct tracepoint *, + struct module *, void *), void *priv); +void for_each_tracepoint_in_module(struct module *, + void (*fct)(struct tracepoint *, + struct module *, void *), + void *priv); #else static inline bool trace_module_has_bad_taint(struct module *mod) { @@ -82,10 +87,18 @@ int unregister_tracepoint_module_notifier(struct notifier_block *nb) return 0; } static inline -void for_each_module_tracepoint(void (*fct)(struct tracepoint *, void *), +void for_each_module_tracepoint(void (*fct)(struct tracepoint *, + struct module *, void *), void *priv) { } +static inline +void for_each_tracepoint_in_module(struct module *mod, + void (*fct)(struct tracepoint *, + struct module *, void *), + void *priv) +{ +} #endif /* CONFIG_MODULES */ /* -- cgit v1.2.3 From 04e906839a053f092ef53f4fb2d610983412b904 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 19 Sep 2024 14:33:42 +0200 Subject: usbnet: fix cyclical race on disconnect with work queue The work can submit URBs and the URBs can schedule the work. This cycle needs to be broken, when a device is to be stopped. Use a flag to do so. This is a design issue as old as the driver. Signed-off-by: Oliver Neukum Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") CC: stable@vger.kernel.org Link: https://patch.msgid.link/20240919123525.688065-1-oneukum@suse.com Signed-off-by: Paolo Abeni --- include/linux/usb/usbnet.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 9f08a584d707..0b9f1e598e3a 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -76,8 +76,23 @@ struct usbnet { # define EVENT_LINK_CHANGE 11 # define EVENT_SET_RX_MODE 12 # define EVENT_NO_IP_ALIGN 13 +/* This one is special, as it indicates that the device is going away + * there are cyclic dependencies between tasklet, timer and bh + * that must be broken + */ +# define EVENT_UNPLUG 31 }; +static inline bool usbnet_going_away(struct usbnet *ubn) +{ + return test_bit(EVENT_UNPLUG, &ubn->flags); +} + +static inline void usbnet_mark_going_away(struct usbnet *ubn) +{ + set_bit(EVENT_UNPLUG, &ubn->flags); +} + static inline struct usb_driver *driver_of(struct usb_interface *intf) { return to_usb_driver(intf->dev.driver); -- cgit v1.2.3 From 8af79d3edb5fd2dce35ea0a71595b6d4f9962350 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Sep 2024 15:13:39 +0200 Subject: netfilter: nfnetlink_queue: remove old clash resolution logic For historical reasons there are two clash resolution spots in netfilter, one in nfnetlink_queue and one in conntrack core. nfnetlink_queue one was added first: If a colliding entry is found, NAT NAT transformation is reversed by calling nat engine again with altered tuple. See commit 368982cd7d1b ("netfilter: nfnetlink_queue: resolve clash for unconfirmed conntracks") for details. One problem is that nf_reroute() won't take an action if the queueing doesn't occur in the OUTPUT hook, i.e. when queueing in forward or postrouting, packet will be sent via the wrong path. Another problem is that the scenario addressed (2nd UDP packet sent with identical addresses while first packet is still being processed) can also occur without any nfqueue involvement due to threaded resolvers doing A and AAAA requests back-to-back. This lead us to add clash resolution logic to the conntrack core, see commit 6a757c07e51f ("netfilter: conntrack: allow insertion of clashing entries"). Instead of fixing the nfqueue based logic, lets remove it and let conntrack core handle this instead. Retain the ->update hook for sake of nfqueue based conntrack helpers. We could axe this hook completely but we'd have to split confirm and helper logic again, see commit ee04805ff54a ("netfilter: conntrack: make conntrack userspace helpers work again"). This SHOULD NOT be backported to kernels earlier than v5.6; they lack adequate clash resolution handling. Patch was originally written by Pablo Neira Ayuso. Reported-by: Antonio Ojea Closes: https://bugzilla.netfilter.org/show_bug.cgi?id=1766 Signed-off-by: Florian Westphal Tested-by: Antonio Ojea Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2683b2b77612..2b8aac2c70ad 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -376,15 +376,11 @@ int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, struct nf_conn; enum nf_nat_manip_type; struct nlattr; -enum ip_conntrack_dir; struct nf_nat_hook { int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl); - unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, - enum nf_nat_manip_type mtype, - enum ip_conntrack_dir dir); void (*remove_nat_bysrc)(struct nf_conn *ct); }; -- cgit v1.2.3 From a78282e2c94f4ca80a2d7c56e4d1e9546be5596d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 26 Sep 2024 11:39:02 -0700 Subject: Revert "binfmt_elf, coredump: Log the reason of the failed core dumps" This reverts commit fb97d2eb542faf19a8725afbd75cbc2518903210. The logging was questionable to begin with, but it seems to actively deadlock on the task lock. "On second thought, let's not log core dump failures. 'Tis a silly place" because if you can't tell your core dump is truncated, maybe you should just fix your debugger instead of adding bugs to the kernel. Reported-by: Vegard Nossum Link: https://lore.kernel.org/all/d122ece6-3606-49de-ae4d-8da88846bef2@oracle.com/ Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index edeb8532ce0f..45e598fe3476 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -42,7 +42,7 @@ extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); -extern int do_coredump(const kernel_siginfo_t *siginfo); +extern void do_coredump(const kernel_siginfo_t *siginfo); /* * Logging for the coredump code, ratelimited. @@ -62,11 +62,7 @@ extern int do_coredump(const kernel_siginfo_t *siginfo); #define coredump_report_failure(fmt, ...) __COREDUMP_PRINTK(KERN_WARNING, fmt, ##__VA_ARGS__) #else -static inline int do_coredump(const kernel_siginfo_t *siginfo) -{ - /* Coredump support is not available, can't fail. */ - return 0; -} +static inline void do_coredump(const kernel_siginfo_t *siginfo) {} #define coredump_report(...) #define coredump_report_failure(...) -- cgit v1.2.3 From 26a8ea80929c518bdec5e53a5776f95919b7c88e Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 3 Sep 2024 07:25:19 -0700 Subject: mm/hugetlb: fix memfd_pin_folios resv_huge_pages leak memfd_pin_folios followed by unpin_folios leaves resv_huge_pages elevated if the pages were not already faulted in. During a normal page fault, resv_huge_pages is consumed here: hugetlb_fault() alloc_hugetlb_folio() dequeue_hugetlb_folio_vma() dequeue_hugetlb_folio_nodemask() dequeue_hugetlb_folio_node_exact() free_huge_pages-- resv_huge_pages-- During memfd_pin_folios, the page is created by calling alloc_hugetlb_folio_nodemask instead of alloc_hugetlb_folio, and resv_huge_pages is not modified: memfd_alloc_folio() alloc_hugetlb_folio_nodemask() dequeue_hugetlb_folio_nodemask() dequeue_hugetlb_folio_node_exact() free_huge_pages-- alloc_hugetlb_folio_nodemask has other callers that must not modify resv_huge_pages. Therefore, to fix, define an alternate version of alloc_hugetlb_folio_nodemask for this call site that adjusts resv_huge_pages. Link: https://lkml.kernel.org/r/1725373521-451395-4-git-send-email-steven.sistare@oracle.com Fixes: 89c1905d9c14 ("mm/gup: introduce memfd_pin_folios() for pinning memfd folios") Signed-off-by: Steve Sistare Acked-by: Vivek Kasireddy Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 98c47c394b89..e4697539b665 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -692,6 +692,9 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback); +struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, + nodemask_t *nmask, gfp_t gfp_mask); + int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, @@ -1059,6 +1062,13 @@ static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, return NULL; } +static inline struct folio * +alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, + nodemask_t *nmask, gfp_t gfp_mask) +{ + return NULL; +} + static inline struct folio * alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask, -- cgit v1.2.3 From c5b1184decc819756ae549ba54c63b6790c4ddfd Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 24 Sep 2024 14:27:10 +0800 Subject: compiler.h: specify correct attribute for .rodata..c_jump_table Currently, there is an assembler message when generating kernel/bpf/core.o under CONFIG_OBJTOOL with LoongArch compiler toolchain: Warning: setting incorrect section attributes for .rodata..c_jump_table This is because the section ".rodata..c_jump_table" should be readonly, but there is a "W" (writable) part of the flags: $ readelf -S kernel/bpf/core.o | grep -A 1 "rodata..c" [34] .rodata..c_j[...] PROGBITS 0000000000000000 0000d2e0 0000000000000800 0000000000000000 WA 0 0 8 There is no above issue on x86 due to the generated section flag is only "A" (allocatable). In order to silence the warning on LoongArch, specify the attribute like ".rodata..c_jump_table,\"a\",@progbits #" explicitly, then the section attribute of ".rodata..c_jump_table" must be readonly in the kernel/bpf/core.o file. Before: $ objdump -h kernel/bpf/core.o | grep -A 1 "rodata..c" 21 .rodata..c_jump_table 00000800 0000000000000000 0000000000000000 0000d2e0 2**3 CONTENTS, ALLOC, LOAD, RELOC, DATA After: $ objdump -h kernel/bpf/core.o | grep -A 1 "rodata..c" 21 .rodata..c_jump_table 00000800 0000000000000000 0000000000000000 0000d2e0 2**3 CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA By the way, AFAICT, maybe the root cause is related with the different compiler behavior of various archs, so to some extent this change is a workaround for LoongArch, and also there is no effect for x86 which is the only port supported by objtool before LoongArch with this patch. Link: https://lkml.kernel.org/r/20240924062710.1243-1-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: [6.9+] Signed-off-by: Andrew Morton --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ec55bcce4146..4d4e23b6e3e7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -133,7 +133,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define annotate_unreachable() __annotate_unreachable(__COUNTER__) /* Annotate a C jump table to allow objtool to follow the code flow */ -#define __annotate_jump_table __section(".rodata..c_jump_table") +#define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #") #else /* !CONFIG_OBJTOOL */ #define annotate_reachable() -- cgit v1.2.3 From cb787f4ac0c2e439ea8d7e6387b925f74576bdf8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 27 Sep 2024 02:56:11 +0100 Subject: [tree-wide] finally take no_llseek out no_llseek had been defined to NULL two years ago, in commit 868941b14441 ("fs: remove no_llseek") To quote that commit, At -rc1 we'll need do a mechanical removal of no_llseek - git grep -l -w no_llseek | grep -v porting.rst | while read i; do sed -i '/\/d' $i done would do it. Unfortunately, that hadn't been done. Linus, could you do that now, so that we could finally put that thing to rest? All instances are of the form .llseek = no_llseek, so it's obviously safe. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/debugfs.h | 1 - include/linux/fs.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index c9c65b132c0f..0928a6c8ae1e 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -57,7 +57,6 @@ static const struct file_operations __fops = { \ .release = simple_attr_release, \ .read = debugfs_attr_read, \ .write = (__is_signed) ? debugfs_attr_write_signed : debugfs_attr_write, \ - .llseek = no_llseek, \ } #define DEFINE_DEBUGFS_ATTRIBUTE(__fops, __get, __set, __fmt) \ diff --git a/include/linux/fs.h b/include/linux/fs.h index eae5b67e4a15..e3c603d01337 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3234,7 +3234,6 @@ extern ssize_t iter_file_splice_write(struct pipe_inode_info *, extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); -#define no_llseek NULL extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, -- cgit v1.2.3